diff --git a/.clang-format b/.clang-format index 42912faec..ad9f4bb18 100644 --- a/.clang-format +++ b/.clang-format @@ -2,19 +2,35 @@ Language: Cpp ColumnLimit: 110 IndentPPDirectives: BeforeHash -AlwaysBreakTemplateDeclarations : true -PackConstructorInitializers : CurrentLine +AlwaysBreakTemplateDeclarations: Yes +BreakAfterAttributes: Always +PackConstructorInitializers: Never AccessModifierOffset: -1 -IndentCaseLabels : true +IndentCaseLabels: true AllowShortLambdasOnASingleLine: Empty RequiresExpressionIndentation: OuterScope -BinPackArguments : false -BinPackParameters : false -LambdaBodyIndentation : Signature -PenaltyReturnTypeOnItsOwnLine : 1 +LambdaBodyIndentation: Signature +PenaltyReturnTypeOnItsOwnLine: 1 + +# TODO: update for clang-format 23 +BinPackArguments: false +BinPackParameters: OnePerLine + +BreakBeforeConceptDeclarations: Always + +Macros: + - LF_TRY=if + - LF_CATCH_ALL=else + - LF_CATCH(x)=else + - LF_HOF(x)={x;} + - LF_HOF(x,y)={x,y;} + - LF_HOF(x,y,z)={x,y,z;} + - LF_HOF(x,y,z,w)={x,y,z,w;} + - LF_HOF(a,b,c,d,e)={a;} + - LF_HOF(a,b,c,d,e,f)={a,b,c,d,e,f;} SpaceBeforeParens: Custom SpaceBeforeParensOptions: - AfterRequiresInClause: true - AfterRequiresInExpression : true + AfterRequiresInClause: true + AfterRequiresInExpression: true ... 
diff --git a/.clang-tidy b/.clang-tidy index 5d813fd55..d05681165 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -10,148 +10,150 @@ Checks: "*,\ -llvm-header-guard,\ -llvm-include-order,\ -llvmlibc-*,\ - -modernize-use-nodiscard,\ + -readability-identifier-length,\ -misc-non-private-member-variables-in-classes" -WarningsAsErrors: '' +WarningsAsErrors: "" CheckOptions: - key: readability-function-cognitive-complexity.IgnoreMacros - value: 'true' - - key: 'bugprone-argument-comment.StrictMode' - value: 'true' -# Prefer using enum classes with 2 values for parameters instead of bools - - key: 'bugprone-argument-comment.CommentBoolLiterals' - value: 'true' - - key: 'bugprone-misplaced-widening-cast.CheckImplicitCasts' - value: 'true' - - key: 'bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression' - value: 'true' - - key: 'bugprone-suspicious-string-compare.WarnOnLogicalNotComparison' - value: 'true' - - key: 'readability-simplify-boolean-expr.ChainedConditionalReturn' - value: 'true' - - key: 'readability-simplify-boolean-expr.ChainedConditionalAssignment' - value: 'true' - - key: 'readability-uniqueptr-delete-release.PreferResetCall' - value: 'true' - - key: 'cppcoreguidelines-init-variables.MathHeader' - value: '' - - key: 'cppcoreguidelines-narrowing-conversions.PedanticMode' - value: 'true' - - key: 'readability-else-after-return.WarnOnUnfixable' - value: 'true' - - key: 'readability-else-after-return.WarnOnConditionVariables' - value: 'true' - - key: 'readability-inconsistent-declaration-parameter-name.Strict' - value: 'true' - - key: 'readability-qualified-auto.AddConstToQualified' - value: 'true' - - key: 'readability-redundant-access-specifiers.CheckFirstDeclaration' - value: 'true' -# These seem to be the most common identifier styles - - key: 'readability-identifier-naming.AbstractClassCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassConstantCase' - value: 
'lower_case' - - key: 'readability-identifier-naming.ClassMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantPointerParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprFunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.EnumCase' - value: 'lower_case' - - key: 'readability-identifier-naming.EnumConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.FunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalConstantPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalFunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.InlineNamespaceCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalConstantPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.MacroDefinitionCase' - value: 'UPPER_CASE' - - key: 'readability-identifier-naming.MemberCase' - value: 'lower_case' - - key: 
'readability-identifier-naming.MethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.NamespaceCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ParameterPackCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PointerParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PrivateMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PrivateMemberPrefix' - value: 'm_' - - key: 'readability-identifier-naming.PrivateMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ProtectedMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ProtectedMemberPrefix' - value: 'm_' - - key: 'readability-identifier-naming.ProtectedMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PublicMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PublicMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ScopedEnumConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StaticConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StaticVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StructCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.TemplateTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.TypeAliasCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TypedefCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TypeTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.UnionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ValueTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.VariableCase' - value: 
'lower_case' - - key: 'readability-identifier-naming.VirtualMethodCase' - value: 'lower_case' + value: "true" + - key: "cppcoreguidelines-avoid-do-while.IgnoreMacros" + value: "true" + - key: "bugprone-argument-comment.StrictMode" + value: "true" + # Prefer using enum classes with 2 values for parameters instead of bools + - key: "bugprone-argument-comment.CommentBoolLiterals" + value: "true" + - key: "bugprone-misplaced-widening-cast.CheckImplicitCasts" + value: "true" + - key: "bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression" + value: "true" + - key: "bugprone-suspicious-string-compare.WarnOnLogicalNotComparison" + value: "true" + - key: "readability-simplify-boolean-expr.ChainedConditionalReturn" + value: "true" + - key: "readability-simplify-boolean-expr.ChainedConditionalAssignment" + value: "true" + - key: "readability-uniqueptr-delete-release.PreferResetCall" + value: "true" + - key: "cppcoreguidelines-init-variables.MathHeader" + value: "" + - key: "cppcoreguidelines-narrowing-conversions.PedanticMode" + value: "true" + - key: "readability-else-after-return.WarnOnUnfixable" + value: "true" + - key: "readability-else-after-return.WarnOnConditionVariables" + value: "true" + - key: "readability-inconsistent-declaration-parameter-name.Strict" + value: "true" + - key: "readability-qualified-auto.AddConstToQualified" + value: "true" + - key: "readability-redundant-access-specifiers.CheckFirstDeclaration" + value: "true" + # These seem to be the most common identifier styles + - key: "readability-identifier-naming.AbstractClassCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantCase" + value: "lower_case" + - key: 
"readability-identifier-naming.ConstantMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantPointerParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprFunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.EnumCase" + value: "lower_case" + - key: "readability-identifier-naming.EnumConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.FunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalConstantPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalFunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.InlineNamespaceCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalConstantPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.MacroDefinitionCase" + value: "UPPER_CASE" + - key: "readability-identifier-naming.MemberCase" + value: "lower_case" + - key: "readability-identifier-naming.MethodCase" + value: "lower_case" + - key: "readability-identifier-naming.NamespaceCase" + value: "lower_case" + - key: "readability-identifier-naming.ParameterCase" + value: "lower_case" + - key: 
"readability-identifier-naming.ParameterPackCase" + value: "lower_case" + - key: "readability-identifier-naming.PointerParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.PrivateMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.PrivateMemberPrefix" + value: "m_" + - key: "readability-identifier-naming.PrivateMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ProtectedMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ProtectedMemberPrefix" + value: "m_" + - key: "readability-identifier-naming.ProtectedMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.PublicMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.PublicMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ScopedEnumConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.StaticConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.StaticVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.StructCase" + value: "lower_case" + - key: "readability-identifier-naming.TemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.TemplateTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.TypeAliasCase" + value: "lower_case" + - key: "readability-identifier-naming.TypedefCase" + value: "lower_case" + - key: "readability-identifier-naming.TypeTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.UnionCase" + value: "lower_case" + - key: "readability-identifier-naming.ValueTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.VariableCase" + value: "lower_case" + - key: "readability-identifier-naming.VirtualMethodCase" + value: "lower_case" ... 
diff --git a/.clangd b/.clangd index ef86cb6b0..fd3d2a8f4 100644 --- a/.clangd +++ b/.clangd @@ -1,2 +1,2 @@ CompileFlags: - CompilationDatabase: build/dev \ No newline at end of file + CompilationDatabase: build/dev diff --git a/.codespellrc b/.codespellrc index c3920f351..86730201c 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,7 +1,7 @@ [codespell] -builtin = clear,rare,en-GB_to_en-US,names,informal,code +builtin = clear,rare,names,informal,code check-filenames = check-hidden = ignore-words-list = deque,warmup,stdio,copyable,combinate -skip = */.git,*/build,*/prefix,*/vcpkg,*/_build,*/bench +skip = */.git,*/build,*/.legacy quiet-level = 2 diff --git a/.gemini/settings.json b/.gemini/settings.json new file mode 100644 index 000000000..b8dce87f3 --- /dev/null +++ b/.gemini/settings.json @@ -0,0 +1,8 @@ +{ + "context": { + "fileName": "AGENTS.md" + }, + "ui": { + "hideBanner": true + } +} \ No newline at end of file diff --git a/.github/workflows/linear.yml b/.github/workflows/linear.yml new file mode 100644 index 000000000..8d1ba0975 --- /dev/null +++ b/.github/workflows/linear.yml @@ -0,0 +1,33 @@ +name: Linear History + +on: + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + check-linear-history: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ github.event.pull_request.head.sha || github.sha }} + fetch-depth: 0 + + - name: Check for merge commits + run: | + BASE_REF=${{ github.base_ref || 'modules' }} + echo "Comparing against base: $BASE_REF" + git fetch origin $BASE_REF:$BASE_REF + MERGE_COMMITS=$(git rev-list --merges $BASE_REF..HEAD) + if [ -n "$MERGE_COMMITS" ]; then + echo "Error: Merge commits detected. libfork requires a linear history." + echo "Please rebase your branch onto $BASE_REF to remove merge commits." + echo "" + echo "Merge commits found:" + git log --merges --oneline $BASE_REF..HEAD + exit 1 + else + echo "No merge commits detected. Linear history check passed." 
+ fi diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..82cfdc9a7 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,28 @@ +name: Lint + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + lint: + runs-on: macos-latest + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install clang-format codespell + + - name: Run codespell + run: codespell + + - name: Run clang-format + run: | + find src include test benchmark/src -name "*.cpp" -o -name "*.hpp" -o -name "*.cxx" | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml new file mode 100644 index 000000000..4904ae1e5 --- /dev/null +++ b/.github/workflows/linux.yml @@ -0,0 +1,35 @@ +name: Linux + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + preset: [ci-hardened, ci-release, ci-no-except-rtti] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install cmake ninja gcc binutils catch2 google-benchmark + + - name: Configure + run: cmake --preset ${{ matrix.preset }} + -DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake + + - name: Build + run: cmake --build --preset ${{ matrix.preset }} + + - name: Test + run: ctest --preset ${{ matrix.preset }} diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 000000000..17eb53779 --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,35 @@ +name: MacOS + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: macos-latest + 
strategy: + fail-fast: false + matrix: + preset: [ci-hardened, ci-release, ci-no-except-rtti, ci-sanitize] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install cmake ninja llvm catch2 google-benchmark + + - name: Configure + run: cmake --preset ${{ matrix.preset }} + -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake + + - name: Build + run: cmake --build --preset ${{ matrix.preset }} + + - name: Test + run: ctest --preset ${{ matrix.preset }} diff --git a/.gitignore b/.gitignore index 47a6b5d9d..41a66b83c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,5 +26,3 @@ output.png **/.DS_Store compile_commands.json -CMakeLists.txt.user -CMakeUserPresets.json diff --git a/.legacy/include/libfork/core/macro.hpp b/.legacy/include/libfork/core/macro.hpp index 0944c1644..e42fcfc18 100644 --- a/.legacy/include/libfork/core/macro.hpp +++ b/.legacy/include/libfork/core/macro.hpp @@ -61,17 +61,6 @@ #define LF_STATIC_CONST const #endif -// clang-format off - -/** - * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/requires/decltype specifiers. - * - * This macro is not truly variadic but the ``...`` allows commas in the macro argument. - */ -#define LF_HOF_RETURNS(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) requires requires { __VA_ARGS__; } { return __VA_ARGS__;} - -// clang-format on - /** * @brief __[public]__ Detects if the compiler has exceptions enabled. * @@ -192,28 +181,6 @@ using std::unreachable; #define LF_ASSERT(expr) LF_ASSUME(expr) #endif -/** - * @brief Macro to prevent a function to be inlined. 
- */ -#if !defined(LF_NOINLINE) - #if defined(_MSC_VER) && !defined(__clang__) - #define LF_NOINLINE __declspec(noinline) - #elif defined(__GNUC__) && __GNUC__ > 3 - // Clang also defines __GNUC__ (as 4) - #if defined(__CUDACC__) - // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392 - #define LF_NOINLINE __attribute__((noinline)) - #elif defined(__HIP__) - // See https://github.com/boostorg/config/issues/392 - #define LF_NOINLINE __attribute__((noinline)) - #else - #define LF_NOINLINE __attribute__((__noinline__)) - #endif - #else - #define LF_NOINLINE - #endif -#endif - /** * @brief Force no-inline for clang, works-around https://github.com/llvm/llvm-project/issues/63022. * @@ -229,28 +196,6 @@ using std::unreachable; #define LF_CLANG_TLS_NOINLINE #endif -/** - * @brief Macro to use next to 'inline' to force a function to be inlined. - * - * \rst - * - * .. note:: - * - * This does not imply the c++'s `inline` keyword which also has an effect on linkage. - * - * \endrst - */ -#if !defined(LF_FORCEINLINE) - #if defined(_MSC_VER) && !defined(__clang__) - #define LF_FORCEINLINE __forceinline - #elif defined(__GNUC__) && __GNUC__ > 3 - // Clang also defines __GNUC__ (as 4) - #define LF_FORCEINLINE __attribute__((__always_inline__)) - #else - #define LF_FORCEINLINE - #endif -#endif - #if defined(__clang__) && defined(__has_attribute) /** * @brief Compiler specific attribute. diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..cb1e1f47d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,206 @@ +# Libfork Copilot Instructions + +## Project Overview + +**libfork** is a continuation-stealing coroutine-tasking library implementing +strict fork-join parallelism using C++20 coroutines. 
+ +- **Type**: C++ library with module/`import std` support +- **Languages**: C++26 + +## Critical Build Requirements + +### Compiler & Module Support + +This project **requires C++23's `import std`** and **MUST** use the appropriate +toolchain file: + +- **MacOS**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` +- **Linux**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake` + +**Common Error**: Without the toolchain file, CMake will fail. + +**Always include the toolchain file** in configure commands. + +### Dependencies (Homebrew) + +Make sure Homebrew is installed and `brew` is in your `PATH`: + +```bash +brew --version +``` + +**Required for building/testing:** + +- `cmake` +- `ninja` +- `catch2` +- `google-benchmark` +- `clang-format` +- `codespell` + +If on MacOS, also require: + +- `llvm` + +If on Linux, also require: + +- `gcc` +- `binutils` + +Install all at once (MacOS): + +```bash +brew install cmake ninja catch2 google-benchmark clang-format codespell llvm +``` + +Install all at once (Linux): + +```bash +brew install cmake ninja catch2 google-benchmark clang-format codespell gcc binutils +``` + +## Build & Test Workflow + +### 1. Configure + +Always use presets with the toolchain file: + +```bash +cmake --preset -DCMAKE_TOOLCHAIN_FILE=cmake/.cmake +``` + +**Relevant available presets** (from `CMakePresets.json`): + +- `ci-hardened` - Debug build with warnings and hardening flags +- `ci-release` - Optimized release build + +All presets enable developer mode (`libfork_DEV_MODE=ON`) and use Ninja generator. + +You should use the `ci-hardened` preset for development/testing and +`ci-release` for benchmarking. + +### 2. Build + +```bash +cmake --build --preset +``` + +**Build warnings** (expected and safe): + +- "It is recommended to build benchmarks in Release mode" - only relevant for `ci-hardened` +- CMake experimental `import std;` warning - expected for C++23's `import std` + +### 3. 
Test + +```bash +ctest --preset +``` + +All tests should pass. If tests fail, check that: + +- Configuration used the correct toolchain file +- Build completed without errors +- Any changes you have made are correct + +## Project Structure + +### Source Layout + +```sh +libfork/ +├── cmake/ # CMake utilities +├── include/libfork/**/*.hpp # Public headers (macros, version) +├── src/ # C++26 module source files (.cxx) and impl (.cpp) +│ ├── libfork.cxx # libfork — meta-module, re-exports all public modules +│ ├── utils/ # libfork.utils — internal utilities (not public API) +│ │ ├── utils.cxx # aggregator +│ │ └── *.cxx # :partitions +│ ├── core/ # libfork.core — core task/scheduler primitives +│ │ ├── core.cxx # aggregator +│ │ └── *.cxx # :partitions +│ ├── batteries/ # libfork.batteries — stacks, contexts, adaptors +│ │ ├── batteries.cxx # aggregator +│ │ └── *.cxx # :partitions +│ └── schedulers/ # libfork.schedulers — concrete schedulers +│ │ ├── schedulers.cxx # aggregator +│ │ └── *.cxx # :partitions +├── test/src/**/ # Test suite (Catch2) — uses `import libfork;` +│ └── *.cpp +├── benchmark/ # Benchmarking suite (google-benchmark) +│ ├── lib/ # Shared benchmark utilities and definitions +│ │ ├── *.hpp # headers +│ │ └── *.cpp # common source +│ ├── src/ # Implementation-specific benchmarks +│ │ ├── libfork/ # libfork-based benchmarks +│ │ ├── serial/ # serial benchmarks +│ │ └── */ # Other library benchmarks (e.g. OpenMP, TBB, Cilk Plus) +│ └── external/ # External benchmark code (e.g. UTS) +├── .github/workflows/ # CI workflows +│ ├── linux.yml # Linux builds +│ ├── macos.yml # MacOS builds +│ ├── lint.yml # Linting +│ └── linear.yml # Enforces linear history (no merge commits) +└── CMakeLists.txt # Main build configuration +``` + +## Workflows + +### Workflow Command Pattern + +All workflows follow this pattern: + +```yaml +- Install Dependencies: brew install ... 
+- Configure: cmake --preset -DCMAKE_TOOLCHAIN_FILE=.cmake +- Build: cmake --build --preset +- Test: ctest --preset +``` + +## Common Development Tasks + +### Making Code Changes + +1. **Modify source files** in `src/`, `include/`, `test/`, or `benchmark/` +2. **Rebuild**: `cmake --build --preset ` +3. **Test**: `ctest --preset ` + +#### Adding/removing files from `src/` or `include/` + +- Update the root `CMakeLists.txt` with new/removed files. + +#### Adding/removing files from benchmarks + +- Update the relevant `CMakeLists.txt` in `benchmark/lib/` or `benchmark/src//`. + +### Adding Tests + +Strive to add tests for new features/bug fixes. + +- Add `.cpp` files to `test/src/` +- Tests auto-discovered by CMake (GLOB_RECURSE) +- Links against `libfork::libfork` and `Catch2::Catch2WithMain` + +### Modifying Build Configuration + +**Warning**: Module-related changes are complex. Test thoroughly with clean builds. + +## Troubleshooting + +### Build Failures + +**Problem**: Configuration/Build fails after adding/removing files or modifying CMakeLists.txt +**Solution**: Try a clean build directory: + +```bash +rm -rf build/ +``` + +**Problem**: "compiler does not provide a way to discover the import graph" +**Solution**: Add `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` to configure + +**Problem**: "Could not find 'brew' executable" +**Solution**: Install Homebrew + +**Problem**: "Could not automatically find libc++.modules.json" +**Solution**: Ensure LLVM is installed via Homebrew; toolchain auto-detects the path diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..68490f0c1 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,3 @@ +# In ./CLAUDE.md + +@AGENTS.md diff --git a/CMakeLists.txt b/CMakeLists.txt index d193bc879..577173f29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,38 +1,121 @@ -cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) +cmake_minimum_required(VERSION 4.3 FATAL_ERROR) # See `Help/dev/experimental.rst` 
-set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "d0edc3af-4c50-42ea-a356-e2862fe7a444") +set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "451f2fe2-a8a2-47c3-bc32-94786d8fc91b") include(cmake/read_version.cmake) -read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/core/macro.hpp) +read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/version.hpp) project( libfork VERSION ${version_major}.${version_minor}.${version_patch} - DESCRIPTION "A bleeding-edge, lock-free, wait-free, continuation-stealing fork-join library built on C++20's coroutines." LANGUAGES CXX ) +# ---- Project options ---- + +option(libfork_DEV_MODE "Enable developer build (tests/benchmarks/etc) for libfork" OFF) + +# ---- System dependencies ---- + +find_package(Threads REQUIRED) + +# =========================== + # Tell CMake that we explicitly want `import std`. This will initialize the # property on all targets declared after this to 1 # TODO: set property per target set(CMAKE_CXX_MODULE_STD 1) -# Make a library. -add_library(uses_std STATIC) +add_library(libfork_libfork) +add_library(libfork::libfork ALIAS libfork_libfork) + +target_link_libraries(libfork_libfork PUBLIC Threads::Threads) -# Add sources. -target_sources(uses_std PRIVATE uses_std.cxx) +set_property(TARGET libfork_libfork PROPERTY EXPORT_NAME libfork) -# Tell CMake we're using C++23 but only C++20 is needed to consume it. -target_compile_features(uses_std INTERFACE cxx_std_23) +target_compile_features(libfork_libfork PUBLIC cxx_std_26) -# Make an executable. 
-add_executable(main) +# Public headers, __impl must be public because consumers need +# them to build the module BMI +target_sources(libfork_libfork + PUBLIC + FILE_SET HEADERS FILES + include/libfork/version.hpp + include/libfork/__impl/compiler.hpp + include/libfork/__impl/exception.hpp + include/libfork/__impl/utils.hpp + include/libfork/__impl/assume.hpp + BASE_DIRS + include +) -target_sources(main PRIVATE main.cxx) -target_link_libraries(main PRIVATE uses_std) +# Add the module files to the library, must be public because +# consumers will need to build the BMI +target_sources(libfork_libfork + PUBLIC + FILE_SET CXX_MODULES FILES + # libfork (meta) + src/libfork.cxx + # libfork.utils + src/utils/utils.cxx + src/utils/utility.cxx + src/utils/constants.cxx + src/utils/tuple.cxx + src/utils/concepts.cxx + src/utils/defer.cxx + src/utils/uninitialized.cxx + # libfork.core + src/core/core.cxx + src/core/exception.cxx + src/core/concepts/stack.cxx + src/core/concepts/context.cxx + src/core/concepts/scheduler.cxx + src/core/concepts/invocable.cxx + src/core/frame.cxx + src/core/task.cxx + src/core/ops.cxx + src/core/poly_context.cxx + src/core/thread_locals.cxx + src/core/schedule.cxx + src/core/handles.cxx + src/core/root.cxx + src/core/execute.cxx + src/core/receiver.cxx + src/core/final_suspend.cxx + src/core/awaitables.cxx + src/core/promise.cxx + src/core/stop.cxx + # libfork.batteries + src/batteries/batteries.cxx + src/batteries/deque.cxx + src/batteries/adaptors.cxx + src/batteries/contexts.cxx + src/batteries/geometric_stack.cxx + src/batteries/adaptor_stack.cxx + src/batteries/slab_stack.cxx + # libfork.schedulers + src/schedulers/schedulers.cxx + src/schedulers/inline.cxx + src/schedulers/busy.cxx + PRIVATE + src/exception.cpp +) + +# ====================== + +if(libfork_DEV_MODE) + + include(CTest) # Enables the BUILD_TESTING option + + if(BUILD_TESTING) + add_subdirectory(test) + endif() + + add_subdirectory(benchmark) + +endif() # list(APPEND 
CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # @@ -43,9 +126,6 @@ target_link_libraries(main PRIVATE uses_std) # # message(STATUS "CMAKE_BUILD_TYPE is set to '${CMAKE_BUILD_TYPE}'") # -# # ---- System dependencies ---- -# -# find_package(Threads REQUIRED) # # # ------ Declare library ------ # diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 000000000..ac23b1e37 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,126 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "cmake-pedantic", + "hidden": true, + "warnings": { + "dev": true, + "deprecated": true, + "uninitialized": true, + "unusedCli": true, + "systemVars": false + }, + "errors": { + "deprecated": true + } + }, + { + "name": "ci-base", + "inherits": "cmake-pedantic", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "libfork_DEV_MODE": "ON" + } + }, + { + "name": "ci-hardened", + "inherits": "ci-base", + "displayName": "Debug with warnings and hardening", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_CXX_FLAGS": "-O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -Wcast-qual -Wformat -Wformat=2 -Wundef -Werror=float-equal -Wshadow -Wcast-align -Wunused -Wnull-dereference -Wdouble-promotion -Wimplicit-fallthrough -Wextra-semi -Woverloaded-virtual -Wnon-virtual-dtor -Wold-style-cast -Werror=format-security -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS -fstrict-flex-arrays=3 -fstack-protector-strong -Wno-missing-braces -Wno-missing-field-initializers -Wno-c2y-extensions" + } + }, + { + "name": "ci-release", + "inherits": "ci-base", + "displayName": "Release", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -falign-functions=64" + } + }, + { + "name": "ci-no-except-rtti", + "inherits": "ci-base", + "displayName": "Release no RTTI or exceptions", + 
"cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -fno-exceptions -fno-rtti -falign-functions=64" + } + }, + { + "name": "ci-sanitize", + "inherits": "ci-base", + "displayName": "Debug with sanitizers", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Sanitize", + "CMAKE_CXX_FLAGS": "-O2 -g -fsanitize=address,undefined -fno-omit-frame-pointer -fno-common -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS" + } + } + ], + "buildPresets": [ + { + "name": "ci-hardened", + "configurePreset": "ci-hardened" + }, + { + "name": "ci-release", + "configurePreset": "ci-release" + }, + { + "name": "ci-no-except-rtti", + "configurePreset": "ci-no-except-rtti" + }, + { + "name": "ci-sanitize", + "configurePreset": "ci-sanitize" + } + ], + "testPresets": [ + { + "name": "ci-hardened", + "configurePreset": "ci-hardened", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-release", + "configurePreset": "ci-release", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-no-except-rtti", + "configurePreset": "ci-no-except-rtti", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-sanitize", + "configurePreset": "ci-sanitize", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + } + ] +} diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json new file mode 100644 index 000000000..26e650168 --- /dev/null +++ b/CMakeUserPresets.json @@ -0,0 +1,79 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "dev", + "inherits": "ci-hardened", + "displayName": "Hardened development build", + "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake", + "cacheVariables": { + "CMAKE_COLOR_DIAGNOSTICS": "ON" + } + }, + { + "name": "bench", + "inherits": "ci-release", + "displayName": "Release build 
for benchmarks", + "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake", + "cacheVariables": { + "CMAKE_COLOR_DIAGNOSTICS": "ON" + } + } + ], + "buildPresets": [ + { + "name": "dev", + "configurePreset": "dev" + }, + { + "name": "bench", + "configurePreset": "bench" + } + ], + "testPresets": [ + { + "name": "dev", + "configurePreset": "dev", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + } + ], + "workflowPresets": [ + { + "name": "dev", + "displayName": "Development Debug Hardened Workflow", + "steps": [ + { + "type": "configure", + "name": "dev" + }, + { + "type": "build", + "name": "dev" + }, + { + "type": "test", + "name": "dev" + } + ] + }, + { + "name": "bench", + "displayName": "Release Build (including Benchmarks)", + "steps": [ + { + "type": "configure", + "name": "bench" + }, + { + "type": "build", + "name": "bench" + } + ] + } + ] +} diff --git a/.legacy/LICENSE.md b/LICENSE.md similarity index 100% rename from .legacy/LICENSE.md rename to LICENSE.md diff --git a/actions/setup/action.yaml b/actions/setup/action.yaml deleted file mode 100644 index cd451a14b..000000000 --- a/actions/setup/action.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: 'setup' -description: 'setup vcpkg/cmake/ninja and caching' - -runs: - using: "composite" - - steps: - # Set env vars needed for vcpkg to leverage the GitHub Action cache as a storage for Binary Caching. - - uses: actions/github-script@v6 - with: - script: | - core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); - core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - - - uses: actions/checkout@v3 - with: - submodules: true - - name: "Create directory '${{ env.VCPKG_DEFAULT_BINARY_CACHE }}'" - run: mkdir -p $VCPKG_DEFAULT_BINARY_CACHE - shell: bash - - # Setup the build machine with the most recent versions of CMake and Ninja. 
- # Both are cached if not already: on subsequent runs both will be quickly restored from GitHub cache service. - - uses: lukka/get-cmake@latest - - # Restore vcpkg from the GitHub Action cache service. - # Note that packages are restored by vcpkg's binary caching when it is being run afterwards by CMake. - - name: Restore vcpkg - uses: actions/cache@v3 - with: - # The first path is the location of vcpkg: it contains the vcpkg executable and data files, as long as the - # built package archives (aka binary cache) which are located by VCPKG_DEFAULT_BINARY_CACHE env var. - # The other paths starting with '!' are exclusions: they contain temporary files generated - # during the build of the installed packages. - path: | - ${{ env.VCPKG_ROOT_DIR }} - !${{ env.VCPKG_ROOT_DIR }}/buildtrees - !${{ env.VCPKG_ROOT_DIR }}/packages - !${{ env.VCPKG_ROOT_DIR }}/downloads - !${{ env.VCPKG_ROOT_DIR }}/installed - # The key is composed in a way that it gets properly invalidated whenever a different version of vcpkg is being used. - key: | - ${{ hashFiles( '.git/modules/vcpkg/HEAD' )}} - - # On Windows runners, let's ensure to have the Developer Command Prompt environment setup correctly. - # As used here the Developer Command Prompt created is targeting x64 and using the default the Windows SDK. 
- - uses: ilammy/msvc-dev-cmd@v1 - - - name: Setup xcode - if: matrix.os == 'macos-13' - shell: bash - run: sudo xcode-select --switch /Applications/Xcode_15.0.app/Contents/Developer \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 000000000..5ce1a7480 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) + +project(libfork_benchmark LANGUAGES CXX) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") + message(WARNING "It is recommended to build benchmarks in Release mode for accurate results.") +endif() + +# ---- Dependencies ---- + +find_package(benchmark REQUIRED) + +# ---- Benchmarks ---- + +add_subdirectory(lib) + +add_subdirectory(src/serial) +add_subdirectory(src/baremetal) +add_subdirectory(src/libfork) + +# WHOLE_ARCHIVE ensures benchmark registrations (global initialisers) are not +# dropped by the linker when pulling objects from the static libraries above. 
+add_executable(libfork_benchmark src/benchmarks.cpp) + +target_link_libraries(libfork_benchmark + PRIVATE + $ + benchmark::benchmark_main +) + +if(BUILD_TESTING) + add_test(NAME Benchmark + COMMAND libfork_benchmark --benchmark_dry_run --benchmark_filter=^test/ + ) +endif() + +# ---- OpenMP Benchmarks ---- + +find_package(OpenMP REQUIRED) + +if(OpenMP_CXX_FOUND) + + add_subdirectory(src/openmp) + + target_link_libraries(libfork_benchmark + PRIVATE + $ + ) +endif() + + diff --git a/benchmark/external/uts/CMakeLists.txt b/benchmark/external/uts/CMakeLists.txt new file mode 100644 index 000000000..89c8b5e6e --- /dev/null +++ b/benchmark/external/uts/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) + +project(uts_external LANGUAGES C) + +add_library(uts_c) + +target_sources(uts_c + PRIVATE + src/uts.c + src/rng/brg_sha1.c + PUBLIC + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include + FILES + include/uts/uts.h + include/uts/rng/rng.h + include/uts/rng/brg_sha1.h + include/uts/rng/brg_types.h +) diff --git a/benchmark/external/uts/include/uts/rng/brg_sha1.h b/benchmark/external/uts/include/uts/rng/brg_sha1.h new file mode 100644 index 000000000..d30f12c0d --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/brg_sha1.h @@ -0,0 +1,100 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. 
the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 01/08/2005 +*/ + +#ifndef _SHA1_H +#define _SHA1_H + +#include "uts/rng/brg_types.h" + +#define SHA1_BLOCK_SIZE 64 +#define SHA1_DIGEST_SIZE 20 + +#if defined(__cplusplus) +extern "C" { +#endif + +/** BEGIN: UTS RNG Harness **/ + +#define POS_MASK 0x7fffffff +#define HIGH_BITS 0x80000000 + +#define sha1_context sha1_ctx_s + +/**********************************/ +/* random number generator state */ +/**********************************/ +struct state_t { + uint_8t state[20]; +}; + +typedef uint_8t RNG_state; + +/***************************************/ +/* random number generator operations */ +/***************************************/ +void rng_init(RNG_state *state, int seed); +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnNumber); +int rng_rand(RNG_state *mystate); +int rng_nextrand(RNG_state *mystate); +char *rng_showstate(RNG_state *state, char *s); +int rng_showtype(char *strBuf, int ind); + +/** END: UTS RNG Harness **/ +/* type to hold the SHA256 context */ + +struct sha1_ctx_s { + uint_32t count[2]; + uint_32t hash[5]; + uint_32t wbuf[16]; +}; + +typedef struct sha1_ctx_s sha1_ctx; + +/* Note that these prototypes are the same for both bit and */ +/* byte oriented implementations. 
However the length fields */ +/* are in bytes or bits as appropriate for the version used */ +/* and bit sequences are input as arrays of bytes in which */ +/* bit sequences run from the most to the least significant */ +/* end of each byte */ + +VOID_RETURN sha1_compile(sha1_ctx ctx[1]); + +VOID_RETURN sha1_begin(sha1_ctx ctx[1]); +VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]); +VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]); +VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len); + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/rng/brg_types.h b/benchmark/external/uts/include/uts/rng/brg_types.h new file mode 100644 index 000000000..9532acce6 --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/brg_types.h @@ -0,0 +1,214 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. 
+ + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 09/09/2006 + + The unsigned integer types defined here are of the form uint_t where + is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form unit_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef BRG_TYPES_H +#define BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include + +/* Try one of these if things don't work automatically */ +#ifdef BRG_C99_TYPES + #include + #include + #define BRG_UI8 +typedef uint8_t uint_8t; + #define BRG_UI16 +typedef uint16_t uint_16t; + #define BRG_UI32 + #define li_32(h) 0x##h##u +typedef uint32_t uint_32t; + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef uint64_t uint_64t; + +#elif defined(BRG_STD_TYPES) + #include + #define BRG_UI8 +typedef u_int8_t uint_8t; + #define BRG_UI16 +typedef u_int16_t uint_16t; + #define BRG_UI32 + #define li_32(h) 0x##h##u +typedef u_int32_t uint_32t; + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef u_int64_t uint_64t; + +#endif + +#ifndef BRG_UI8 + #define BRG_UI8 + #if UCHAR_MAX == 255u +typedef unsigned char uint_8t; + #else + #error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h + #endif +#endif + +#ifndef BRG_UI16 + #define BRG_UI16 + #if USHRT_MAX == 65535u +typedef unsigned short uint_16t; + #else + #error Please define uint_16t as a 16-bit unsigned short 
type in brg_types.h + #endif +#endif + +#ifndef BRG_UI32 + #define BRG_UI32 + #if UINT_MAX == 4294967295u + #define li_32(h) 0x##h##u +typedef unsigned int uint_32t; + #elif ULONG_MAX == 4294967295u + #define li_32(h) 0x##h##ul +typedef unsigned long uint_32t; + #elif defined(_CRAY) + #error This code needs 32-bit data types, which Cray machines do not provide + #else + #error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h + #endif +#endif + +#ifndef BRG_UI64 + #if defined(__BORLANDC__) && !defined(__MSDOS__) + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned __int64 uint_64t; + #elif defined(_MSC_VER) && (_MSC_VER < 1300) /* 1300 == VC++ 7.0 */ + #define BRG_UI64 + #define li_64(h) 0x##h##ui64 +typedef unsigned __int64 uint_64t; + #elif defined(__sun) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #elif defined(UINT_MAX) && UINT_MAX > 4294967295u + #if UINT_MAX == 18446744073709551615u + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef unsigned int uint_64t; + #endif + #elif defined(ULONG_MAX) && ULONG_MAX > 4294967295u + #if ULONG_MAX == 18446744073709551615ul + #define BRG_UI64 + #define li_64(h) 0x##h##ul +typedef unsigned long uint_64t; + #endif + #elif defined(ULLONG_MAX) && ULLONG_MAX > 4294967295u + #if ULLONG_MAX == 18446744073709551615ull + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #endif + #elif defined(ULONG_LONG_MAX) && ULONG_LONG_MAX > 4294967295u + #if ULONG_LONG_MAX == 18446744073709551615ull + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #endif + #endif +#endif + +#if defined(NEED_UINT_64T) && !defined(BRG_UI64) + #error Please define uint_64t as an unsigned 64 bit type in brg_types.h +#endif + +#ifndef RETURN_VALUES + #define RETURN_VALUES + #if defined(DLL_EXPORT) + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + 
#define VOID_RETURN __declspec(dllexport) void __stdcall + #define INT_RETURN __declspec(dllexport) int __stdcall + #elif defined(__GNUC__) + #define VOID_RETURN __declspec(__dllexport__) void + #define INT_RETURN __declspec(__dllexport__) int + #else + #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers + #endif + #elif defined(DLL_IMPORT) + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define VOID_RETURN __declspec(dllimport) void __stdcall + #define INT_RETURN __declspec(dllimport) int __stdcall + #elif defined(__GNUC__) + #define VOID_RETURN __declspec(__dllimport__) void + #define INT_RETURN __declspec(__dllimport__) int + #else + #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers + #endif + #elif defined(__WATCOMC__) + #define VOID_RETURN void __cdecl + #define INT_RETURN int __cdecl + #else + #define VOID_RETURN void + #define INT_RETURN int + #endif +#endif + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. 
In all these + defines 'size' must be a power of 2 and >= 8 + + dec_unit_type(size,x) declares a variable 'x' of length + 'size' bits + + dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + ptr_cast(x,size) casts a pointer to a pointer to a + variable of length 'size' bits +*/ + +#define ui_type(size) uint_##size##t +#define dec_unit_type(size, x) typedef ui_type(size) x +#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x, size) ((ui_type(size) *)(x)) + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/rng/rng.h b/benchmark/external/uts/include/uts/rng/rng.h new file mode 100644 index 000000000..105c40466 --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/rng.h @@ -0,0 +1,6 @@ +#ifndef _RNG_H +#define _RNG_H + +#include "uts/rng/brg_sha1.h" + +#endif /* _RNG_H */ \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/uts.h b/benchmark/external/uts/include/uts/uts.h new file mode 100644 index 000000000..e86e68f3e --- /dev/null +++ b/benchmark/external/uts/include/uts/uts.h @@ -0,0 +1,120 @@ +#ifndef A0179FFF_4078_4EEB_BB6E_1E8C75CC694C +#define A0179FFF_4078_4EEB_BB6E_1E8C75CC694C +/* + * ---- The Unbalanced Tree Search (UTS) Benchmark ---- + * + * Copyright (c) 2010 See AUTHORS file for copyright holders + * + * This file is part of the unbalanced tree search benchmark. This + * project is licensed under the MIT Open Source license. See the LICENSE + * file for copyright and licensing information. + * + * UTS is a collaborative project between researchers at the University of + * Maryland, the University of North Carolina at Chapel Hill, and the Ohio + * State University. See AUTHORS file for more information. 
+ * + */ + +#ifndef _UTS_H + #define _UTS_H + + #ifdef __cplusplus +extern "C" { + #endif + + #include "uts/rng/rng.h" + + #define UTS_VERSION "2.1" + + /*********************************************************** + * Tree node descriptor and statistics * + ***********************************************************/ + + #define MAXNUMCHILDREN 100 // cap on children (BIN root is exempt) + +struct node_t { + int type; // distribution governing number of children + int height; // depth of this node in the tree + int numChildren; // number of children, -1 => not yet determined + + /* for RNG state associated with this node */ + struct state_t state; +}; + +typedef struct node_t Node; + +/* Tree type + * Trees are generated using a Galton-Watson process, in + * which the branching factor of each node is a random + * variable. + * + * The random variable can follow a binomial distribution + * or a geometric distribution. Hybrid tree are + * generated with geometric distributions near the + * root and binomial distributions towards the leaves. + */ +enum uts_trees_e { BIN = 0, GEO, HYBRID, BALANCED }; +enum uts_geoshape_e { LINEAR = 0, EXPDEC, CYCLIC, FIXED }; + +typedef enum uts_trees_e tree_t; +typedef enum uts_geoshape_e geoshape_t; + +/* Strings for the above enums */ +extern char *uts_trees_str[]; +extern char *uts_geoshapes_str[]; + +/* Tree parameters */ +extern tree_t type; +extern double b_0; +extern int rootId; +extern int nonLeafBF; +extern double nonLeafProb; +extern int gen_mx; +extern geoshape_t shape_fn; +extern double shiftDepth; + +/* Benchmark parameters */ +extern int computeGranularity; +extern int debug; +extern int verbose; + +/* For stats generation: */ +typedef unsigned long long counter_t; + + /* Utility Functions */ + #define max(a, b) (((a) > (b)) ? (a) : (b)) + #define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +void uts_error(char *str); +void uts_parseParams(int argc, char **argv); +int uts_paramsToStr(char *strBuf, int ind); +void uts_printParams(); +void uts_helpMessage(); + +void uts_showStats( + int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth); +double uts_wctime(); + +double rng_toProb(int n); + +/* Common tree routines */ +void uts_initRoot(Node *root, int type); +int uts_numChildren(Node *parent); +int uts_numChildren_bin(Node *parent); +int uts_numChildren_geo(Node *parent); +int uts_childType(Node *parent); + +/* Implementation Specific Functions */ +char *impl_getName(); +int impl_paramsToStr(char *strBuf, int ind); +int impl_parseParam(char *param, char *value); +void impl_helpMessage(); +void impl_abort(int err); + + #ifdef __cplusplus +} + #endif + +#endif /* _UTS_H */ + +#endif /* A0179FFF_4078_4EEB_BB6E_1E8C75CC694C */ diff --git a/benchmark/external/uts/src/rng/brg_endian.h b/benchmark/external/uts/src/rng/brg_endian.h new file mode 100644 index 000000000..96082e57b --- /dev/null +++ b/benchmark/external/uts/src/rng/brg_endian.h @@ -0,0 +1,132 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 20/10/2006 +*/ + +#ifndef BRG_ENDIAN_H +#define BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + #include +#elif defined(BSD) && (BSD >= 199103) || defined(__APPLE__) || defined(__CYGWIN32__) || \ + defined(__DJGPP__) || defined(__osf__) + #include +#elif defined(__linux__) || defined(__GNUC__) || defined(__GNU_LIBRARY__) + #if !defined(__MINGW32__) && !defined(__sun__) + #include + #if !defined(__BEOS__) + #include + #endif + #endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN) + #if defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN) + #if defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN + #define PLATFORM_BYTE_ORDER 
IS_BIG_ENDIAN + #elif defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(_BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN) + #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(__BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN__) && defined(__LITTLE_ENDIAN__) + #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __BIG_ENDIAN__ + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(__BIG_ENDIAN__) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN__) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + + #if defined(__alpha__) || defined(__alpha) || defined(i386) || defined(__i386__) || defined(_M_I86) || \ + defined(_M_IX86) || defined(__OS2__) || defined(sun386) || defined(__TURBOC__) || defined(vax) || \ + defined(vms) || defined(VMS) || defined(__VMS) || defined(_M_X64) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + + #elif defined(AMIGA) || defined(applec) || defined(__AS400__) || defined(_CRAY) || defined(__hppa) || \ + defined(__hp9000) || defined(ibm370) || defined(mc68000) || defined(m68k) || defined(__MRC__) || \ + defined(__MVS__) || defined(__MWERKS__) || defined(sparc) || defined(__sparc) || \ + defined(SYMANTEC_C) || 
defined(__VOS__) || defined(__TIGCC__) || defined(__TANDEM) || \ + defined(THINK_C) || defined(__VMCMS__) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + + #elif 0 /* **** EDIT HERE IF NECESSARY **** */ + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #elif 0 /* **** EDIT HERE IF NECESSARY **** */ + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #else + #error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order + #endif + +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/src/rng/brg_sha1.c b/benchmark/external/uts/src/rng/brg_sha1.c new file mode 100644 index 000000000..f6757bafc --- /dev/null +++ b/benchmark/external/uts/src/rng/brg_sha1.c @@ -0,0 +1,340 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 01/08/2005 + + This is a byte oriented version of SHA1 that operates on arrays of bytes + stored in memory. +*/ + +#include +#include /* for memcpy() etc. */ + +#include "brg_endian.h" +#include "uts/rng/brg_sha1.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** BEGIN: UTS RNG Harness **/ + +void rng_init(RNG_state *newstate, int seed) { + struct sha1_context ctx; + struct state_t gen; + int i; + + for (i = 0; i < 16; i++) + gen.state[i] = 0; + gen.state[16] = 0xFF & (seed >> 24); + gen.state[17] = 0xFF & (seed >> 16); + gen.state[18] = 0xFF & (seed >> 8); + gen.state[19] = 0xFF & (seed >> 0); + + sha1_begin(&ctx); + sha1_hash(gen.state, 20, &ctx); + sha1_end(newstate, &ctx); +} + +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber) { + struct sha1_context ctx; + uint_8t bytes[4]; + + bytes[0] = 0xFF & (spawnnumber >> 24); + bytes[1] = 0xFF & (spawnnumber >> 16); + bytes[2] = 0xFF & (spawnnumber >> 8); + bytes[3] = 0xFF & spawnnumber; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_hash(bytes, 4, &ctx); + sha1_end(newstate, &ctx); +} + +int rng_rand(RNG_state *mystate) { + int r; + uint_32t b = (mystate[16] << 24) | (mystate[17] << 16) | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int)b; + // printf("b: %d\t, r: %d\n", b, r); + return r; +} + +int rng_nextrand(RNG_state *mystate) { + struct sha1_context ctx; + int r; + uint_32t b; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_end(mystate, &ctx); + b = (mystate[16] << 24) | (mystate[17] << 16) | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int)b; + return r; +} + +/* condense state into string to display during debugging */ +char *rng_showstate(RNG_state *state, char *s) { + sprintf(s, "%.2X%.2X...", state[0], state[1]); + return s; +} + +/* describe random number generator type into string */ +int rng_showtype(char 
*strBuf, int ind) { + ind += sprintf(strBuf + ind, "SHA-1 (state size = %uB)", (unsigned)sizeof(struct state_t)); + return ind; +} + +/** END: UTS RNG Harness **/ + +#if defined(_MSC_VER) && (_MSC_VER > 800) + #pragma intrinsic(memcpy) +#endif + +#if 0 && defined(_MSC_VER) + #define rotl32 _lrotl + #define rotr32 _lrotr +#else + #define rotl32(x, n) (((x) << n) | ((x) >> (32 - n))) + #define rotr32(x, n) (((x) >> n) | ((x) << (32 - n))) +#endif + +#if !defined(bswap_32) + #define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00)) +#endif + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + #define SWAP_BYTES +#else + #undef SWAP_BYTES +#endif + +#if defined(SWAP_BYTES) + #define bsw_32(p, n) \ + { \ + int _i = (n); \ + while (_i--) \ + ((uint_32t *)p)[_i] = bswap_32(((uint_32t *)p)[_i]); \ + } +#else + #define bsw_32(p, n) +#endif + +#define SHA1_MASK (SHA1_BLOCK_SIZE - 1) + +#if 0 + + #define ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) + #define parity(x, y, z) ((x) ^ (y) ^ (z)) + #define maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +#else /* Discovered by Rich Schroeppel and Colin Plumb */ + + #define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) + #define parity(x, y, z) ((x) ^ (y) ^ (z)) + #define maj(x, y, z) (((x) & (y)) | ((z) & ((x) ^ (y)))) + +#endif + +/* Compile 64 bytes of hash data into SHA1 context. 
Note */ +/* that this routine assumes that the byte order in the */ +/* ctx->wbuf[] at this point is in such an order that low */ +/* address bytes in the ORIGINAL byte stream will go in */ +/* this buffer to the high end of 32-bit words on BOTH big */ +/* and little endian systems */ + +#ifdef ARRAY + #define q(v, n) v[n] +#else + #define q(v, n) v##n +#endif + +#define one_cycle(v, a, b, c, d, e, f, k, h) \ + q(v, e) += rotr32(q(v, a), 27) + f(q(v, b), q(v, c), q(v, d)) + k + h; \ + q(v, b) = rotr32(q(v, b), 2) + +#define five_cycle(v, f, k, i) \ + one_cycle(v, 0, 1, 2, 3, 4, f, k, hf(i)); \ + one_cycle(v, 4, 0, 1, 2, 3, f, k, hf(i + 1)); \ + one_cycle(v, 3, 4, 0, 1, 2, f, k, hf(i + 2)); \ + one_cycle(v, 2, 3, 4, 0, 1, f, k, hf(i + 3)); \ + one_cycle(v, 1, 2, 3, 4, 0, f, k, hf(i + 4)) + +VOID_RETURN sha1_compile(sha1_ctx ctx[1]) { + uint_32t *w = ctx->wbuf; + +#ifdef ARRAY + uint_32t v[5]; + memcpy(v, ctx->hash, 5 * sizeof(uint_32t)); +#else + uint_32t v0, v1, v2, v3, v4; + v0 = ctx->hash[0]; + v1 = ctx->hash[1]; + v2 = ctx->hash[2]; + v3 = ctx->hash[3]; + v4 = ctx->hash[4]; +#endif + +#define hf(i) w[i] + + five_cycle(v, ch, 0x5a827999, 0); + five_cycle(v, ch, 0x5a827999, 5); + five_cycle(v, ch, 0x5a827999, 10); + one_cycle(v, 0, 1, 2, 3, 4, ch, 0x5a827999, hf(15)); + +#undef hf +#define hf(i) (w[(i)&15] = rotl32(w[((i) + 13) & 15] ^ w[((i) + 8) & 15] ^ w[((i) + 2) & 15] ^ w[(i)&15], 1)) + + one_cycle(v, 4, 0, 1, 2, 3, ch, 0x5a827999, hf(16)); + one_cycle(v, 3, 4, 0, 1, 2, ch, 0x5a827999, hf(17)); + one_cycle(v, 2, 3, 4, 0, 1, ch, 0x5a827999, hf(18)); + one_cycle(v, 1, 2, 3, 4, 0, ch, 0x5a827999, hf(19)); + + five_cycle(v, parity, 0x6ed9eba1, 20); + five_cycle(v, parity, 0x6ed9eba1, 25); + five_cycle(v, parity, 0x6ed9eba1, 30); + five_cycle(v, parity, 0x6ed9eba1, 35); + + five_cycle(v, maj, 0x8f1bbcdc, 40); + five_cycle(v, maj, 0x8f1bbcdc, 45); + five_cycle(v, maj, 0x8f1bbcdc, 50); + five_cycle(v, maj, 0x8f1bbcdc, 55); + + five_cycle(v, parity, 0xca62c1d6, 60); 
+ five_cycle(v, parity, 0xca62c1d6, 65); + five_cycle(v, parity, 0xca62c1d6, 70); + five_cycle(v, parity, 0xca62c1d6, 75); + +#ifdef ARRAY + ctx->hash[0] += v[0]; + ctx->hash[1] += v[1]; + ctx->hash[2] += v[2]; + ctx->hash[3] += v[3]; + ctx->hash[4] += v[4]; +#else + ctx->hash[0] += v0; + ctx->hash[1] += v1; + ctx->hash[2] += v2; + ctx->hash[3] += v3; + ctx->hash[4] += v4; +#endif +} + +VOID_RETURN sha1_begin(sha1_ctx ctx[1]) { + ctx->count[0] = ctx->count[1] = 0; + ctx->hash[0] = 0x67452301; + ctx->hash[1] = 0xefcdab89; + ctx->hash[2] = 0x98badcfe; + ctx->hash[3] = 0x10325476; + ctx->hash[4] = 0xc3d2e1f0; +} + +/* SHA1 hash data in an array of bytes into hash buffer and */ +/* call the hash_compile function as required. */ + +VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]) { + uint_32t pos = (uint_32t)(ctx->count[0] & SHA1_MASK), space = SHA1_BLOCK_SIZE - pos; + const unsigned char *sp = data; + + if ((ctx->count[0] += len) < len) + ++(ctx->count[1]); + + while (len >= space) /* transfer whole blocks if possible */ + { + memcpy(((unsigned char *)ctx->wbuf) + pos, sp, space); + sp += space; + len -= space; + space = SHA1_BLOCK_SIZE; + pos = 0; + bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2); + sha1_compile(ctx); + } + + memcpy(((unsigned char *)ctx->wbuf) + pos, sp, len); +} + +/* SHA1 final padding and digest calculation */ + +VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]) { + uint_32t i = (uint_32t)(ctx->count[0] & SHA1_MASK); + + /* put bytes in the buffer in an order in which references to */ + /* 32-bit words will put bytes with lower addresses into the */ + /* top of 32 bit words on BOTH big and little endian machines */ + bsw_32(ctx->wbuf, (i + 3) >> 2); + + /* we now need to mask valid bytes and add the padding which is */ + /* a single 1 bit and as many zero bits as necessary. 
Note that */ + /* we can always add the first padding byte here because the */ + /* buffer always has at least one empty slot */ + ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3); + ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3); + + /* we need 9 or more empty positions, one for the padding byte */ + /* (above) and eight for the length count. If there is not */ + /* enough space, pad and empty the buffer */ + if (i > SHA1_BLOCK_SIZE - 9) { + if (i < 60) + ctx->wbuf[15] = 0; + sha1_compile(ctx); + i = 0; + } else /* compute a word index for the empty buffer positions */ + i = (i >> 2) + 1; + + while (i < 14) /* and zero pad all but last two positions */ + ctx->wbuf[i++] = 0; + + /* the following 32-bit length fields are assembled in the */ + /* wrong byte order on little endian machines but this is */ + /* corrected later since they are only ever used as 32-bit */ + /* word values. */ + ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29); + ctx->wbuf[15] = ctx->count[0] << 3; + sha1_compile(ctx); + + /* extract the hash value as bytes in case the hash buffer is */ + /* misaligned for 32-bit words */ + for (i = 0; i < SHA1_DIGEST_SIZE; ++i) + hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3))); +} + +VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len) { + sha1_ctx cx[1]; + + sha1_begin(cx); + sha1_hash(data, len, cx); + sha1_end(hval, cx); +} + +#if defined(__cplusplus) +} +#endif \ No newline at end of file diff --git a/benchmark/external/uts/src/uts.c b/benchmark/external/uts/src/uts.c new file mode 100644 index 000000000..507915bea --- /dev/null +++ b/benchmark/external/uts/src/uts.c @@ -0,0 +1,474 @@ +/* + * ---- The Unbalanced Tree Search (UTS) Benchmark ---- + * + * Copyright (c) 2010 See AUTHORS file for copyright holders + * + * This file is part of the unbalanced tree search benchmark. This + * project is licensed under the MIT Open Source license. 
See the LICENSE + * file for copyright and licensing information. + * + * UTS is a collaborative project between researchers at the University of + * Maryland, the University of North Carolina at Chapel Hill, and the Ohio + * State University. See AUTHORS file for more information. + * + */ + +#include +#include +#include +#include +#include + +#include "uts/uts.h" + +/*********************************************************** + * tree generation and search parameters * + * * + * Tree generation strategy is controlled via various * + * parameters set from the command line. The parameters * + * and their default values are given below. * + ***********************************************************/ + +char *uts_trees_str[] = {"Binomial", "Geometric", "Hybrid", "Balanced"}; +char *uts_geoshapes_str[] = {"Linear decrease", "Exponential decrease", "Cyclic", "Fixed branching factor"}; + +/* Tree type + * Trees are generated using a Galton-Watson process, in + * which the branching factor of each node is a random + * variable. + * + * The random variable can follow a binomial distribution + * or a geometric distribution. Hybrid tree are + * generated with geometric distributions near the + * root and binomial distributions towards the leaves. + */ +tree_t type = GEO; // Default tree type +double b_0 = 4.0; // default branching factor at the root +int rootId = 0; // default seed for RNG state at root + +/* Tree type BIN (BINOMIAL) + * The branching factor at the root is specified by b_0. + * The branching factor below the root follows an + * identical binomial distribution at all nodes. + * A node has m children with prob q, or no children with + * prob (1-q). The expected branching factor is q * m. + * + * Default parameter values + */ +int nonLeafBF = 4; // m +double nonLeafProb = 15.0 / 64.0; // q + +/* Tree type GEO (GEOMETRIC) + * The branching factor follows a geometric distribution with + * expected value b. 
+ * The probability that a node has 0 <= n children is p(1-p)^n for + * 0 < p <= 1. The distribution is truncated at MAXNUMCHILDREN. + * The expected number of children b = (1-p)/p. Given b (the + * target branching factor) we can solve for p. + * + * A shape function computes a target branching factor b_i + * for nodes at depth i as a function of the root branching + * factor b_0 and a maximum depth gen_mx. + * + * Default parameter values + */ +int gen_mx = 6; // default depth of tree +geoshape_t shape_fn = LINEAR; // default shape function (b_i decr linearly) + +/* In type HYBRID trees, each node is either type BIN or type + * GEO, with the generation strategy changing from GEO to BIN + * at a fixed depth, expressed as a fraction of gen_mx + */ +double shiftDepth = 0.5; + +/* compute granularity - number of rng evaluations per tree node */ +int computeGranularity = 1; + +/* display parameters */ +int debug = 0; +int verbose = 1; + +/*********************************************************** + * * + * FUNCTIONS * + * * + ***********************************************************/ + +/* fatal error */ +void uts_error(char *str) { + printf("*** Error: %s\n", str); + impl_abort(1); +} + +/* + * wall clock time + * for detailed accounting of work, this needs + * high resolution + */ +double uts_wctime() { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return (tv.tv_sec + 1E-9 * tv.tv_nsec); +} + +// Interpret 32 bit positive integer as value on [0,1) +double rng_toProb(int n) { + if (n < 0) { + printf("*** toProb: rand n = %d out of range\n", n); + } + return ((n < 0) ? 
0.0 : ((double)n) / 2147483648.0); +} + +void uts_initRoot(Node *root, int type) { + root->type = type; + root->height = 0; + root->numChildren = -1; // means not yet determined + rng_init(root->state.state, rootId); + + if (debug & 1) + printf("root node of type %d at %p\n", type, root); +} + +int uts_numChildren_bin(Node *parent) { + // distribution is identical everywhere below root + int v = rng_rand(parent->state.state); + double d = rng_toProb(v); + + return (d < nonLeafProb) ? nonLeafBF : 0; +} + +int uts_numChildren_geo(Node *parent) { + double b_i = b_0; + int depth = parent->height; + int numChildren, h; + double p, u; + + // use shape function to compute target b_i + if (depth > 0) { + switch (shape_fn) { + + // expected size polynomial in depth + case EXPDEC: + b_i = b_0 * pow((double)depth, -log(b_0) / log((double)gen_mx)); + break; + + // cyclic tree size + case CYCLIC: + if (depth > 5 * gen_mx) { + b_i = 0.0; + break; + } + b_i = pow(b_0, sin(2.0 * 3.141592653589793 * (double)depth / (double)gen_mx)); + break; + + // identical distribution at all nodes up to max depth + case FIXED: + b_i = (depth < gen_mx) ? b_0 : 0; + break; + + // linear decrease in b_i + case LINEAR: + default: + b_i = b_0 * (1.0 - (double)depth / (double)gen_mx); + break; + } + } + + // given target b_i, find prob p so expected value of + // geometric distribution is b_i. 
+ p = 1.0 / (1.0 + b_i); + + // get uniform random number on [0,1) + h = rng_rand(parent->state.state); + u = rng_toProb(h); + + // max number of children at this cumulative probability + // (from inverse geometric cumulative density function) + numChildren = (int)floor(log(1 - u) / log(1 - p)); + + return numChildren; +} + +int uts_numChildren(Node *parent) { + int numChildren = 0; + + /* Determine the number of children */ + switch (type) { + case BIN: + if (parent->height == 0) + numChildren = (int)floor(b_0); + else + numChildren = uts_numChildren_bin(parent); + break; + + case GEO: + numChildren = uts_numChildren_geo(parent); + break; + + case HYBRID: + if (parent->height < shiftDepth * gen_mx) + numChildren = uts_numChildren_geo(parent); + else + numChildren = uts_numChildren_bin(parent); + break; + case BALANCED: + if (parent->height < gen_mx) + numChildren = (int)b_0; + break; + default: + uts_error("parTreeSearch(): Unknown tree type"); + } + + // limit number of children + // only a BIN root can have more than MAXNUMCHILDREN + if (parent->height == 0 && parent->type == BIN) { + int rootBF = (int)ceil(b_0); + if (numChildren > rootBF) { + printf("*** Number of children of root truncated from %d to %d\n", numChildren, rootBF); + numChildren = rootBF; + } + } else if (type != BALANCED) { + if (numChildren > MAXNUMCHILDREN) { + printf("*** Number of children truncated from %d to %d\n", numChildren, MAXNUMCHILDREN); + numChildren = MAXNUMCHILDREN; + } + } + + return numChildren; +} + +int uts_childType(Node *parent) { + switch (type) { + case BIN: + return BIN; + case GEO: + return GEO; + case HYBRID: + if (parent->height < shiftDepth * gen_mx) + return GEO; + else + return BIN; + case BALANCED: + return BALANCED; + default: + uts_error("uts_get_childtype(): Unknown tree type"); + return -1; + } +} + +// construct string with all parameter settings +int uts_paramsToStr(char *strBuf, int ind) { + // version + execution model + ind += sprintf(strBuf + ind, "UTS 
- Unbalanced Tree Search %s (%s)\n", UTS_VERSION, impl_getName()); + + // tree type + ind += sprintf(strBuf + ind, "Tree type: %d (%s)\n", type, uts_trees_str[type]); + + // tree shape parameters + ind += sprintf(strBuf + ind, "Tree shape parameters:\n"); + ind += sprintf(strBuf + ind, " root branching factor b_0 = %.1f, root seed = %d\n", b_0, rootId); + + if (type == GEO || type == HYBRID) { + ind += sprintf(strBuf + ind, + " GEO parameters: gen_mx = %d, shape function = %d (%s)\n", + gen_mx, + shape_fn, + uts_geoshapes_str[shape_fn]); + } + + if (type == BIN || type == HYBRID) { + double q = nonLeafProb; + int m = nonLeafBF; + double es = (1.0 / (1.0 - q * m)); + ind += + sprintf(strBuf + ind, " BIN parameters: q = %f, m = %d, E(n) = %f, E(s) = %.2f\n", q, m, q * m, es); + } + + if (type == HYBRID) { + ind += sprintf( + strBuf + ind, " HYBRID: GEO from root to depth %d, then BIN\n", (int)ceil(shiftDepth * gen_mx)); + } + + if (type == BALANCED) { + ind += sprintf(strBuf + ind, " BALANCED parameters: gen_mx = %d\n", gen_mx); + ind += sprintf(strBuf + ind, + " Expected size: %llu nodes, %llu leaves\n", + (counter_t)((pow(b_0, gen_mx + 1) - 1.0) / (b_0 - 1.0)) /* geometric series */, + (counter_t)pow(b_0, gen_mx)); + } + + // random number generator + ind += sprintf(strBuf + ind, "Random number generator: "); + ind = rng_showtype(strBuf, ind); + ind += sprintf(strBuf + ind, "\nCompute granularity: %d\n", computeGranularity); + + return ind; +} + +// show parameter settings +void uts_printParams() { + char strBuf[5000] = ""; + int ind = 0; + + if (verbose > 0) { + ind = uts_paramsToStr(strBuf, ind); + ind = impl_paramsToStr(strBuf, ind); + printf("%s\n", strBuf); + } +} + +void uts_parseParams(int argc, char *argv[]) { + int i = 1; + int err = -1; + while (i < argc && err == -1) { + if (argv[i][0] == '-' && argv[i][1] == 'h') { + uts_helpMessage(); + impl_abort(0); + + } else if (argv[i][0] != '-' || strlen(argv[i]) != 2 || argc <= i + 1) { + err = i; + break; + } + 
+ // Matched by implementation -- return 0 on success + // This is fragile, don't override parameters in impl_parseParam()! + if (!impl_parseParam(argv[i], argv[i + 1])) { + i += 2; + continue; + } + + switch (argv[i][1]) { + case 'q': + nonLeafProb = atof(argv[i + 1]); + break; + case 'm': + nonLeafBF = atoi(argv[i + 1]); + break; + case 'r': + rootId = atoi(argv[i + 1]); + break; + case 'x': + debug = atoi(argv[i + 1]); + break; + case 'v': + verbose = atoi(argv[i + 1]); + break; + case 't': + type = (tree_t)atoi(argv[i + 1]); + if (type != BIN && type != GEO && type != HYBRID && type != BALANCED) + err = i; + break; + case 'a': + shape_fn = (geoshape_t)atoi(argv[i + 1]); + if (shape_fn > FIXED) + err = i; + break; + case 'b': + b_0 = atof(argv[i + 1]); + break; + case 'd': + gen_mx = atoi(argv[i + 1]); + break; + case 'f': + shiftDepth = atof(argv[i + 1]); + break; + case 'g': + computeGranularity = max(1, atoi(argv[i + 1])); + break; + default: + err = i; + } + + if (err != -1) + break; + + i += 2; + } + + if (err != -1) { + printf("Unrecognized parameter or incorrect/missing value: '%s %s'\n", + argv[i], + (i + 1 < argc) ? 
argv[i + 1] : "[none]"); + printf("Try -h for help.\n"); + impl_abort(4); + } +} + +void uts_helpMessage() { + printf(" UTS - Unbalanced Tree Search %s (%s)\n\n", UTS_VERSION, impl_getName()); + printf(" usage: uts-bin [parameter value] ...\n\n"); + printf(" parameter type description\n"); + printf(" ==== ==== =========================================\n"); + printf("\n Benchmark Parameters:\n"); + printf(" -t int tree type (0: BIN, 1: GEO, 2: HYBRID, 3: BALANCED)\n"); + printf(" -b dble root branching factor\n"); + printf(" -r int root seed 0 <= r < 2^31 \n"); + printf(" -a int GEO: tree shape function \n"); + printf(" -d int GEO, BALANCED: tree depth\n"); + printf(" -q dble BIN: probability of non-leaf node\n"); + printf(" -m int BIN: number of children for non-leaf node\n"); + printf(" -f dble HYBRID: fraction of depth for GEO -> BIN transition\n"); + printf(" -g int compute granularity: number of rng_spawns per node\n"); + printf(" -v int nonzero to set verbose output\n"); + printf(" -x int debug level\n"); + + // Get help message from the implementation + printf("\n Additional Implementation Parameters:\n"); + impl_helpMessage(); + printf("\n"); +} + +void uts_showStats( + int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth) { + // summarize execution info for machine consumption + if (verbose == 0) { + printf("%4d %7.3f %9llu %7.0llu %7.0llu %d %d %.2f %d %d %1d %f %3d\n", + nPes, + walltime, + nNodes, + (long long)(nNodes / walltime), + (long long)((nNodes / walltime) / nPes), + chunkSize, + type, + b_0, + rootId, + gen_mx, + shape_fn, + nonLeafProb, + nonLeafBF); + } + + // summarize execution info for human consumption + else { + printf("Tree size = %llu, tree depth = %llu, num leaves = %llu (%.2f%%)\n", + nNodes, + maxDepth, + nLeaves, + nLeaves / (float)nNodes * 100.0); + printf("Wallclock time = %.3f sec, performance = %.0f nodes/sec (%.0f nodes/sec per PE)\n\n", + walltime, + (nNodes / walltime), + (nNodes / 
walltime / nPes)); + } +} + +// --------------------------------------------------------------------- // + +// The name of this implementation +char *impl_getName() { return "Sequential Recursive Search"; } + +int impl_paramsToStr(char *strBuf, int ind) { + ind += sprintf(strBuf + ind, "Execution strategy: %s\n", impl_getName()); + return ind; +} + +// Not using UTS command line params, return non-success +int impl_parseParam(char *param, char *value) { + return 1; + (void)param; + (void)value; +} + +void impl_helpMessage() { printf(" none.\n"); } + +void impl_abort(int err) { exit(err); } \ No newline at end of file diff --git a/benchmark/lib/CMakeLists.txt b/benchmark/lib/CMakeLists.txt new file mode 100644 index 000000000..46fd98c80 --- /dev/null +++ b/benchmark/lib/CMakeLists.txt @@ -0,0 +1,23 @@ +add_library(benchmark_common) + +target_compile_features(benchmark_common PUBLIC cxx_std_26) + +target_sources(benchmark_common + PRIVATE + uts.cpp + PUBLIC + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + fib.hpp + uts.hpp + macros.hpp +) + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../external/uts external/uts) + +target_link_libraries(benchmark_common + PUBLIC + benchmark::benchmark + uts_c +) diff --git a/benchmark/lib/fib.hpp b/benchmark/lib/fib.hpp new file mode 100644 index 000000000..08e11ec80 --- /dev/null +++ b/benchmark/lib/fib.hpp @@ -0,0 +1,31 @@ +#pragma once + +#ifdef LF_BENCH_NO_IMPORT_STD + #include +#else +import std; +#endif + +inline constexpr int fib_test = 8; +inline constexpr int fib_base = 37; + +/** + * @brief Non-recursive Fibonacci calculation + */ +constexpr auto fib_ref(std::int64_t n) -> std::int64_t { + + if (n < 2) { + return n; + } + + std::int64_t prev = 0; + std::int64_t curr = 1; + + for (std::int64_t i = 2; i <= n; ++i) { + std::int64_t next = prev + curr; + prev = curr; + curr = next; + } + + return curr; +} diff --git a/benchmark/lib/macros.hpp b/benchmark/lib/macros.hpp new file mode 100644 index 
000000000..a32642f43 --- /dev/null +++ b/benchmark/lib/macros.hpp @@ -0,0 +1,172 @@ +#pragma once + +#include + +// Use `import std;` by default. Textually `#include ` drags in +// ``, which triggers a libc++ 22 link-time bug (undefined +// `__atomic_unique_lock::__set_locked_bit`) in TUs that later instantiate +// anything touching std::stop_*. Targets that can't use modules (e.g. the +// openmp benchmarks, see benchmark/src/openmp/CMakeLists.txt) define +// LF_BENCH_NO_IMPORT_STD and get textual includes instead. +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include +#else +import std; +#endif + +#define BENCH_GET_FN(bench_fn, ...) bench_fn __VA_OPT__(<__VA_ARGS__>) + +namespace lf_bench { + +inline void bench_thread_args(benchmark::Benchmark *bench, auto make_args) { + unsigned hw = std::max(1U, std::thread::hardware_concurrency()); + for (unsigned t : {1U, 2U, 4U, 6U, 8U, 12U, 16U, 24U, 32U, 48U, 64U, 96U}) { + if (t > hw) { + return; + } + make_args(bench, t); + } +} + +inline auto sanitize(std::string s) -> std::string { + s.erase(std::remove(s.begin(), s.end(), ' '), s.end()); + return s; +} + +inline auto +format_name(std::string mode, std::string category, std::string name, std::string args) -> std::string { + std::string res = sanitize(mode) + "/" + sanitize(category) + "/" + sanitize(name); + std::string s_args = sanitize(args); + if (!s_args.empty()) { + res += "/" + s_args; + } + return res; +} + +inline auto inverse_complexity(benchmark::IterationCount n) -> double { return 1.0 / static_cast(n); } + +inline void setup_single(benchmark::Benchmark *b, std::int64_t size) { b->Arg(size)->UseRealTime(); } + +inline void setup_mt(benchmark::Benchmark *b, std::int64_t size) { + b->Apply([size](benchmark::Benchmark *bm) { + bench_thread_args(bm, [size](benchmark::Benchmark *inner_b, unsigned t) { + inner_b->Args({size, static_cast(t)}); + }); + }) + ->Complexity(inverse_complexity) + ->UseRealTime(); +} + +inline void 
setup_uts_mt(benchmark::Benchmark *b) { + b->Apply([](benchmark::Benchmark *bm) { + bench_thread_args(bm, [](benchmark::Benchmark *inner_b, unsigned t) { + inner_b->Arg(static_cast(t)); + }); + }) + ->Complexity(inverse_complexity) + ->UseRealTime(); +} + +} // namespace lf_bench + +// --- Standard Benchmarks --- + +#define BENCH_ONE_WITH_ID(id, bench_fn, category, name, mode, prefix, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__), \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)); \ + lf_bench::setup_single(b, prefix##_##mode); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define BENCH_ONE_HIDDEN(id, ...) BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define BENCH_ONE(bench_fn, category, name, mode, prefix, ...) \ + BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define BENCH_ALL(bench_fn, category, name, prefix, ...) \ + BENCH_ONE(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__) + +// --- Multi-Threaded Benchmarks --- + +#define BENCH_ONE_MT_WITH_ID(id, bench_fn, category, name, mode, prefix, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__), \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)); \ + lf_bench::setup_mt(b, prefix##_##mode); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define BENCH_ONE_MT_HIDDEN(id, ...) BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define BENCH_ONE_MT(bench_fn, category, name, mode, prefix, ...) \ + BENCH_ONE_MT_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define BENCH_ALL_MT(bench_fn, category, name, prefix, ...) 
\ + BENCH_ONE_MT(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE_MT(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__) + +// --- UTS Benchmarks --- + +#define UTS_BENCH_ONE_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark( \ + lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__), \ + [=](benchmark::State &state) { \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id); \ + }); \ + b->UseRealTime(); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define UTS_BENCH_ONE_HIDDEN(id, ...) UTS_BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define UTS_BENCH_ONE(bench_fn, category, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define UTS_BENCH_ALL(bench_fn, category, ...) \ + UTS_BENCH_ONE(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__) + +// --- UTS Multi-Threaded Benchmarks --- + +#define UTS_BENCH_ONE_MT_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...) 
\ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark( \ + lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__), \ + [=](benchmark::State &state) { \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id); \ + }); \ + lf_bench::setup_uts_mt(b); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define UTS_BENCH_ONE_MT_HIDDEN(id, ...) UTS_BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define UTS_BENCH_ONE_MT(bench_fn, category, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_MT_HIDDEN( \ + __COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define UTS_BENCH_ALL_MT(bench_fn, category, ...) \ + UTS_BENCH_ONE_MT(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/lib/uts.cpp b/benchmark/lib/uts.cpp new file mode 100644 index 000000000..90964d48c --- /dev/null +++ b/benchmark/lib/uts.cpp @@ -0,0 +1,159 @@ +#include "uts.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include +#else +import std; +#endif + +namespace { + +void reset_uts() { + type = GEO; + b_0 = 4.0; + rootId = 0; + nonLeafBF = 4; + nonLeafProb = 15.0 / 64.0; + gen_mx = 6; + shape_fn = LINEAR; + shiftDepth = 0.5; + computeGranularity = 1; + debug = 0; + verbose = 1; +} + +// (T1 mini) Geometric +void setup_t1_mini() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 7; + b_0 = 4; + rootId = 19; +} + +// (T1) Geometric +void setup_t1() { + reset_uts(); 
+ type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 10; + b_0 = 4; + rootId = 19; +} + +// (T1L) Geometric +void setup_t1l() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 13; + b_0 = 4; + rootId = 29; +} + +// (T1XXL) +void setup_t1xxl() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 15; + b_0 = 4; + rootId = 19; +} + +// (T3 mini) +void setup_t3_mini() { + reset_uts(); + type = static_cast(0); + b_0 = 20; + nonLeafBF = 8; + nonLeafProb = 0.124875; + rootId = 42; +} + +// (T3) Binomial +void setup_t3() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 8; + nonLeafProb = 0.124875; + rootId = 42; +} + +// (T3L) Binomial +void setup_t3l() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 5; + nonLeafProb = 0.200014; + rootId = 7; +} + +// (T3XXL) Binomial +void setup_t3xxl() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 2; + nonLeafProb = 0.499995; + rootId = 316; +} + +} // namespace + +void setup_tree(uts_tree tree) { + switch (tree) { + case uts_t1_mini: + setup_t1_mini(); + break; + case uts_t1: + setup_t1(); + break; + case uts_t1l: + setup_t1l(); + break; + case uts_t1xxl: + setup_t1xxl(); + break; + case uts_t3_mini: + setup_t3_mini(); + break; + case uts_t3: + setup_t3(); + break; + case uts_t3l: + setup_t3l(); + break; + case uts_t3xxl: + setup_t3xxl(); + break; + default: + std::terminate(); + } +} + +auto expected_result(uts_tree tree) -> result { + switch (tree) { + case uts_t1_mini: + return {.maxdepth = 7, .size = 63914, .leaves = 51124}; + case uts_t1: + return {.maxdepth = 10, .size = 4130071, .leaves = 3305118}; + case uts_t1l: + return {.maxdepth = 13, .size = 102181082, .leaves = 81746377}; + case uts_t1xxl: + return {.maxdepth = 15, .size = 4230646601, .leaves = 3384495738}; + case uts_t3_mini: + return {.maxdepth = 67, .size = 6213, .leaves = 5438}; + case uts_t3: + return {.maxdepth = 1572, .size = 
4112897, .leaves = 3599034}; + case uts_t3l: + return {.maxdepth = 17844, .size = 111345631, .leaves = 89076904}; + case uts_t3xxl: + return {.maxdepth = 99049, .size = 2793220501, .leaves = 1396611250}; + default: + std::terminate(); + } +} diff --git a/benchmark/lib/uts.hpp b/benchmark/lib/uts.hpp new file mode 100644 index 000000000..2d3a2065f --- /dev/null +++ b/benchmark/lib/uts.hpp @@ -0,0 +1,50 @@ +#pragma once + +// Include the C UTS library header first (it defines max/min macros that would +// clash with std::max/std::min after import std). +#include "uts/uts.h" + +#undef max +#undef min + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include +#else +import std; +#endif + +struct result { + counter_t maxdepth; + counter_t size; + counter_t leaves; + auto operator<=>(const result &) const = default; +}; + +template <> +struct std::formatter : std::formatter { + auto format(const result &r, auto &ctx) const { + return std::formatter::format( + std::format("{{maxdepth={}, size={}, leaves={}}}", r.maxdepth, r.size, r.leaves), ctx); + } +}; + +struct pair { + result res; + Node child; +}; + +enum uts_tree : char { + uts_t1_mini, // Geometric [fixed], ~64K nodes (test only) + uts_t1, // Geometric [fixed], ~4M nodes + uts_t1l, // Geometric [fixed], ~102M nodes + uts_t1xxl, // Geometric [fixed], ~4.2B nodes + uts_t3_mini, // Binomial, ~6K nodes (test only) + uts_t3, // Binomial, ~4M nodes + uts_t3l, // Binomial, ~111M nodes + uts_t3xxl, // Binomial, ~2.8B nodes +}; + +void setup_tree(uts_tree tree); + +auto expected_result(uts_tree tree) -> result; diff --git a/benchmark/src/baremetal/CMakeLists.txt b/benchmark/src/baremetal/CMakeLists.txt new file mode 100644 index 000000000..ea6f1e69f --- /dev/null +++ b/benchmark/src/baremetal/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(baremetal_benchmarks) + +target_sources(baremetal_benchmarks PRIVATE fib.cpp) + +target_link_libraries(baremetal_benchmarks PUBLIC benchmark_common libfork::libfork) diff --git 
a/benchmark/src/baremetal/fib.cpp b/benchmark/src/baremetal/fib.cpp new file mode 100644 index 000000000..247336594 --- /dev/null +++ b/benchmark/src/baremetal/fib.cpp @@ -0,0 +1,177 @@ +#include + +#include "fib.hpp" +#include "macros.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +// ==== Allocators ==== // + +[[nodiscard]] +inline auto fib_align_size(std::size_t n) -> std::size_t { + constexpr std::size_t align = __STDCPP_DEFAULT_NEW_ALIGNMENT__; + return (n + align - 1) & ~(align - 1); +} + +constinit inline thread_local std::byte *tls_bump_ptr = nullptr; + +struct task { + struct promise_type { + + static auto operator new(std::size_t sz) -> void * { + auto *prev = tls_bump_ptr; + tls_bump_ptr += fib_align_size(sz); + return prev; + } + + static auto operator delete(void *p, [[maybe_unused]] std::size_t sz) noexcept -> void { + tls_bump_ptr = std::bit_cast(p); + } + + auto get_return_object() -> task { return {std::coroutine_handle::from_promise(*this)}; } + + auto initial_suspend() -> std::suspend_always { return {}; } + + auto final_suspend() noexcept { + struct final_awaitable : std::suspend_always { + auto await_suspend(std::coroutine_handle h) noexcept -> std::coroutine_handle<> { + + std::coroutine_handle<> cont = h.promise().continuation; + + h.destroy(); + + if (cont) { + return cont; + } + + return std::noop_coroutine(); + } + }; + + return final_awaitable{}; + } + + void return_value(std::int64_t val) { *value = val; } + void unhandled_exception() { std::terminate(); } + + std::int64_t *value = nullptr; + std::coroutine_handle<> continuation = nullptr; + }; + + std::coroutine_handle coro; + + auto set(std::int64_t &out) -> task & { + coro.promise().value = &out; + return *this; + } + + auto await_ready() noexcept -> bool { return false; } + + auto await_suspend(std::coroutine_handle<> h) -> std::coroutine_handle { + coro.promise().continuation = h; + return coro; + } + + void await_resume() noexcept {} +}; + +auto 
fib(std::int64_t n) -> task { + if (n <= 1) { + co_return n; + } + std::int64_t a = 0; + std::int64_t b = 0; + co_await fib(n - 2).set(a); + co_await fib(n - 1).set(b); + co_return a + b; +} + +template +void fib_coro_no_queue(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + // 8MB stack + std::unique_ptr buffer = std::make_unique(1024 * 1024 * 8); + tls_bump_ptr = buffer.get(); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = 0; + fib(n).set(result).coro.resume(); + + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + + benchmark::DoNotOptimize(result); + } + + if (tls_bump_ptr != buffer.get()) { + std::terminate(); // Stack leak + } +} + +// === Recursive with Deque overhead + +constinit inline thread_local lf::deque *tls_deque = nullptr; + +auto deque() -> lf::deque & { return *tls_deque; } + +auto fib_recursive_deque_impl(std::int64_t n) -> std::int64_t { + if (n <= 1) { + return n; + } + + // Emulate work item creation/scheduling overhead + deque().push(n); + std::int64_t a = fib_recursive_deque_impl(n - 2); + deque().pop(); + + std::int64_t b = fib_recursive_deque_impl(n - 1); + + return a + b; +} + +template +void fib_recursive_deque(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + lf::deque deque{64}; + tls_deque = &deque; + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = fib_recursive_deque_impl(n); + + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + + benchmark::DoNotOptimize(result); + } + + tls_deque = nullptr; +} + +} // namespace + +// Minimal coroutine, bump allocated (thread-local) stack +BENCH_ALL(fib_coro_no_queue, baremetal, coro, fib) + 
+BENCH_ALL(fib_recursive_deque, baremetal, deque, fib) diff --git a/benchmark/src/benchmarks.cpp b/benchmark/src/benchmarks.cpp new file mode 100644 index 000000000..d6cf26f54 --- /dev/null +++ b/benchmark/src/benchmarks.cpp @@ -0,0 +1 @@ +// Benchmarks are registered in the linked sub-libraries. diff --git a/benchmark/src/libfork/CMakeLists.txt b/benchmark/src/libfork/CMakeLists.txt new file mode 100644 index 000000000..2e2def872 --- /dev/null +++ b/benchmark/src/libfork/CMakeLists.txt @@ -0,0 +1,17 @@ +add_library(libfork_benchmarks) + +target_sources(libfork_benchmarks + PRIVATE + fib.cpp uts.cpp + PRIVATE + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + helpers.hpp +) + +target_link_libraries(libfork_benchmarks + PUBLIC + benchmark_common + libfork::libfork +) diff --git a/benchmark/src/libfork/fib.cpp b/benchmark/src/libfork/fib.cpp new file mode 100644 index 000000000..beaace9c7 --- /dev/null +++ b/benchmark/src/libfork/fib.cpp @@ -0,0 +1,94 @@ +#include + +#include "fib.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +struct fib { + template + static auto operator()(lf::env, std::int64_t n) -> lf::task { + if (n < 2) { + co_return n; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + auto sc = co_await lf::scope(); + + co_await sc.fork(&rhs, fib{}, n - 2); + co_await sc.call(&lhs, fib{}, n - 1); + + co_await sc.join(); + + co_return lhs + rhs; + } +}; + +template +void run(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + state.counters["p"] = static_cast(thread_count(state)); + state.SetComplexityN(static_cast(thread_count(state))); + + Sch scheduler = make_scheduler(state); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + lf::receiver recv = lf::schedule(scheduler, fib{}, n); + std::int64_t return_value = std::move(recv).get(); + + if (return_value != expect) { + 
state.SkipWithError(std::format("incorrect result: {} != {}", return_value, expect)); + break; + } + + benchmark::DoNotOptimize(return_value); + } +} + +} // namespace + +using lf::adapt_deque; +using lf::adapt_vector; + +using lf::adaptor_stack; +using lf::geometric_stack; +using lf::slab_stack; + +// -- Vector + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +// -- Deque + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL_MT(run, fib, fib, mono_busy_pool) +LIBFORK_BENCH_ALL_MT(run, fib, fib, poly_busy_pool) diff --git a/benchmark/src/libfork/helpers.hpp b/benchmark/src/libfork/helpers.hpp new file mode 100644 index 000000000..8fbef45dc --- /dev/null +++ b/benchmark/src/libfork/helpers.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include + +#include "macros.hpp" + +import std; + +import libfork; + +template +auto thread_count(benchmark::State &state) -> std::size_t { + if constexpr (std::constructible_from) { + return static_cast(state.range(1)); + } else { + return 1; + } +} + +template +auto make_scheduler(benchmark::State &state) -> Sch { + if constexpr (std::constructible_from) { + return 
Sch{static_cast(state.range(1))}; + } else { + return Sch{}; + } +} + +using mono_busy_pool = lf::mono_busy_pool>; +using poly_busy_pool = lf::poly_busy_pool>; + +#define LIBFORK_BENCH_ALL(bench_fn, name, prefix, ...) \ + BENCH_ALL(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_BENCH_ALL_MT(bench_fn, name, prefix, ...) \ + BENCH_ALL_MT(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_UTS_BENCH_ONE_MT(bench_fn, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_MT(bench_fn, libfork, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_UTS_BENCH_ALL_MT(bench_fn, ...) UTS_BENCH_ALL_MT(bench_fn, libfork __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/src/libfork/uts.cpp b/benchmark/src/libfork/uts.cpp new file mode 100644 index 000000000..91ef3f793 --- /dev/null +++ b/benchmark/src/libfork/uts.cpp @@ -0,0 +1,93 @@ +#include + +#include "uts.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +// TODO: try a version that uses try_fork + +struct uts_fn { + template + static auto operator()(lf::env, int depth, Node *parent) -> lf::task { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + auto sc = co_await lf::scope(); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + if (i + 1 == static_cast(num_children)) { + co_await sc.call(&cs[i].res, uts_fn{}, depth + 1, &cs[i].child); + } else { + co_await sc.fork(&cs[i].res, uts_fn{}, depth + 1, 
&cs[i].child); + } + } + + co_await sc.join(); + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + co_return r; + } +}; + +template +void run(benchmark::State &state, uts_tree tree) { + setup_tree(tree); + auto expect = expected_result(tree); + + std::size_t threads = static_cast(state.range(0)); + state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); + + Sch scheduler = Sch{threads}; + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + lf::receiver recv = lf::schedule(scheduler, uts_fn{}, 0, &root); + result r = std::move(recv).get(); + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +LIBFORK_UTS_BENCH_ALL_MT(run, mono_busy_pool) +LIBFORK_UTS_BENCH_ALL_MT(run, poly_busy_pool) diff --git a/benchmark/src/openmp/CMakeLists.txt b/benchmark/src/openmp/CMakeLists.txt new file mode 100644 index 000000000..035d5326f --- /dev/null +++ b/benchmark/src/openmp/CMakeLists.txt @@ -0,0 +1,23 @@ +add_library(openmp_benchmarks) + +# OpenMP compiles with -fopenmp which conflicts with the shared std.pcm (built +# without OpenMP). Disable module scanning so CMake doesn't inject the +# incompatible modmap for this target. +set_target_properties(openmp_benchmarks PROPERTIES CXX_SCAN_FOR_MODULES OFF) + +# TODO: remove this hack when we have LLVM 23 + +# Signal to shared benchmark headers that this target cannot `import std;` +# and must use textual standard headers instead. 
+target_compile_definitions(openmp_benchmarks PRIVATE LF_BENCH_NO_IMPORT_STD) + +target_sources(openmp_benchmarks + PRIVATE + fib.cpp uts.cpp +) + +target_link_libraries(openmp_benchmarks + PUBLIC + benchmark_common + OpenMP::OpenMP_CXX +) diff --git a/benchmark/src/openmp/fib.cpp b/benchmark/src/openmp/fib.cpp new file mode 100644 index 000000000..4643ee67d --- /dev/null +++ b/benchmark/src/openmp/fib.cpp @@ -0,0 +1,59 @@ +#include +#include + +#include + +#include "fib.hpp" +#include "macros.hpp" + +namespace { + +auto fib(std::int64_t n) -> std::int64_t { + if (n < 2) { + return n; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + +#pragma omp task untied shared(lhs) firstprivate(n) default(none) + lhs = fib(n - 2); + + rhs = fib(n - 1); + +#pragma omp taskwait + return lhs + rhs; +} + +template +void fib_run(benchmark::State &state) { + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + int threads = static_cast(state.range(1)); + + state.counters["n"] = static_cast(n); + state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t return_value = 0; + +#pragma omp parallel num_threads(threads) default(shared) +#pragma omp single nowait + { + return_value = fib(n); + } + + if (return_value != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", return_value, expect)); + break; + } + + benchmark::DoNotOptimize(return_value); + } +} + +} // namespace + +BENCH_ALL_MT(fib_run, openmp, fib, fib) diff --git a/benchmark/src/openmp/uts.cpp b/benchmark/src/openmp/uts.cpp new file mode 100644 index 000000000..b91d9e1e5 --- /dev/null +++ b/benchmark/src/openmp/uts.cpp @@ -0,0 +1,86 @@ +#include +#include +#include +#include + +#include + +#include "macros.hpp" +#include "uts.hpp" + +namespace { + +auto uts(int depth, Node *parent) -> result { + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = 
counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + if (i + 1 == static_cast(num_children)) { + cs[i].res = uts(depth + 1, &cs[i].child); + } else { +#pragma omp task untied shared(cs) firstprivate(depth, i) default(none) + cs[i].res = uts(depth + 1, &cs[i].child); + } + } + +#pragma omp taskwait + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_run(benchmark::State &state, uts_tree tree) { + int threads = static_cast(state.range(0)); + + setup_tree(tree); + auto expect = expected_result(tree); + + state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + result r; + +#pragma omp parallel num_threads(threads) default(shared) +#pragma omp single nowait + { + r = uts(0, &root); + } + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +UTS_BENCH_ALL_MT(uts_run, openmp) diff --git a/benchmark/src/serial/CMakeLists.txt b/benchmark/src/serial/CMakeLists.txt new file mode 100644 index 000000000..80c53454c --- /dev/null +++ b/benchmark/src/serial/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(serial_benchmarks) + +target_sources(serial_benchmarks PRIVATE fib.cpp uts.cpp) + +target_link_libraries(serial_benchmarks PUBLIC benchmark_common) 
diff --git a/benchmark/src/serial/fib.cpp b/benchmark/src/serial/fib.cpp new file mode 100644 index 000000000..2b65104ef --- /dev/null +++ b/benchmark/src/serial/fib.cpp @@ -0,0 +1,78 @@ +#include + +#include "fib.hpp" +#include "macros.hpp" + +import std; + +namespace { + +auto fib_impl(std::int64_t &ret, std::int64_t n) -> void { + if (n < 2) { + ret = n; + return; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + fib_impl(lhs, n - 2); + fib_impl(rhs, n - 1); + + ret = lhs + rhs; +} + +template +void fib_serial(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = 0; + fib_impl(result, n); + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + benchmark::DoNotOptimize(result); + } +} + +auto fib_ret_impl(std::int64_t n) -> std::int64_t { + if (n < 2) { + return n; + } + + std::int64_t lhs = fib_ret_impl(n - 1); + std::int64_t rhs = fib_ret_impl(n - 2); + + return lhs + rhs; +} + +template +void fib_serial_return(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = fib_ret_impl(n); + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + benchmark::DoNotOptimize(result); + } +} + +} // namespace + +BENCH_ALL(fib_serial, serial, fib, fib) +BENCH_ALL(fib_serial_return, serial, fib / return, fib) diff --git a/benchmark/src/serial/uts.cpp b/benchmark/src/serial/uts.cpp new file mode 100644 index 000000000..291934e79 --- /dev/null +++ b/benchmark/src/serial/uts.cpp @@ -0,0 +1,125 @@ +#include + +#include "macros.hpp" +#include "uts.hpp" + +import std; + +namespace { + +auto uts_traverse(int 
depth, Node *parent) -> result { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + cs[i].res = uts_traverse(depth + 1, &cs[i].child); + } + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_serial(benchmark::State &state, uts_tree tree) { + setup_tree(tree); + auto expect = expected_result(tree); + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + result r = uts_traverse(0, &root); + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +UTS_BENCH_ALL(uts_serial, serial) + +namespace { + +auto uts_traverse_no_alloc(int depth, Node *parent) -> result { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + + pair cs; + + cs.child.type = child_type; + cs.child.height = parent->height + 1; + cs.child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs.child.state.state, static_cast(i)); + } + + cs.res = uts_traverse(depth + 1, 
&cs.child); + + r.maxdepth = std::max(r.maxdepth, cs.res.maxdepth); + r.size += cs.res.size; + r.leaves += cs.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_serial_no_alloc(benchmark::State &state, uts_tree tree) { + setup_tree(tree); + auto expect = expected_result(tree); + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + result r = uts_traverse_no_alloc(0, &root); + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +UTS_BENCH_ALL(uts_serial_no_alloc, serial / no_alloc) diff --git a/cmake/gcc-brew-toolchain.cmake b/cmake/gcc-brew-toolchain.cmake new file mode 100644 index 000000000..aa67ccaaf --- /dev/null +++ b/cmake/gcc-brew-toolchain.cmake @@ -0,0 +1,92 @@ +cmake_minimum_required(VERSION 4.2.1) + +# Set up Homebrew GCC@15 & Ninja toolchain for CMake + +find_program(BREW_EXE brew) + +if(NOT BREW_EXE) + message(FATAL_ERROR "Could not find 'brew' executable. 
Please install Homebrew.") +endif() + +# --- Ninja + +execute_process( + COMMAND ${BREW_EXE} --prefix ninja + OUTPUT_VARIABLE NINJA_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_MAKE_PROGRAM + NAMES ninja + HINTS "${NINJA_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- GCC + +execute_process( + COMMAND ${BREW_EXE} --prefix gcc + OUTPUT_VARIABLE GCC_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_C_COMPILER + NAMES gcc-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_CXX_COMPILER + NAMES g++-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_AR + NAMES gcc-ar-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_RANLIB + NAMES gcc-ranlib-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_NM + NAMES gcc-nm-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- Binutils + +execute_process( + COMMAND ${BREW_EXE} --prefix binutils + OUTPUT_VARIABLE BINUTILS_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -B${BINUTILS_PREFIX}/bin/" CACHE STRING "" FORCE) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -B${BINUTILS_PREFIX}/bin/" CACHE STRING "" FORCE) + + +# Get macOS SDK path (only on macOS) +if(APPLE) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY + ) +endif() diff --git a/cmake/llvm-brew-toolchain.cmake b/cmake/llvm-brew-toolchain.cmake new file mode 100644 index 000000000..199bdae34 --- /dev/null +++ b/cmake/llvm-brew-toolchain.cmake @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 4.2.1) + +# Set up Homebrew LLVM & Ninja toolchain for CMake + +find_program(BREW_EXE brew) + +if(NOT BREW_EXE) + message(FATAL_ERROR "Could not find 'brew' executable. 
Please install Homebrew.") +endif() + +# --- Ninja + +execute_process( + COMMAND ${BREW_EXE} --prefix ninja + OUTPUT_VARIABLE NINJA_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_MAKE_PROGRAM + NAMES ninja + HINTS "${NINJA_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- LLVM + +execute_process( + COMMAND ${BREW_EXE} --prefix llvm + OUTPUT_VARIABLE LLVM_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_C_COMPILER + NAMES clang + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_CXX_COMPILER + NAMES clang++ + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_AR + NAMES llvm-ar + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_RANLIB + NAMES llvm-ranlib + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_NM + NAMES llvm-nm + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# Dynamically find the standard library modules JSON, brew puts it in the wrong place +file(GLOB_RECURSE LIBCXX_MODULES_JSON "${LLVM_PREFIX}/lib/**/libc++.modules.json") + +if(LIBCXX_MODULES_JSON) + set(CMAKE_CXX_STDLIB_MODULES_JSON "${LIBCXX_MODULES_JSON}") +else() + message(FATAL_ERROR "Could not automatically find libc++.modules.json in ${LLVM_PREFIX}") +endif() + +# Get macOS SDK path (only on macOS) +if(APPLE) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY + ) +endif() diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 000000000..edf767d82 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,287 @@ +# libfork Public API + +All symbols live in the `lf` namespace. Access them via `import libfork;`. + +--- + +## Core concepts + +### `concept returnable` — `:task` + +`T` is `void` or a `std::movable` plain object type. 
Used as the return-type constraint on async functions. + +### `concept worker_stack` — `:concepts_stack` + +A type that provides a contiguous stack with `push`, `pop`, `checkpoint`, `prepare_release`, `release`, and `acquire`. + +### `concept lifo_stack` — `:concepts_context` + +`T` is a plain object type supporting `push(U)` and a `noexcept pop() -> U`. Used to define `worker_context`. + +### `concept worker_context` — `:concepts_context` + +A type that satisfies `lifo_stack>` and exposes a `worker_stack` via a `noexcept stack()`. + +### `using stack_t` — `:concepts_context` + +Extracts the stack type from a `worker_context`. + +### `concept has_context_typedef` — `:concepts_scheduler` + +`T` has a `context_type` member typedef. Used to define `scheduler` and constrain `context_t`. + +### `concept scheduler` — `:concepts_scheduler` + +A type satisfying `has_context_typedef` with a `post(sched_handle)` method. + +### `using context_t` — `:concepts_scheduler` + +Extracts `T::context_type`. Requires `has_context_typedef`. + +### `concept async_invocable` — `:concepts_invocable` + +`Fn` is callable with an `env` (or without one) and `Args...`, returning an `lf::task`. + +### `concept async_nothrow_invocable` — `:concepts_invocable` + +Subsumes `async_invocable` and requires the call to be `noexcept`. + +### `using async_result_t` — `:concepts_invocable` + +The `value_type` of the `task` returned by invoking `Fn`. + +### `concept async_invocable_to` — `:concepts_invocable` + +Subsumes `async_invocable` and constrains the result type to `R`. + +### `concept async_nothrow_invocable_to` — `:concepts_invocable` + +Subsumes both `async_nothrow_invocable` and `async_invocable_to`. + +--- + +## Coroutine types + +### `struct env` — `:task` + +The Y-combinator environment. Passed as the first argument to every async function, allowing recursive self-calls. Users declare it but never construct it directly. + +### `class task` — `:task` + +The return type of all async functions. 
`T` must satisfy `returnable`. Users never store or manipulate instances — the type exists solely to identify libfork coroutines. + +--- + +## Handles + +### `struct unsafe_steal_handle` — `:handles` + +Untyped steal handle. Used by `deque_policy` implementations to store handles without knowing the full context type. + +### `struct unsafe_sched_handle` — `:handles` + +Untyped schedule handle. Used when type-erasing across context types. + +### `struct steal_handle` — `:handles` + +Typed handle to a task suspended at a fork point; passed to `context.push()` and returned by `context.pop()` / `context.steal()`. + +### `struct sched_handle` — `:handles` + +Typed handle to a task ready to be started or resumed; passed to `scheduler::post()` and `execute()`. + +--- + +## Scope and task operations + +### `constexpr auto scope() -> scope_type` — `:ops` + +Primary entry point for fork/call/join. `co_await` this to obtain a `scope_ops` which provides: + +- `.fork(ret, fn, args...)` / `.fork(fn, args...)` / `.fork_drop(fn, args...)` — spawn concurrent child +- `.call(ret, fn, args...)` / `.call(fn, args...)` / `.call_drop(fn, args...)` — inline child call +- `.join()` — wait for all outstanding children + +### `constexpr auto child_scope() -> child_scope_type` — `:ops` + +Entry point for a cancellable scope. `co_await` this to obtain a `child_scope_ops`, which extends `scope_ops` and also inherits from `stop_source`. Tasks forked/called through this scope receive the scope's stop token. + +--- + +## Cancellation + +### `class stop_source` — `:stop` + +A non-copyable, non-movable stop source. Methods: `token()`, `stop_possible()`, `stop_requested()`, `request_stop()`, `race_request_stop()`. + +### `class stop_source::stop_token` — `:stop` + +Lightweight copyable token. Methods: `stop_possible()`, `stop_requested()`. + +--- + +## Scheduling + +### `class recv_state` — `:receiver` + +Pre-allocated shared state for a root task. 
Constructors mirror `make_shared` / `allocate_shared`: + +```cpp +recv_state s; // default-init +recv_state s{42}; // in-place init +recv_state s{std::allocator_arg, alloc}; // custom allocator +recv_state s{std::allocator_arg, alloc, 42}; // custom allocator + in-place init +recv_state s; // cancellable variant +``` + +Move-only. Pass to `schedule()` to get back a `receiver`. + +### `class receiver` — `:receiver` + +Handle to the result of a scheduled root task. Methods: + +- `.valid()` — whether the receiver is connected to state +- `.ready()` — whether the task has completed +- `.wait()` — block until complete (may be called multiple times) +- `.stop_source()` — access the stop source (only when `Stoppable = true`) +- `.get()` — consume the result, rethrowing any stored exception; throws `operation_cancelled_error` if cancelled + +### `auto schedule(Sch&&, recv_state, Fn&&, Args&&...) -> receiver` — `:schedule` + +Schedule an async function as a root task using a pre-allocated `recv_state`. + +### `auto schedule(Sch&&, Fn&&, Args&&...) -> receiver` — `:schedule` + +Convenience overload: default-constructs a non-cancellable `recv_state`. + +### `void execute(Context&, sched_handle)` — `:execute` + +Bind the calling thread to `context` and resume the scheduled task. Used by scheduler implementations. + +### `void execute(Context&, steal_handle)` — `:execute` + +Bind the calling thread to `context` and resume a stolen task. Used by scheduler implementations. + +--- + +## Polymorphic context base classes + +### `class base_context` — `:poly_context` + +CRTP base providing `stack()` -> `Stack&`. Inherit from this (or `poly_context`) when implementing a custom context. + +### `class poly_context` — `:poly_context` + +Abstract base for polymorphic contexts. Provides pure-virtual `push(steal_handle)`, `pop()`, and a defaulting `post(sched_handle)` that throws `post_error`. 
+ +--- + +## Exception hierarchy + +All exceptions derive from `lf::libfork_exception : std::exception`. + +| Type | Thrown by | Condition | +| --------------------------- | ---------------------- | ---------------------------------------- | +| `libfork_exception` | — | Base type; catch-all for libfork errors | +| `schedule_error` | `schedule()` | Called from a worker thread | +| `execute_error` | `execute()` | Called from a worker thread | +| `steal_overflow_error` | `execute()` | A single task stolen > 65,535 times | +| `root_alloc_error` | `schedule()` | Root frame too large for inline buffer | +| `broken_receiver_error` | `receiver` methods | Receiver is in an invalid state | +| `operation_cancelled_error` | `receiver::get()` | Task was cancelled via stop token | +| `post_error` | `poly_context::post()` | Derived context does not override `post` | +| `deque_full_error` | `deque::push()` | Deque has reached maximum capacity | + +--- + +## Batteries: stacks + +All stacks satisfy `worker_stack`. Template parameter is an allocator for `std::byte`. + +### `class geometric_stack` — `:geometric_stack` + +Segmented stack with geometric growth and segment caching. Recommended default. + +### `class adaptor_stack` — `:adaptor_stack` + +Thin allocator-backed stack; allocates/deallocates on every push/pop. + +### `class slab_stack` — `:slab_stack` + +Fixed-capacity slab stack; throws on overflow. + +--- + +## Batteries: deque and adaptors + +### `class deque` — `:deque` + +Lock-free Chase-Lev work-stealing deque. `T` must be `lock_free` and `default_initializable`. Methods: `push(T)`, `pop() -> std::optional`, `get_thief() -> thief_handle`. + +### `class deque::thief_handle` — `:deque` + +Non-owning steal handle obtained via `deque::get_thief()`. Method: `steal(Fn on_empty) -> std::optional`. + +### `enum class err` — `:deque` + +Return code from low-level steal operations: `none`, `lost`, `empty`. 
+ +### `struct steal_t` — `:deque` + +Steal result wrapper returned by `thief_handle::steal`. Has `err code` and `T val` fields; `operator bool` tests `code == err::none`. + +### `class adapt_vector` — `:adaptors` + +`std::vector`-backed LIFO deque policy. Satisfies `deque_policy`. + +### `class adapt_deque` — `:adaptors` + +Lock-free deque-backed policy. Satisfies both `deque_policy` and `stealable_deque_policy`. + +--- + +## Batteries: context policies and contexts + +### `concept deque_policy` — `:contexts` + +A type that is a LIFO stack over `unsafe_steal_handle` (has `push` and `pop`). + +### `concept stealable_deque_policy` — `:contexts` + +Extends `deque_policy` with a `steal() -> unsafe_steal_handle` method for FIFO work stealing. + +### `class mono_context` — `:contexts` + +Monomorphic worker context. Composes a `worker_stack` and a `deque_policy`. Satisfies `worker_context`. Exposes `steal()` when `Deque` satisfies `stealable_deque_policy`. + +### `class derived_poly_context` — `:contexts` + +Polymorphic worker context. Derives from `poly_context` and implements `push`/`pop` via `Deque`. Exposes `steal()` when `Deque` satisfies `stealable_deque_policy`. The `context_type` alias is `poly_context`. + +--- + +## Schedulers + +### `concept derived_worker_context` — `:inline_scheduler` + +`Context` has a `context_type` typedef and is derived from it (i.e., it is a concrete subclass of its own context type). + +### `class inline_scheduler` — `:inline_scheduler` + +Single-threaded synchronous scheduler. Stores one `Context` instance; `post()` calls `execute()` directly on the calling thread. + +### `enum class pool_kind` — `:basic_busy_pool` + +`mono` — uses `mono_context`; `poly` — uses `derived_poly_context`. + +### `class basic_busy_pool` — `:basic_busy_pool` + +Work-stealing thread pool using busy-wait. Spawns `N` worker threads (default: `std::thread::hardware_concurrency()`). Constructor: `basic_busy_pool(n_threads)`. 
+ +### `using mono_busy_pool` — `:basic_busy_pool` + +Alias for `basic_busy_pool`. + +### `using poly_busy_pool` — `:basic_busy_pool` + +Alias for `basic_busy_pool`. diff --git a/docs/structure.md b/docs/structure.md new file mode 100644 index 000000000..cb1e5203d --- /dev/null +++ b/docs/structure.md @@ -0,0 +1,24 @@ +# Structure of libfork + +Libfork is organized into several modules: + +- `libfork`: Meta module that re-exports all public modules + - tuple + - etc +- `libfork.utils`: Independent internal utilities, not part of the public API +- `libfork.core`: Core functionality of libfork including: + - Task template + - Task handles + - Concepts for context/stack/scheduler + - Fork/call primitives + - Execute primitives (for starting work) + - Schedule primitive (for launching work) + - Polymorphic context ABC + - \[internal\] Promise/frame + - \[internal\] Thread locals +- `libfork.batteries`: Collection of context, stack and other types + - The `::stacks` namespace + - Contexts + - adaptors +- `libfork.schedulers`: Collection of schedulers + - Inline scheduler diff --git a/docs/tour.md b/docs/tour.md index 4c1800dd3..45b7b1fe5 100644 --- a/docs/tour.md +++ b/docs/tour.md @@ -25,7 +25,7 @@ Definitions: - __Parent:__ A task that spawns other tasks. - __Child:__ A task that is spawned by another task. -The tasking/fork-join interface is designed to mirror [Cilk](https://en.wikipedia.org/wiki/Cilk) and other fork-join frameworks. The best way to learn is by example, lets start with the canonical introduction to fork-join, the recursive Fibonacci function, in regular C++ it looks like this: +The tasking/fork-join interface is designed to mirror [Cilk](https://en.wikipedia.org/wiki/Cilk) and other fork-join frameworks. 
The best way to learn is by example, let's start with the canonical introduction to fork-join, the recursive Fibonacci function, in regular C++ it looks like this: ```cpp auto fib(int n) -> int { diff --git a/include/libfork/__impl/assume.hpp b/include/libfork/__impl/assume.hpp new file mode 100644 index 000000000..b40cd4fab --- /dev/null +++ b/include/libfork/__impl/assume.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include "libfork/__impl/exception.hpp" + +/** + * @file assume.hpp + * + * @brief A collection of internal macros. + * + * These macros are not safe to use unless `import std` is in scope. + */ + +/** + * @brief If expr evaluates to `false`, terminates the program with an error message. + * + * This macro is always active, regardless of optimization settings or `NDEBUG`. + */ +#define LF_ENSURE(...) \ + do { \ + if (!(__VA_ARGS__)) { \ + LF_TERMINATE("Assumption '" #__VA_ARGS__ "' failed!"); \ + } \ + } while (false) + +/** + * @brief Invokes undefined behavior if ``expr`` evaluates to `false`. + * + * \rst + * + * .. warning:: + * + * This has different semantics than ``[[assume(expr)]]`` as it WILL evaluate the + * expression at runtime. Hence you should conservatively only use this macro + * if ``expr`` is side-effect free and cheap to evaluate. + * + * \endrst + */ +#ifdef NDEBUG + #define LF_ASSUME(...) \ + do { \ + if (!(__VA_ARGS__)) { \ + ::std::unreachable(); \ + } \ + } while (false) +#else + #define LF_ASSUME(...) 
LF_ENSURE(__VA_ARGS__) +#endif + +#ifdef NDEBUG + #define LF_UNREACHABLE() \ + do { \ + ::std::unreachable(); \ + } while (false) +#else + #define LF_UNREACHABLE() LF_TERMINATE("This code should be unreachable!"); +#endif diff --git a/include/libfork/__impl/compiler.hpp b/include/libfork/__impl/compiler.hpp new file mode 100644 index 000000000..8e71083fc --- /dev/null +++ b/include/libfork/__impl/compiler.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "libfork/__impl/exception.hpp" + +/** + * @file compiler.hpp + * + * @brief A collection of internal macros. + * + * These macros are standalone i.e. they can be used without importing/including anything else. + */ + +// =============== Inlining/optimization =============== // + +/** + * @brief Macro to use next to 'inline' to force a function to be inlined. + * + * \rst + * + * .. note:: + * + * This does not imply the c++'s `inline` keyword which also has an effect on linkage. + * + * \endrst + */ +#if !defined(LF_FORCE_INLINE) + #if defined(_MSC_VER) && !defined(__clang__) + #define LF_FORCE_INLINE __forceinline + #elif defined(__GNUC__) && __GNUC__ > 3 + // Clang also defines __GNUC__ (as 4) + #define LF_FORCE_INLINE __attribute__((__always_inline__)) + #else + #define LF_FORCE_INLINE + #endif +#endif + +/** + * @brief Macro to prevent a function to be inlined. 
+ */ +#if !defined(LF_NO_INLINE) + #if defined(_MSC_VER) && !defined(__clang__) + #define LF_NO_INLINE __declspec(noinline) + #elif defined(__GNUC__) && __GNUC__ > 3 + // Clang also defines __GNUC__ (as 4) + #if defined(__CUDACC__) + // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392 + #define LF_NO_INLINE __attribute__((noinline)) + #elif defined(__HIP__) + // See https://github.com/boostorg/config/issues/392 + #define LF_NO_INLINE __attribute__((noinline)) + #else + #define LF_NO_INLINE __attribute__((__noinline__)) + #endif + #else + #define LF_NO_INLINE + #endif +#endif diff --git a/include/libfork/__impl/exception.hpp b/include/libfork/__impl/exception.hpp new file mode 100644 index 000000000..9868d7e75 --- /dev/null +++ b/include/libfork/__impl/exception.hpp @@ -0,0 +1,78 @@ +#pragma once + +/** + * @file exception.hpp + * + * @brief A collection of internal macros for exception handling. + * + * These macros are standalone i.e. they can be used without importing/including anything else. + */ + +/** + * @brief Detects if the compiler has exceptions enabled. + * + * Overridable by defining `LF_COMPILER_EXCEPTIONS` globally. + */ +#ifndef LF_COMPILER_EXCEPTIONS + #if defined(__cpp_exceptions) || (defined(_MSC_VER) && defined(_CPPUNWIND)) || defined(__EXCEPTIONS) + #define LF_COMPILER_EXCEPTIONS 1 + #else + #define LF_COMPILER_EXCEPTIONS 0 + #endif +#endif + +namespace lf::impl { + +/** + * @brief Calls `std::terminate` after printing `msg`. + */ +[[noreturn]] +void terminate_with(char const *message, char const *file, int line) noexcept; + +} // namespace lf::impl + +#define LF_TERMINATE(message) ::lf::impl::terminate_with((message), __FILE__, __LINE__) + +#if LF_COMPILER_EXCEPTIONS + /** + * @brief Expands to ``try`` if exceptions are enabled, otherwise expands to ``if constexpr (true)``. + */ + #define LF_TRY try + /** + * @brief Expands to ``catch (...)`` if exceptions are enabled, otherwise ``if constexpr (false)``. 
+ */ + #define LF_CATCH_ALL catch (...) + /** + * @brief Expands to ``catch (__VA_ARGS__)`` if exceptions are enabled, otherwise ``if constexpr (false)``. + */ + #define LF_CATCH(...) catch (__VA_ARGS__) + /** + * @brief Expands to ``throw X`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_THROW(X) throw X + /** + * @brief Expands to ``throw`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_RETHROW throw +#else + /** + * @brief Expands to ``try`` if exceptions are enabled, otherwise expands to ``if constexpr (true)``. + */ + #define LF_TRY if constexpr (true) + /** + * @brief Expands to ``catch (...)`` if exceptions are enabled, otherwise ``if constexpr (false)``. + */ + #define LF_CATCH_ALL if constexpr (false) + /** + * @brief Expands to ``catch (__VA_ARGS__)`` if exceptions are enabled, otherwise ``if constexpr (false)``. + */ + #define LF_CATCH(...) if constexpr (false) + /** + * @brief Expands to ``throw X`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_THROW(X) LF_TERMINATE("Tried to throw '" #X "' without compiler exceptions") + /** + * @brief Expands to ``throw`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_RETHROW LF_TERMINATE("Tried to rethrow without compiler exceptions") +#endif diff --git a/include/libfork/__impl/utils.hpp b/include/libfork/__impl/utils.hpp new file mode 100644 index 000000000..2adbf49c9 --- /dev/null +++ b/include/libfork/__impl/utils.hpp @@ -0,0 +1,32 @@ +#pragma once + +/** + * @file utils.hpp + * + * @brief A collection of internal utility macros. + * + * These macros are not safe to use unless `import std` is in scope. + */ + +// =============== Utility =============== // + +// clang-format off + +/** + * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/decltype specifiers. + * + * This macro is not truly variadic but the ``...`` allows commas in the macro argument. 
+ */ +#define LF_HOF(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) { return __VA_ARGS__;} + +// clang-format on + +/** + * @brief Use like `std::forward` to perfectly forward an expression. + */ +#define LF_FWD(...) ::std::forward(__VA_ARGS__) + +/** + * @brief Use to define a `T` that is aligned to the required alignment of `std::atomic_ref`. + */ +#define ATOMIC_ALIGN(T) alignas(std::atomic_ref::required_alignment) T diff --git a/include/libfork/version.hpp b/include/libfork/version.hpp index d048ad622..03f18744e 100644 --- a/include/libfork/version.hpp +++ b/include/libfork/version.hpp @@ -1,3 +1,5 @@ +#pragma once + /** * @brief __[public]__ The major version of libfork. * diff --git a/src/batteries/adaptor_stack.cxx b/src/batteries/adaptor_stack.cxx new file mode 100644 index 000000000..2c333b7b7 --- /dev/null +++ b/src/batteries/adaptor_stack.cxx @@ -0,0 +1,117 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:adaptor_stack; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief An adaptor_stack wraps a standard allocator to satisfy the worker_stack concept. + * + * Every push/pop directly allocates/deallocates through the allocator. This is the simplest + * possible stack implementation — no caching, no geometric growth — just a thin wrapper. + * + * For this to conform to `worker_stack` the allocators void pointer type must be `void *`. 
+ */ +export template Allocator = std::allocator> +class adaptor_stack { + + struct alignas(k_new_align) aligned {}; + + static_assert(sizeof(aligned) == k_new_align); + + using align_trait = std::allocator_traits::template rebind_traits; + using align_alloc = align_trait::allocator_type; + + using alloc_ptr = align_trait::pointer; + using void_ptr = align_trait::void_pointer; + + using size_type = align_trait::size_type; + + struct release_t { + explicit constexpr release_t(key_t /*unused*/) noexcept {} + }; + + class checkpoint_t { + public: + constexpr checkpoint_t() noexcept = default; + constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default; + + private: + friend adaptor_stack; + explicit constexpr checkpoint_t(align_alloc const &alloc) noexcept + : m_alloc(alloc) {} + + struct empty { + constexpr empty() noexcept = default; + constexpr auto operator==(empty const &) const noexcept -> bool = default; + explicit constexpr empty(align_alloc const & /*unused*/) noexcept {} + }; + + std::conditional_t m_alloc; + }; + + public: + constexpr adaptor_stack() noexcept(noexcept(Allocator())) + : adaptor_stack(Allocator()) {} + explicit constexpr adaptor_stack(Allocator const &alloc) noexcept + : m_alloc(alloc) {} + + // TODO: drop constexpr for =default and =delete across the lib + + constexpr adaptor_stack(adaptor_stack const &) = delete; + constexpr adaptor_stack(adaptor_stack &&) = delete; + + constexpr auto operator=(adaptor_stack const &) -> adaptor_stack & = delete; + constexpr auto operator=(adaptor_stack &&) -> adaptor_stack & = delete; + + /** + * @brief Get a checkpoint of the stack. + */ + [[nodiscard]] + constexpr auto checkpoint() noexcept -> checkpoint_t { + return checkpoint_t{m_alloc}; + } + + /** + * @brief Allocate size bytes and return a pointer to the allocation. 
+ */ + [[nodiscard]] + constexpr auto push(std::size_t size) -> void_ptr { + LF_ASSUME(size > 0); + size_type num_aligned = safe_cast(((size - 1) / k_new_align) + 1); + return static_cast(align_trait::allocate(m_alloc, num_aligned)); + } + + /** + * @brief Deallocate the allocation at ptr of size n. + */ + constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t size) noexcept { + LF_ASSUME(size > 0); + size_type num_aligned = safe_cast(((size - 1) / k_new_align) + 1); + align_trait::deallocate(m_alloc, static_cast(ptr), num_aligned); + } + + [[nodiscard]] + constexpr auto prepare_release() const noexcept -> release_t { + return release_t{key()}; + } + + constexpr void release([[maybe_unused]] release_t key) noexcept {} + + constexpr void acquire(checkpoint_t const &ckpt) noexcept { + if constexpr (!align_trait::is_always_equal::value) { + m_alloc = ckpt.m_alloc; + } + } + + private: + align_alloc m_alloc; +}; + +} // namespace lf diff --git a/src/batteries/adaptors.cxx b/src/batteries/adaptors.cxx new file mode 100644 index 000000000..243fe043a --- /dev/null +++ b/src/batteries/adaptors.cxx @@ -0,0 +1,71 @@ +export module libfork.batteries:adaptors; + +import std; + +import libfork.core; +import libfork.utils; + +import :deque; + +namespace lf { + +export template Allocator = std::allocator> +class adapt_vector { + public: + constexpr adapt_vector() noexcept(noexcept(Allocator())) + : adapt_vector(Allocator()) {} + + explicit constexpr adapt_vector(Allocator const &alloc) noexcept + : m_vector(alloc) {} + + constexpr void push(unsafe_steal_handle value) { m_vector.push_back(value); } + + constexpr auto pop() noexcept -> unsafe_steal_handle { + if (!m_vector.empty()) { + unsafe_steal_handle value = m_vector.back(); + m_vector.pop_back(); + return value; + } + return {}; + } + + private: + std::vector m_vector; +}; + +export template > Allocator = + std::allocator>> +class adapt_deque { + public: + using size_type = deque::size_type; + + private: + static 
constexpr size_type k_default_capacity = 1024 * 32; + + public: + constexpr adapt_deque() + : adapt_deque(k_default_capacity, Allocator()) {} + + explicit constexpr adapt_deque(size_type capacity, Allocator const &alloc = Allocator()) + : m_deque{capacity, alloc} {} + + constexpr void push(unsafe_steal_handle value) { m_deque.push(value); } + + constexpr auto pop() noexcept -> unsafe_steal_handle { + return m_deque.pop([] static noexcept -> unsafe_steal_handle { + return {}; + }); + } + + constexpr auto steal() noexcept -> unsafe_steal_handle { + if (auto [_, result] = m_deque.thief().steal()) { + return result; + } + return {}; + } + + private: + deque m_deque; +}; + +} // namespace lf diff --git a/src/batteries/batteries.cxx b/src/batteries/batteries.cxx new file mode 100644 index 000000000..c58f0e23b --- /dev/null +++ b/src/batteries/batteries.cxx @@ -0,0 +1,8 @@ +export module libfork.batteries; + +export import :deque; +export import :geometric_stack; +export import :adaptor_stack; +export import :slab_stack; +export import :adaptors; +export import :contexts; diff --git a/src/batteries/contexts.cxx b/src/batteries/contexts.cxx new file mode 100644 index 000000000..15319beee --- /dev/null +++ b/src/batteries/contexts.cxx @@ -0,0 +1,115 @@ +module; +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:contexts; + +import std; + +import libfork.core; +import libfork.utils; + +namespace lf { + +// =================== Context Policies =================== // + +/** + * @brief The simplest context policy is just a LIFO stack of type-erased handles. + * + * Context policies (unlike full contexts) are not aware of the full context + * type hence, operate on untyped handles. This is inherently unsafe. To + * prevent UB a policy must not give-out the handles it receives. All + * operations must be managed through either `derived_poly_context` or + * `mono_context`. 
+ */ +export template +concept deque_policy = lifo_stack; + +// TODO: consider the methods/concepts needed for a auto/scheduling worker +// context that has a `post` method. + +/** + * @brief An extension of `deque_policy` that supports FIFO stealing of handles. + */ +export template +concept stealable_deque_policy = deque_policy && requires (T &policy) { + { policy.steal() } -> std::same_as; +}; + +// =================== Contexts =================== // + +template +class context_base : public Base { + public: + constexpr context_base() = default; + + template + requires std::constructible_from && std::constructible_from + constexpr context_base( + std::piecewise_construct_t, + std::tuple stack_args, + std::tuple deque_args) noexcept(std::is_nothrow_constructible_v && + std::is_nothrow_constructible_v) + : context_base(std::move(stack_args), + std::move(deque_args), + std::index_sequence_for{}, + std::index_sequence_for{}) {} + + [[nodiscard]] + constexpr auto steal() noexcept(noexcept(m_container.steal())) -> steal_handle + requires stealable_deque_policy + { + return {key(), get(key(), m_container.steal())}; + } + + protected: + Deque m_container; + + private: + template + constexpr context_base( + std::tuple stack_args, + std::tuple deque_args, + std::index_sequence, + std::index_sequence) noexcept(std::is_nothrow_constructible_v && + std::is_nothrow_constructible_v) + : Base(std::get(std::move(stack_args))...), + m_container(std::get(std::move(deque_args))...) {} +}; + +/** + * @brief A polymorphic worker context composed of a `worker_stack` and a `deque_policy`. 
+ */ +export template +class derived_poly_context : public context_base, Stack, Deque, poly_context> { + using base = context_base, Stack, Deque, poly_context>; + + public: + using context_type = poly_context; + + using base::base; + + constexpr void push(steal_handle handle) final { this->m_container.push(handle); } + + constexpr auto pop() noexcept -> steal_handle final { + return {key(), get(key(), this->m_container.pop())}; + } +}; + +export template +class mono_context : public context_base, Stack, Deque, mono_context> { + using base = context_base, Stack, Deque, mono_context>; + + public: + using context_type = mono_context; + + using base::base; + + constexpr void push(steal_handle handle) noexcept(noexcept(this->m_container.push(handle))) { + this->m_container.push(handle); + } + + constexpr auto pop() noexcept -> steal_handle { + return {key(), get(key(), this->m_container.pop())}; + } +}; + +} // namespace lf diff --git a/src/batteries/deque.cxx b/src/batteries/deque.cxx new file mode 100644 index 000000000..bdc17dcc9 --- /dev/null +++ b/src/batteries/deque.cxx @@ -0,0 +1,462 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:deque; + +import std; + +import libfork.core; +import libfork.utils; + +namespace lf { + +/** + * @brief Test if a type is suitable for use with `lf::deque`. + * + * This requires it to be `lf::lock_free` and `std::default_initializable`. + */ +export template +concept dequeable = lock_free && std::default_initializable; + +/** + * @brief Thrown when a push operation fails because the deque is full. + */ +export struct deque_full_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "push failed because deque is full"; + } +}; + +/** + * @brief A basic wrapper around a c-style array that provides modulo load/stores. + * + * This class is designed for internal use only.
It provides a c-style API that is + * used efficiently by deque for low level atomic operations. + * + * @tparam T The type of the elements in the array. + */ +template > Allocator> +struct atomic_ring_buf { + private: + using traits = std::allocator_traits; + using pointer = traits::pointer; + + public: + using diff_type = traits::difference_type; + using size_type = traits::size_type; + + /** + * @brief Construct a new ring buff object + * + * @param cap The capacity of the buffer, MUST be a power of 2. + * @param alloc The allocator used to allocate the buffer. + */ + constexpr atomic_ring_buf(diff_type cap, Allocator const &alloc) + : m_alloc{alloc}, + m_cap{cap}, + m_mask{cap - 1} { + + LF_ASSUME(cap > 0 && std::has_single_bit(safe_cast(cap))); + + m_buf = traits::allocate(m_alloc, safe_cast(cap)); + + diff_type i = 0; + + LF_TRY { + // Begin the lifetime of each atomic. + for (; i < cap; ++i) { + traits::construct(m_alloc, std::to_address(m_buf + i)); + } + } LF_CATCH_ALL { + clean_up(i); + LF_RETHROW; + } + } + + atomic_ring_buf(atomic_ring_buf const &) = delete; + atomic_ring_buf(atomic_ring_buf &&) = delete; + auto operator=(atomic_ring_buf const &) -> atomic_ring_buf & = delete; + auto operator=(atomic_ring_buf &&) -> atomic_ring_buf & = delete; + + constexpr ~atomic_ring_buf() noexcept { clean_up(m_cap); } + + /** + * @brief Get the capacity of the buffer. + */ + [[nodiscard]] + constexpr auto capacity() const noexcept -> diff_type { + return m_cap; + } + /** + * @brief Store ``val`` at ``index % this->capacity()``. + */ + constexpr auto store(diff_type index, T const &val) noexcept -> void { + LF_ASSUME(index >= 0); + std::to_address(m_buf + (index & m_mask))->store(val, std::memory_order_relaxed); + } + /** + * @brief Load value at ``index % this->capacity()``. 
+ */ + [[nodiscard]] + constexpr auto load(diff_type index) const noexcept -> T { + LF_ASSUME(index >= 0); + return std::to_address(m_buf + (index & m_mask))->load(std::memory_order_relaxed); + } + + private: + /** + * @brief Destroy the first `n` elements and deallocate the buffer. + */ + constexpr void clean_up(diff_type n) noexcept { + + LF_ASSUME(0 <= n && n <= m_cap); + + for (diff_type i = n - 1; i >= 0; --i) { + traits::destroy(m_alloc, std::to_address(m_buf + i)); + } + traits::deallocate(m_alloc, m_buf, safe_cast(m_cap)); + } + + [[no_unique_address]] + Allocator m_alloc; + pointer m_buf{}; + diff_type m_cap; + diff_type m_mask; +}; + +/** + * @brief Error codes for ``deque`` 's ``steal()`` operation. + */ +export enum class err : std::uint8_t { + /** + * @brief The ``steal()`` operation succeeded. + */ + none = 0, + /** + * @brief Lost the ``steal()`` race hence, the ``steal()`` operation failed. + */ + lost, + /** + * @brief The deque is empty and hence, the ``steal()`` operation failed. + */ + empty, +}; + +/** + * @brief The return type of a `lf::deque` `steal()` operation. + * + * This type is suitable for structured bindings. We return a custom type instead of a + * `std::optional` to allow for more information to be returned as to why a steal may fail. + */ +export template +struct steal_t { + /** + * @brief Check if the operation succeeded. + */ + [[nodiscard]] + constexpr explicit operator bool() const noexcept { + return code == err::none; + } + /** + * @brief Get the value like ``std::optional``. + * + * Requires ``code == err::none`` . + */ + [[nodiscard]] + constexpr auto operator*() const noexcept -> T { + LF_ASSUME(code == err::none); + return val; + } + /** + * @brief Get the value ``like std::optional``. + * + * Requires ``code == err::none`` . 
+ */ + [[nodiscard]] + constexpr auto operator->() const noexcept -> T const * { + LF_ASSUME(code == err::none); + return std::addressof(val); + } + /** + * @brief The error code of the ``steal()`` operation. + */ + err code; + /** + * @brief The value stolen from the deque. Only valid if ``code == err::none``. + */ + T val; +}; + +/** + * @brief A functor that returns ``std::nullopt``. + */ +template +struct return_nullopt { + /** + * @brief Returns ``std::nullopt``. + */ + [[nodiscard]] + static constexpr auto operator()() noexcept -> std::optional { + return {}; + } +}; + +/** + * @brief A bounded lock-free single-producer multiple-consumer work-stealing deque. + * + * Implements the "Chase-Lev" deque described in the papers, `"Dynamic Circular Work-Stealing deque" + * `_ and `"Correct and Efficient Work-Stealing for Weak + * Memory Models" `_. + * + * Only the deque owner can perform ``pop()`` and ``push()`` operations where the deque behaves + * like a LIFO stack. Others can (only) ``steal()`` data from the deque; they see a FIFO deque. + * All threads must have finished using the deque before it is destructed. + * + * Also see: + + * - Rust: https://github.com/crossbeam-rs/crossbeam/blob/master/crossbeam-deque/src/deque.rs + * - CDSC: https://dl.acm.org/doi/epdf/10.1145/2544173.2509514 + * + * @tparam T The type of the elements in the deque. + */ +export template > Allocator = std::allocator>> +class deque { + public: + using diff_type = atomic_ring_buf::diff_type; + using size_type = atomic_ring_buf::size_type; + + using value_type = T; + using allocator_type = Allocator; + + deque(deque const &) = delete; + deque(deque &&) = delete; + auto operator=(deque const &) -> deque & = delete; + auto operator=(deque &&) -> deque & = delete; + + /** + * @brief A non-owning handle that can be used to steal items from the deque. + * + * All non-owner interactions with the deque should be made through this handle.
+ */ + class thief_handle { + + friend class deque; + + explicit thief_handle(deque *queue) noexcept + : m_queue{queue} { + LF_ASSUME(queue != nullptr); + } + + public: + /** + * @brief Check if the deque is empty. + */ + [[nodiscard]] + constexpr auto empty(this thief_handle self) noexcept -> bool { + diff_type const top = self.m_queue->m_top.load(acquire); + std::atomic_thread_fence(seq_cst); + diff_type const bottom = self.m_queue->m_bottom.load(acquire); + return top >= bottom; + } + /** + * @brief Get the number of elements in the deque. + */ + [[nodiscard]] + constexpr auto size(this thief_handle self) noexcept -> size_type { + return safe_cast(self.ssize()); + } + /** + * @brief Get the number of elements in the deque as a signed integer. + */ + [[nodiscard]] + constexpr auto ssize(this thief_handle self) noexcept -> diff_type { + diff_type const top = self.m_queue->m_top.load(acquire); + std::atomic_thread_fence(seq_cst); + diff_type const bottom = self.m_queue->m_bottom.load(acquire); + return std::max(bottom - top, diff_type{0}); + } + /** + * @brief Get the capacity of the deque. + */ + [[nodiscard]] + constexpr auto capacity(this thief_handle self) noexcept -> diff_type { + return self.m_queue->capacity(); + } + /** + * @brief Steal an item from the deque. + * + * Any threads can try to steal an item from the deque. This operation can + * fail if the deque is empty or if another thread simultaneously stole an + * item from the deque. + */ + constexpr auto steal(this thief_handle self) noexcept -> steal_t { + // + diff_type top = self.m_queue->m_top.load(acquire); + std::atomic_thread_fence(seq_cst); + diff_type const bottom = self.m_queue->m_bottom.load(acquire); + + if (top < bottom) { + // Must load *before* acquiring the slot as slot may be overwritten immediately after + // acquiring. 
This load is NOT required to be atomic even though it may race with an overwrite + // as we only return the value if we win the race below guaranteeing we had no race during our + // read. If we lose the race then 'x' could be corrupt due to read-during-write race but as T + // is trivially destructible this does not matter. + T tmp = self.m_queue->m_buf.load(top); + + static_assert(std::is_trivially_destructible_v, "'atomicable' should guarantee this already"); + + if (!self.m_queue->m_top.compare_exchange_strong(top, top + 1, seq_cst, relaxed)) { + return {.code = err::lost, .val = {}}; + } + return {.code = err::none, .val = tmp}; + } + return {.code = err::empty, .val = {}}; + } + + private: + deque *m_queue; + }; + + /** + * @brief Construct a new empty deque object. + * + * @param cap The capacity of the deque (will be rounded to the next power of two). + * @param alloc Allocator used to allocate the internal buffer. + */ + constexpr explicit deque(size_type cap, Allocator const &alloc = Allocator()) + : m_buf(round_capacity(cap), alloc) {} + + /** + * @brief Check if the deque is empty. + */ + [[nodiscard]] + constexpr auto empty() const noexcept -> bool { + diff_type const bottom = m_bottom.load(relaxed); + diff_type const top = m_top.load(seq_cst); + return top >= bottom; + } + /** + * @brief Get the number of elements in the deque. + */ + [[nodiscard]] + constexpr auto size() const noexcept -> size_type { + return safe_cast(ssize()); + } + /** + * @brief Get the number of elements in the deque as a signed integer. + */ + [[nodiscard]] + constexpr auto ssize() const noexcept -> diff_type { + diff_type const bottom = m_bottom.load(relaxed); + diff_type const top = m_top.load(seq_cst); + return std::max(bottom - top, diff_type{0}); + } + /** + * @brief Get the capacity of the deque.
+ */ + [[nodiscard]] + constexpr auto capacity() const noexcept -> diff_type { + return m_buf.capacity(); + } + /** + * @brief Get a non-owning `thief_handle` that can be used to steal items from the deque. + */ + constexpr auto thief() noexcept -> thief_handle { return thief_handle{this}; } + + /** + * @brief Push an item into the deque. + * + * Only the owner thread can insert an item into the deque. This will throw + * an exception if the deque is full. This returns the number of elements in + * the deque before the push. + * + * @param val Value to add to the deque. + */ + constexpr auto push(T val) -> diff_type { + + diff_type const bottom = m_bottom.load(relaxed); + diff_type const top = m_top.load(acquire); + diff_type const ssize = bottom - top; + + if (m_buf.capacity() < ssize + 1) { + LF_THROW(deque_full_error{}); + } + + // Construct new object, this does not have to be atomic as no one can steal + // this item until after we store the new value of bottom, ordering is + // maintained by surrounding atomics. + m_buf.store(bottom, val); + + std::atomic_thread_fence(release); + m_bottom.store(bottom + 1, relaxed); + + // This was the size just before the push, upon return the size could be any + // smaller number, down to zero, as stealers could have stolen all the + // tasks. + return ssize; + } + + /** + * @brief Pop an item from the deque. + * + * Only the owner thread can pop out an item from the deque. If the buffer is + * empty calls `when_empty` and returns the result. By default, `when_empty` + * is a no-op that returns a null `std::optional`. + */ + template > + requires std::convertible_to> + constexpr auto + pop(Fn &&when_empty = {}) noexcept(std::is_nothrow_invocable_v) -> std::invoke_result_t { + + diff_type const bottom = m_bottom.load(relaxed) - 1; // + m_bottom.store(bottom, relaxed); // Stealers can no longer steal. 
+ + std::atomic_thread_fence(seq_cst); + + diff_type top = m_top.load(relaxed); + + if (top <= bottom) { + // Non-empty deque + + // This load is not required to be atomic as we are the exclusive writer. + T val = m_buf.load(bottom); + + if (top == bottom) { + // The last item could get stolen, by a stealer that loaded bottom before our write above. + if (!m_top.compare_exchange_strong(top, top + 1, seq_cst, relaxed)) { + // Failed race, thief got the last item. + m_bottom.store(bottom + 1, relaxed); + return std::invoke(std::forward(when_empty)); + } + m_bottom.store(bottom + 1, relaxed); + } + return val; + } + m_bottom.store(bottom + 1, relaxed); + return std::invoke(std::forward(when_empty)); + } + + private: + alignas(k_cache_line) atomic_ring_buf m_buf; + alignas(k_cache_line) std::atomic m_top{0}; + alignas(k_cache_line) std::atomic m_bottom{0}; + + // Convenience aliases. + static constexpr std::memory_order relaxed = std::memory_order_relaxed; + static constexpr std::memory_order consume = std::memory_order_consume; + static constexpr std::memory_order acquire = std::memory_order_acquire; + static constexpr std::memory_order release = std::memory_order_release; + static constexpr std::memory_order seq_cst = std::memory_order_seq_cst; + + /** + * @brief Round `cap` up to the next power of two as a `diff_type`. 
+ */ + static constexpr auto round_capacity(size_type cap) -> diff_type { + constexpr auto max_cap = std::bit_floor(safe_cast(std::numeric_limits::max())); + LF_ASSUME(0 < cap && cap <= max_cap); + return safe_cast(std::bit_ceil(cap)); + } +}; + +} // namespace lf diff --git a/src/batteries/geometric_stack.cxx b/src/batteries/geometric_stack.cxx new file mode 100644 index 000000000..c34648442 --- /dev/null +++ b/src/batteries/geometric_stack.cxx @@ -0,0 +1,449 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:geometric_stack; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief A geometric_stack is a user-space (geometric-growth) segmented program stack. + * + * This protects against hot-splitting by keeping a single cached segment. + * + * For this to conform to `worker_stack` the allocators void pointer type must be `void *` + */ +export template Allocator = std::allocator> +class geometric_stack { + + struct ctrl; + struct node; + + using ctrl_traits = std::allocator_traits::template rebind_traits; + using node_traits = std::allocator_traits::template rebind_traits; + + using ctrl_ptr = ctrl_traits::pointer; + using node_ptr = node_traits::pointer; + + using void_ptr = node_traits::void_pointer; + + using size_type = node_traits::size_type; + using diff_type = node_traits::difference_type; + + struct release_t { + explicit constexpr release_t(key_t) noexcept {} + }; + + class checkpoint_t { + public: + constexpr checkpoint_t() noexcept = default; + constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default; + + private: + friend geometric_stack; + explicit constexpr checkpoint_t(ctrl_ptr ptr) noexcept + : m_ctrl(ptr) {} + ctrl_ptr m_ctrl = nullptr; + }; + + public: + constexpr geometric_stack() noexcept(noexcept(Allocator())) + : geometric_stack(Allocator()) {} + explicit constexpr 
geometric_stack(Allocator const &alloc) noexcept + : m_ctrl_alloc(alloc) {} + + constexpr geometric_stack(geometric_stack const &other) = delete; + constexpr geometric_stack(geometric_stack &&other) = delete; + + constexpr auto operator=(geometric_stack const &other) -> geometric_stack & = delete; + constexpr auto operator=(geometric_stack &&other) -> geometric_stack & = delete; + + constexpr ~geometric_stack() noexcept { + LF_ASSUME(empty()); + delete_ctrl(m_ctrl); + } + + /** + * @brief Test if the stack is empty (all pushes have been popped). + */ + [[nodiscard]] + constexpr auto empty() const noexcept -> bool { + + if (m_ctrl != nullptr) { + LF_ASSUME(m_ctrl->top != nullptr); + } else { + return true; + } + + if (m_ctrl->top->prev != nullptr) { + return false; + } + + return m_sp == m_lo; + } + + /** + * @brief Get a checkpoint of the stack that can be used to acquire it from another stack allocator. + */ + [[nodiscard]] + constexpr auto checkpoint() noexcept -> checkpoint_t { + return checkpoint_t{m_ctrl}; + } + + /** + * @brief Allocate size bytes on the stack and return a pointer to the base of the allocation. + */ + [[nodiscard]] + constexpr auto push(std::size_t size) -> void_ptr { + // Zero sized pushed are an error + LF_ASSUME(size > 0); + + // Very careful math to avoid superfluous instructions on this (very) hot path. + diff_type push_bytes = safe_cast(round_to_multiple(size)); + + constexpr diff_type node_size = sizeof(node); + + LF_ASSUME(push_bytes >= node_size); + LF_ASSUME(push_bytes % node_size == 0); + + // Optimized to just the subtraction because multiplication cancels the implicit division. + diff_type free_bytes = node_size * (m_hi - m_sp); + + if (push_bytes > free_bytes) [[unlikely]] { + return push_cached(push_bytes); + } + + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); + + // Compiler should optimize this division away when it fuses it with the + // implicit multiplication in the pointer arithmetic below. 
+ diff_type num_nodes = push_bytes / node_size; + + // node_ptr -> void_ptr + return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + /** + * @brief Deallocate the most recent allocation of size bytes at ptr, which + * must be the most recent allocation returned by push and not yet popped. + */ + constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t n) noexcept { + + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); + LF_ASSUME(!empty()); + LF_ASSUME(m_sp != nullptr); + LF_ASSUME(ptr != nullptr); + + // Inverse of push: void_ptr -> node_ptr + auto sp = static_cast(ptr); + + if (m_sp == m_lo) [[unlikely]] { + return pop_shuffle(sp); + } + + m_sp = sp; + } + + [[nodiscard]] + constexpr auto prepare_release() noexcept -> release_t { + + // Guard against null release + if (m_ctrl != nullptr) { + m_ctrl->sp_cache = m_sp; + } + + return release_t{key()}; + } + + constexpr void release([[maybe_unused]] release_t) noexcept { + + // Don't delete, will be resumed from a checkpoint. + m_ctrl = nullptr; + + m_lo = nullptr; + m_sp = nullptr; + m_hi = nullptr; + } + + constexpr void acquire(checkpoint_t ckpt) noexcept { + + LF_ASSUME(empty()); + LF_ASSUME(ckpt.m_ctrl != m_ctrl); + + if (ckpt.m_ctrl == nullptr) { + return; + } + + delete_ctrl(m_ctrl); + + m_ctrl = ckpt.m_ctrl; + + if constexpr (!node_traits::is_always_equal::value) { + // Need to propagate allocator + m_ctrl_alloc = typename ctrl_traits::allocator_type{std::as_const(m_ctrl->node_alloc)}; + } + + LF_ASSUME(m_ctrl->top != nullptr); + + load_local(); + } + + private: + // ============== Types ============== // + + enum class from : char { + top, + cache, + none, + }; + + struct alignas(k_new_align) node { + node_ptr prev; // Linked list (past) + diff_type size; // Usable-size of the stacklet + }; + + struct ctrl { + [[no_unique_address]] + typename node_traits::allocator_type node_alloc; + + node_ptr top = nullptr; // Most recent stacklet i.e. the top of the stack. 
+ node_ptr cache = nullptr; // Cached (empty) stacklet for hot-split guarding. + node_ptr sp_cache = nullptr; // Cached stack pointer for this stacklet. + }; + + // ============== Members ============== // + + [[no_unique_address]] + typename ctrl_traits::allocator_type m_ctrl_alloc; + + ctrl_ptr m_ctrl = nullptr; // The control block for the stack. + + node_ptr m_lo = nullptr; // The base pointer for the current stacklet. + node_ptr m_sp = nullptr; // The stack pointer for the current stacklet. + node_ptr m_hi = nullptr; // The one-past-the-end pointer for the current stacklet. + + // ============== Methods ============== // + + /** + * @brief Make local pointers point to the current stacklet in the control block. + * + * Assumes that the control block and top stacklet are non-nullptr. + */ + template + constexpr auto load_local() noexcept -> void { + + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); + + constexpr diff_type one{1}; + + m_lo = m_ctrl->top + one; + m_hi = m_lo + m_ctrl->top->size; + + if constexpr (StackPtr == from::cache) { + m_sp = m_ctrl->sp_cache; + } else if constexpr (StackPtr == from::top) { + m_sp = m_lo; + } else { + static_assert(StackPtr == from::none); + } + } + + /** + * @brief Allocate and construct a new control block with a single stacklet of size bytes. + */ + [[nodiscard]] + constexpr auto new_ctrl(this geometric_stack &self, diff_type num_nodes) -> ctrl_ptr { + + ctrl_ptr new_ctrl = ctrl_traits::allocate(self.m_ctrl_alloc, 1); + + LF_TRY { + // Propagate ctrl allocator to control blocks node allocator. 
+ ctrl_traits::construct(self.m_ctrl_alloc, std::to_address(new_ctrl), std::as_const(self.m_ctrl_alloc)); + LF_TRY { + new_ctrl->top = new_node(new_ctrl, num_nodes); + } LF_CATCH_ALL { + // Clean up construction + ctrl_traits::destroy(self.m_ctrl_alloc, std::to_address(new_ctrl)); + LF_RETHROW; + } + } LF_CATCH_ALL { + // Clean up allocation + ctrl_traits::deallocate(self.m_ctrl_alloc, new_ctrl, 1); + LF_RETHROW; + } + + return new_ctrl; + } + + /** + * @brief Clean and delete the control block and all stacklets. + */ + constexpr void delete_ctrl(this geometric_stack &self, ctrl_ptr ctrl) noexcept { + if (ctrl != nullptr) { + LF_ASSUME(ctrl->top != nullptr); + LF_ASSUME(ctrl->top->prev == nullptr); + + // Clea-up stacklets + delete_node(ctrl, ctrl->top); + delete_node(ctrl, ctrl->cache); + + // Finally delete the control block. + ctrl_traits::destroy(self.m_ctrl_alloc, std::to_address(ctrl)); + ctrl_traits::deallocate(self.m_ctrl_alloc, ctrl, 1); + } + } + + /** + * @brief Allocate node with size bytes for stacklet. + * + * This function is strongly exception-safe. + */ + [[nodiscard]] + static constexpr auto new_node(ctrl_ptr ctrl, diff_type num_nodes) -> node_ptr { + + // Allocation should be a multiple of the node size + LF_ASSUME(num_nodes > 0); + LF_ASSUME(ctrl != nullptr); + + // Allocation/deallocation requires size_type, +1 for the header node + size_type allocate_nodes = 1 + safe_cast(num_nodes); + + node_ptr next_node = node_traits::allocate(ctrl->node_alloc, allocate_nodes); + + LF_TRY { + // Construct the header + node_traits::construct(ctrl->node_alloc, std::to_address(next_node), nullptr, num_nodes); + } LF_CATCH_ALL { + node_traits::deallocate(ctrl->node_alloc, next_node, allocate_nodes); + LF_RETHROW; + } + + return next_node; + } + + /** + * @brief Delete a (possibly null) node and it's associated stacklet. 
+ */ + static constexpr auto delete_node(ctrl_ptr ctrl, node_ptr ptr) noexcept -> void { + if (ptr != nullptr) { + LF_ASSUME(ctrl != nullptr); + // Size doesn't include the header node so we +1 here. + size_type allocated_nodes = 1 + safe_cast(ptr->size); + node_traits::destroy(ctrl->node_alloc, std::to_address(ptr)); + node_traits::deallocate(ctrl->node_alloc, ptr, allocated_nodes); + } + } + + [[nodiscard]] + constexpr auto push_cached(diff_type push_bytes) -> void_ptr { + + // Have to be very careful in this function to be strongly exception-safe! + + constexpr diff_type node_size = sizeof(node); + + LF_ASSUME(push_bytes >= node_size); + LF_ASSUME(push_bytes % node_size == 0); + + diff_type num_nodes = safe_cast(push_bytes / node_size); + + LF_ASSUME(num_nodes > 0); + + if (m_ctrl == nullptr) { + // Initial stacklet wants to be quite large + constexpr diff_type min_nodes = (k_page_size / sizeof(node)) - 1; + + m_ctrl = new_ctrl(std::max(min_nodes, num_nodes)); + + // Local copies of the new top + load_local(); + // Do the allocation. + return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + LF_ASSUME(m_ctrl->top != nullptr); + + if (m_ctrl->cache != nullptr && m_ctrl->cache->size >= num_nodes) { + + // We have space in the cache. No allocations on this path, nothing cam throw. + + if (m_sp == m_lo) { + // There is nothing allocated on the current stacklet/top but it doesn't + // have enough space hence, we need to delete top such that we don't end up + // with an empty stacklet in the chain. This would break deletion otherwise. + node_ptr empty_top = m_ctrl->top; + m_ctrl->top = m_ctrl->top->prev; // top could be null now + delete_node(m_ctrl, empty_top); + } + + // Shuffle cache to the top. + m_ctrl->cache->prev = m_ctrl->top; + m_ctrl->top = m_ctrl->cache; + m_ctrl->cache = nullptr; + + // Local copies of the new top + load_local(); + // Do the allocation. 
+ return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + // We need to allocate a new stacklet to fit this allocation, we choose to + // grow geometrically to try to avoid too many allocations. Fine if this + // throws + node_ptr new_top = new_node(m_ctrl, std::max(num_nodes, 2 * m_ctrl->top->size)); + + // Nothing can throw after this point + + // We didn't use the cache because it wasn't big enough, we should delete it + // now because we had to grow the stack. We couldn't do this until now because + // new_node may have thrown. + delete_node(m_ctrl, std::exchange(m_ctrl->cache, nullptr)); + + if (m_sp == m_lo) { + // There is nothing allocated on the current stacklet/top but it doesn't + // have enough space hence, we need to delete top such that we don't end up + // with an empty stacklet in the chain. This would break deletion otherwise. + node_ptr empty_top = m_ctrl->top; + m_ctrl->top = m_ctrl->top->prev; // top could be null now + delete_node(m_ctrl, empty_top); + } + + // Commit the new/node + new_top->prev = m_ctrl->top; + m_ctrl->top = new_top; + + // Local copies of the new top + load_local(); + // Do the allocation. 
+ return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + constexpr void pop_shuffle(node_ptr sp) noexcept { + + // Shuffle top/cache + LF_ASSUME(!empty()); + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); // Pop from empty stack + LF_ASSUME(m_ctrl->top->prev != nullptr); // ^ + + // Shuffle top to cache + node_ptr old_cache = m_ctrl->cache; + m_ctrl->cache = m_ctrl->top; + delete_node(m_ctrl, old_cache); + + // Go back one stacklet + m_ctrl->top = m_ctrl->top->prev; + + // Local copies of the new top + load_local(); + m_sp = sp; + } +}; + +} // namespace lf diff --git a/src/batteries/slab_stack.cxx b/src/batteries/slab_stack.cxx new file mode 100644 index 000000000..7145b8336 --- /dev/null +++ b/src/batteries/slab_stack.cxx @@ -0,0 +1,236 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:slab_stack; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief A slab_stack is a user-space stack backed by a single fixed-size slab of memory. + * + * For this to conform to `worker_stack` the allocators void pointer type must be `void *` + */ +export template Allocator = std::allocator> +class slab_stack { + + struct node; + + using node_traits = std::allocator_traits::template rebind_traits; + using node_alloc_t = node_traits::allocator_type; + + using node_ptr = node_traits::pointer; + using void_ptr = node_traits::void_pointer; + + using size_type = node_traits::size_type; + using diff_type = node_traits::difference_type; + + struct alignas(k_new_align) node { + [[no_unique_address]] + node_alloc_t node_alloc; // Propagated to new owners on acquire. + node_ptr sp_cache; // Stack pointer saved across release/acquire. + diff_type size; // Usable node count following this header. 
+ }; + + static constexpr diff_type k_default_nodes = safe_cast(4 * k_page_size / sizeof(node)) - 1; + + static_assert(k_default_nodes > 0); + + struct release_t { + explicit constexpr release_t(key_t) noexcept {} + }; + + class checkpoint_t { + public: + constexpr checkpoint_t() noexcept = default; + constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default; + + private: + friend slab_stack; + explicit constexpr checkpoint_t(node_ptr ptr) noexcept + : m_ctrl(ptr) {} + node_ptr m_ctrl = nullptr; + }; + + public: + constexpr slab_stack() + : slab_stack(k_default_nodes, Allocator()) {} + + // TODO: what is appropriate unit for initialisation + // TODO: remove default constructor? + + explicit constexpr slab_stack(diff_type num_nodes, Allocator const &alloc = Allocator()) + : m_alloc(alloc) { + init_slab(num_nodes); + } + + constexpr slab_stack(slab_stack const &) = delete; + constexpr slab_stack(slab_stack &&) = delete; + + constexpr auto operator=(slab_stack const &) -> slab_stack & = delete; + constexpr auto operator=(slab_stack &&) -> slab_stack & = delete; + + constexpr ~slab_stack() noexcept { + LF_ASSUME(empty()); + delete_ctrl(m_ctrl); + } + + /** + * @brief Test if the stack is empty (all pushes have been popped). + */ + [[nodiscard]] + constexpr auto empty() const noexcept -> bool { + return m_sp == m_lo; + } + + /** + * @brief Get a checkpoint of the stack for transfer to another stack instance. + */ + [[nodiscard]] + constexpr auto checkpoint() noexcept -> checkpoint_t { + return checkpoint_t{m_ctrl}; + } + + /** + * @brief Allocate size bytes on the stack and return a pointer to the base of the allocation. 
+ */ + [[nodiscard]] + constexpr auto push(std::size_t size) -> void_ptr { + LF_ASSUME(size > 0); + + constexpr diff_type node_size = sizeof(node); + + diff_type push_bytes = safe_cast(round_to_multiple(size)); + + LF_ASSUME(push_bytes >= node_size); + LF_ASSUME(push_bytes % node_size == 0); + + // Optimized to just the subtraction because multiplication cancels the implicit division. + diff_type free_bytes = node_size * (m_hi - m_sp); + + if (push_bytes > free_bytes) [[unlikely]] { + LF_THROW(std::bad_alloc{}); + } + + diff_type num_nodes = push_bytes / node_size; + + // node_ptr -> void_ptr + return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + /** + * @brief Deallocate the most recent allocation of n bytes at ptr. + */ + constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t n) noexcept { + LF_ASSUME(!empty()); + LF_ASSUME(m_sp != nullptr); + LF_ASSUME(ptr != nullptr); + + // Inverse of push: void_ptr -> node_ptr + m_sp = static_cast(ptr); + } + + /** + * @brief Make ready for a call to release(). + */ + [[nodiscard]] + constexpr auto prepare_release() noexcept -> release_t { + // Guard against null ctrl (failed prior allocation in release()). + if (m_ctrl != nullptr) { + m_ctrl->sp_cache = m_sp; + } + return release_t{key()}; + } + + constexpr void release([[maybe_unused]] release_t) noexcept { + + diff_type next_size = (m_ctrl != nullptr) ? m_ctrl->size : k_default_nodes; + + // Hand off the current slab to whoever holds the checkpoint; clear local state. + m_ctrl = nullptr; + m_lo = nullptr; + m_sp = nullptr; + m_hi = nullptr; + + // Pre-allocate a fresh slab for our next use. + + LF_TRY { + init_slab(next_size); + } LF_CATCH_ALL { + // If ^ throws, swallow the exception — push will see no space + // i.e. (m_hi - m_sp == 0) and throw instead. 
+ } + } + + constexpr void acquire(checkpoint_t ckpt) noexcept { + + LF_ASSUME(empty()); + LF_ASSUME(ckpt.m_ctrl != m_ctrl); + + if (ckpt.m_ctrl == nullptr) { + return; + } + + // Discard the fresh empty slab we prepared during release() (may be null on alloc failure). + delete_ctrl(m_ctrl); + + m_ctrl = ckpt.m_ctrl; + + if constexpr (!node_traits::is_always_equal::value) { + m_alloc = node_alloc_t{std::as_const(m_ctrl->node_alloc)}; + } + + load_local(); + } + + private: + [[no_unique_address]] + node_alloc_t m_alloc; + + node_ptr m_ctrl = nullptr; // Header node (fused ctrl+first-node of the slab). + node_ptr m_lo = nullptr; // Base of usable space (m_ctrl + 1). + node_ptr m_sp = nullptr; // Stack pointer for the current slab. + node_ptr m_hi = nullptr; // One-past-the-end of usable space. + + // Restore local pointers from the header node, taking sp from the cache. + constexpr void load_local() noexcept { + LF_ASSUME(m_ctrl != nullptr); + m_lo = m_ctrl + 1; + m_hi = m_lo + m_ctrl->size; + m_sp = m_ctrl->sp_cache; + } + + // Allocate and construct a fresh slab with num_nodes usable nodes. + constexpr void init_slab(diff_type num_nodes) { + LF_ASSUME(num_nodes > 0); + + size_type total = safe_cast(1 + num_nodes); + m_ctrl = node_traits::allocate(m_alloc, total); + + LF_TRY { + node_traits::construct(m_alloc, std::to_address(m_ctrl), m_alloc, nullptr, num_nodes); + } LF_CATCH_ALL { + node_traits::deallocate(m_alloc, m_ctrl, total); + m_ctrl = nullptr; + LF_RETHROW; + } + + m_lo = m_sp = m_ctrl + 1; + m_hi = m_lo + num_nodes; + } + + // Destroy and deallocate a slab (no-op if null). 
+ constexpr void delete_ctrl(node_ptr ctrl) noexcept { + if (ctrl != nullptr) { + size_type total = safe_cast(1 + ctrl->size); + node_traits::destroy(m_alloc, std::to_address(ctrl)); + node_traits::deallocate(m_alloc, ctrl, total); + } + } +}; + +} // namespace lf diff --git a/src/core/awaitables.cxx b/src/core/awaitables.cxx new file mode 100644 index 000000000..83cf411bf --- /dev/null +++ b/src/core/awaitables.cxx @@ -0,0 +1,239 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:awaitables; + +import std; + +import libfork.utils; + +import :concepts_context; +import :frame; +import :handles; +import :task; +import :thread_locals; +import :final_suspend; + +namespace lf { + +// =============== Fork/Call =============== // + +/** + * @brief Call inside a catch block, stash current exception in `frame`. + */ +template +constexpr void stash_current_exception(frame_type *frame) noexcept { + // No synchronization is done via exception_bit, hence we can use relaxed atomics + // and rely on the usual fork/join synchronization to ensure memory ordering. + if (frame->atomic_except().exchange(1, std::memory_order_relaxed) == 0) { + + frame->except.construct(std::current_exception()); + + // Should have been called from inside a catch block + LF_ASSUME(*frame->except != nullptr); + } +} + +/** + * @brief In a separate function to allow it to be placed in cold block. + */ +template +constexpr void +destroy_child_stash_exception(frame_t *child, coro> parent) noexcept { + // Clean-up the child that will never be resumed. + child->handle().destroy(); + // Stash in the parent's frame which will then be resumed. + stash_current_exception(&parent.promise().frame); +} + +/** + * @brief Awaitable for forking/calling an async function. 
+ */ +template +struct async_awaitable : std::suspend_always { + + static_assert(Cat == category::call || Cat == category::fork, "Invalid category for awaitable"); + + frame_t *child; + + template + constexpr auto + await_suspend(this async_awaitable self, coro> parent) noexcept -> coro<> { + + // TODO: test of having a dedicated is_stopped awaitable is quicker + + if (!self.child) [[unlikely]] { + // Noop if an exception was thrown. + return parent; + } + + if (self.child->stop_requested()) [[unlikely]] { + // Noop if stopped, must clean-up the child that will never be resumed. + return self.child->handle().destroy(), parent; + } + + // Propagate parent->child relationships + self.child->parent = &parent.promise().frame; + + if constexpr (Cat == category::call) { + // Should be the default + LF_ASSUME(self.child->kind == category::call); + } else { + self.child->kind = Cat; + } + + if constexpr (Cat == category::fork) { + // It is critical to pass self by-value here, after the call to push() + // the object `*this` may be destroyed, if passing by ref it would be + // use-after-free to then access self in the following line to fetch the + // handle. + LF_TRY { + get_tls_context().push(steal_handle{key(), &parent.promise().frame}); + } LF_CATCH_ALL { + return destroy_child_stash_exception(self.child, parent), parent; + } + } + + return self.child->handle(); + } +}; + +// =============== Join =============== // + +template +struct join_awaitable { + + frame_t *frame; + + constexpr auto await_ready(this join_awaitable self) noexcept -> bool { + if (not_null(self.frame)->steals == 0) [[likely]] { + if (self.frame->stop_requested()) [[unlikely]] { + // Must unconditionally suspended if stopped + return false; + } + // If no steals then we are the only owner of the parent and we are + // ready to join. Therefore, no need to reset the control block. 
+ return true; + } + return false; + } + + constexpr auto await_suspend(this join_awaitable self, std::coroutine_handle<> task) noexcept -> coro<> { + // Currently self.joins = k_u16_max - num_joined + // + // We set joins = self->joins - (k_u16_max - num_steals) + // = num_steals - num_joined + // + // Hence joined = k_u16_max - num_joined + // k_u16_max - joined = num_joined + + // Lemma: + // + // If a thread is at a join and steals have occurred then the + // thread can never own the stack of the current frame. + // + // This is because threads follow the work-first principle, so for the + // owner to be running this task it would have to have re-stolen it from a + // thief. Which implies it would have run the final suspend of the child + // that had it's continuation stolen, where it would have had to release + // the stack, because the parent was at not at the join. + + LF_ASSUME(self.frame); + + std::uint32_t steals = self.frame->steals; + std::uint32_t offset = k_u16_max - steals; + std::uint32_t joined = self.frame->atomic_joins().fetch_sub(offset, std::memory_order_release); + + // If this was a stop: + // + // steals = 0, joins = k_u16_max then: + // + // steals = 0 + // offset = k_u16_max + // joined = k_u16_max, (self.frame->joins is now 0) + // + // k_u16_max - joined = 0 = steals, hence win the if + + if (steals == k_u16_max - joined) { + // We set joins after all children had completed therefore we can resume the task. + // Need to acquire to ensure we see all writes by other threads to the result. + std::atomic_thread_fence(std::memory_order_acquire); + + if (self.frame->stop_requested()) [[unlikely]] { + return self.handle_stop(); + } + + // We must reset the control block and take the stack. We should never + // own the stack at this point because we must have stolen the stack. + self.take_stack(); + self.frame->reset_counters(); + return task; + } + // Someone else is responsible for running this task. 
+ + // We cannot touch *this or dereference self as someone may have resumed already! + // We cannot currently own this stack (checking would violate above). + + // If no explicit scheduling then we must have an empty WSQ as we stole this task. + + // If explicit scheduling then we may have tasks on our WSQ if we performed a self-steal + // in a switch awaitable. In this case we can/must do another self-steal. + + // return try_self_stealing(); + + return std::noop_coroutine(); + } + + constexpr void await_resume(this join_awaitable self) { + // We should have been reset + LF_ASSUME(self.frame->steals == 0); + LF_ASSUME(self.frame->joins == k_u16_max); + + // Outside parallel regions so can touch non-atomically. + // + // A task that completes by responding to cancellation will drop any + // exceptions however, a task may still throw exceptions even if cancelled. + // Here we must rethrow even if cancelled because we can't re-suspend at + // this point. + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (self.frame->exception_bit) [[unlikely]] { + self.rethrow_exception(); + } + } + + LF_ASSUME(self.frame->exception_bit == 0); + } + + constexpr auto take_stack(this join_awaitable self) noexcept -> void { + stack_t &stack = get_tls_stack(); + LF_ASSUME(self.frame->stack_ckpt != stack.checkpoint()); + stack.acquire(std::as_const(self.frame->stack_ckpt)); + } + + [[nodiscard]] + constexpr auto handle_stop(this join_awaitable self) noexcept -> coro<> { + // Only need to take the stack if there were steals + if (self.frame->steals > 0) { + self.take_stack(); + } + + // We always need to reset the connters as we modified + self.frame->reset_counters(); + + // Drop any exceptions in the now-stopped task + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (self.frame->exception_bit) [[unlikely]] { + std::ignore = extract_exception(self.frame); + } + } + + return final_suspend_leading(self.frame); + } + + [[noreturn]] + constexpr void rethrow_exception(this join_awaitable self) { + 
std::rethrow_exception(extract_exception(self.frame)); + } +}; + +} // namespace lf diff --git a/src/core/concepts/context.cxx b/src/core/concepts/context.cxx new file mode 100644 index 000000000..359da3ae0 --- /dev/null +++ b/src/core/concepts/context.cxx @@ -0,0 +1,49 @@ +export module libfork.core:concepts_context; + +import std; + +import libfork.utils; + +import :concepts_stack; +import :handles; + +namespace lf { + +template +concept ref_to_worker_stack = std::is_lvalue_reference_v && worker_stack>; + +/** + * @brief Specifies that a type acts as a LIFO stack over U. + */ +export template +concept lifo_stack = plain_object && requires (T context, U val) { + { context.push(val) } -> std::same_as; + { context.pop() } noexcept -> std::same_as; +}; + +/** + * @brief Defines the API for a libfork compatible worker context. + * + * This requires that `T` is an object type and supports the following operations: + * + * - Push/pop a steal handle onto the context in a LIFO manner. + * - Have a `worker_stack` that can be accessed via `stack()`. + */ +export template +concept worker_context = lifo_stack> && requires (T context) { + { context.stack() } noexcept -> ref_to_worker_stack; +}; + +/** + * @brief Fetch the stack type of a worker context `T`. + */ +export template +using stack_t = std::remove_reference_t().stack())>; + +/** + * @brief Fetch the checkpoint type of a worker context `T`. 
+ */ +template +using checkpoint_t = decltype(std::declval &>().checkpoint()); + +} // namespace lf diff --git a/src/core/concepts/invocable.cxx b/src/core/concepts/invocable.cxx new file mode 100644 index 000000000..c6da4b016 --- /dev/null +++ b/src/core/concepts/invocable.cxx @@ -0,0 +1,66 @@ +module; +#include "libfork/__impl/utils.hpp" +export module libfork.core:concepts_invocable; + +import std; + +import libfork.utils; + +import :task; +import :concepts_context; + +namespace lf { + +template +struct ctx_invoke_t { + // Explicitly constrained so overload resolution selects prefers + template + requires std::invocable, Args...> + static constexpr auto operator()(Fn &&fn, Args &&...args) + LF_HOF(std::invoke(std::forward(fn), env{key()}, std::forward(args)...)) + + template + static constexpr auto operator()(Fn &&fn, Args &&...args) + LF_HOF(std::invoke(std::forward(fn), std::forward(args)...)) +}; + +template +concept task_from = specialization_of && std::same_as; + +/** + * @brief Test if a callable `Fn` when invoked with `Args...` returns an `lf::task`. + */ +export template +concept async_invocable = worker_context && // + std::invocable, Fn, Args...> && // + task_from, Fn, Args...>>; // + +/** + * @brief Subsumes `async_invocable` and checks that the invocation is `noexcept`. + */ +export template +concept async_nothrow_invocable = + async_invocable && std::is_nothrow_invocable_v, Fn, Args...>; + +/** + * @brief The result type of invoking an async function `Fn` with `Args...`. + */ +export template + requires async_invocable +using async_result_t = std::invoke_result_t, Fn, Args...>::value_type; + +/** + * @brief Subsumes `async_invocable` and checks the result type is `R`. + */ +export template +concept async_invocable_to = + async_invocable && std::same_as>; + +/** + * @brief Subsumes `async_nothrow_invocable` and `async_invocable_to`. 
+ */ +export template +concept async_nothrow_invocable_to = + async_nothrow_invocable && async_invocable_to; + +} // namespace lf diff --git a/src/core/concepts/scheduler.cxx b/src/core/concepts/scheduler.cxx new file mode 100644 index 000000000..bd081ce6d --- /dev/null +++ b/src/core/concepts/scheduler.cxx @@ -0,0 +1,29 @@ +export module libfork.core:concepts_scheduler; + +import std; + +import :handles; + +namespace lf { + +export template +concept has_context_typedef = requires { typename std::remove_cvref_t::context_type; }; + +export template +using context_t = typename std::remove_cvref_t::context_type; + +/** + * @brief An object capable of scheduling a libfork task for execution. + * + * These are typed to a context, the `post` method must: + * + * - Satisfy the strong exception guarantee. + * - Guarantee eventual execution of the task associated with `handle`. + */ +export template +concept scheduler = + has_context_typedef && requires (Sch &&scheduler, sched_handle> handle) { + { static_cast(scheduler).post(handle) } -> std::same_as; + }; + +} // namespace lf diff --git a/src/core/concepts/stack.cxx b/src/core/concepts/stack.cxx new file mode 100644 index 000000000..a85efab37 --- /dev/null +++ b/src/core/concepts/stack.cxx @@ -0,0 +1,52 @@ +export module libfork.core:concepts_stack; + +import std; + +import libfork.utils; + +namespace lf { + +template + requires std::is_object_v +consteval auto constify(T &&x) noexcept -> std::add_const_t &; + +/** + * @brief Defines the API for a libfork compatible stack. + * + * - After construction `this` is empty and push is valid. + * - Pop is valid provided the FILO order is respected. + * - Push produces pointers aligned to __STDCPP_DEFAULT_NEW_ALIGNMENT__. + * - Destruction is expected to only occur when the stack is empty. + * - Result of `.checkpoint()` is expected to: + * - Be "cheap to copy". + * - Have a null state (default constructed) that only compares equal to itself. 
+ * - Is allowed to return null if push has never been called. + * - Compare equal if and only if they are both null or they allocate from the same stack. + * - Have no preconditions about when it's called. + * - Prepare release puts the stack into a state which another thread can acquire it. + * - Release detaches the current stack and leaves `this` empty. + * - This may be called concurrently with acquire + * - Acquire attaches to the stack that the checkpoint came from: + * - It is only called the stack is empty. + * - It is only called with a checkpoint not equal to the current checkpoint. + * - It is called after prepare release (and no other functions in between) + * + * Fast-path operations: empty, push, pop, checkpoint + * Slow-path operations: release, acquire + */ +export template +concept worker_stack = plain_object && requires (T stack, std::size_t n, void *ptr) { + { stack.push(n) } -> std::same_as; + { stack.pop(ptr, n) } noexcept -> std::same_as; + { stack.checkpoint() } noexcept -> std::regular; + { stack.prepare_release() } noexcept -> std::movable; + { stack.release(stack.prepare_release()) } noexcept -> std::same_as; + { stack.acquire(constify(stack.checkpoint())) } noexcept -> std::same_as; +}; + +// TODO: Allocator aware stack + +// export template +// concept aa_worker_stack = worker_stack && true; + +} // namespace lf diff --git a/src/core/core.cxx b/src/core/core.cxx new file mode 100644 index 000000000..3d983b95c --- /dev/null +++ b/src/core/core.cxx @@ -0,0 +1,26 @@ +export module libfork.core; + +// This module contains the core components of libfork, this includes: +// +// task/promise +// schedule +// concepts +// polymorphic context ABC + +export import :concepts_invocable; +export import :concepts_scheduler; +export import :concepts_context; +export import :concepts_stack; +export import :frame; +export import :task; +export import :thread_locals; +export import :poly_context; +export import :ops; +export import :handles; +export 
import :promise; +export import :schedule; +export import :root; +export import :execute; +export import :receiver; +export import :stop; +export import :exception; diff --git a/src/core/exception.cxx b/src/core/exception.cxx new file mode 100644 index 000000000..102447051 --- /dev/null +++ b/src/core/exception.cxx @@ -0,0 +1,12 @@ +export module libfork.core:exception; + +import std; + +namespace lf { + +/** + * @brief Base class for all libfork exceptions. + */ +export struct libfork_exception : std::exception {}; + +} // namespace lf diff --git a/src/core/execute.cxx b/src/core/execute.cxx new file mode 100644 index 000000000..5eb503b0c --- /dev/null +++ b/src/core/execute.cxx @@ -0,0 +1,78 @@ + +module; +#include "libfork/__impl/assume.hpp" +export module libfork.core:execute; + +import std; + +import :frame; +import :thread_locals; +import :concepts_context; +import :handles; +import :exception; + +namespace lf { + +export struct execute_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "execute called from within a worker thread!"; + } +}; + +/** + * @brief Bind this thread to a context and execute the scheduled tasks on that context/thread. + * + * This should not be called from a thread already bound to a context, once this call returns + * the thread is unbound from the context. 
+ */ +export template +constexpr void execute(Context &context, sched_handle handle) { + + if (thread_local_context != nullptr) { + LF_THROW(execute_error{}); + } + + thread_local_context = std::addressof(context); + + defer _ = [] static noexcept -> void { + thread_local_context = nullptr; + }; + + auto *frame = static_cast> *>(get(key(), handle)); + + frame->handle().resume(); +} + +export struct steal_overflow_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "a single task has been stolen 65,535 times"; + } +}; + +export template +constexpr void execute(Context &context, steal_handle handle) { + + if (thread_local_context != nullptr) { + LF_THROW(execute_error{}); + } + + thread_local_context = std::addressof(context); + + defer _ = [] static noexcept -> void { + thread_local_context = nullptr; + }; + + auto *frame = static_cast> *>(get(key(), handle)); + + // TODO: bench if we should do this in debug only + if (frame->steals == k_u16_max) { + LF_THROW(steal_overflow_error{}); + } + + frame->steals += 1; + frame->handle().resume(); +} + +} // namespace lf diff --git a/src/core/final_suspend.cxx b/src/core/final_suspend.cxx new file mode 100644 index 000000000..db6c91fd3 --- /dev/null +++ b/src/core/final_suspend.cxx @@ -0,0 +1,243 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +export module libfork.core:final_suspend; + +import std; + +import libfork.utils; + +import :concepts_context; +import :frame; +import :handles; +import :thread_locals; + +namespace lf { + +template +using coro = std::coroutine_handle; + +template +using frame_t = frame_type>; + +// =============== Extract exception =============== // + +/** + * @brief Pull an exception out of a frame and clean-up the union/allocation. 
+ */ +template +[[nodiscard]] +constexpr auto extract_exception(frame_type *frame) noexcept -> std::exception_ptr { + + LF_ASSUME(frame->exception_bit); // Should only be called if an exception was thrown. + + // Local copy + std::exception_ptr except = std::move(*frame->except); + + // Should have been set by stash_current_exception + LF_ASSUME(except != nullptr); + + // Clean-up exception state + frame->exception_bit = 0; + frame->except.destroy(); + + return except; // NRVO +} + +// =============== Final =============== // + +/** + * @brief The full final suspend logic. + * + * The final suspend logic is fully expressed in this function in brief: + * + * - Try to resume parent if a call. + * - Try to resume parent if a fork with no stealing. + * - Try to resume a stolen forked task if last to complete. + * + * This function also handles cancellation (of the parent) by iteratively + * climbing up the parent chain. + * + * This function is split and repeated as two separate functions to allow the + * hot-path code to be inlined more easily into the final suspend. + */ +template +[[nodiscard]] +constexpr auto final_suspend_full(Context &context, frame_t *frame) noexcept -> coro<> { + for (;;) { + // Validate final state + LF_ASSUME(frame); + LF_ASSUME(frame->kind != category::root); + LF_ASSUME(frame->steals == 0); + LF_ASSUME(frame->joins == k_u16_max); + LF_ASSUME(frame->exception_bit == 0); + + // Local copies (before we destroy frame) + category const kind = frame->kind; + + frame_t *parent = not_null(frame->parent); + + // Before resuming the next (or exiting) we should clean-up the current frame. + // Can't use frame from this point onwards + frame->handle().destroy(); + + if (kind == category::call) { + return parent->handle(); + } + + // Given we are not a call we must be a fork hence, our + // parent can't be a root as they can only call. 
+ LF_ASSUME(kind == category::fork); + LF_ASSUME(parent->kind != category::root); + + if (steal_handle last_pushed = context.pop()) { + // No-one stole continuation, we are the exclusive owner of parent -> just keep ripping! + LF_ASSUME(last_pushed == steal_handle{key(), parent}); + // This is not a join point so no state (i.e. counters) is guaranteed. + return parent->handle(); + } + + // An owner is a worker who: + // + // - Created the task. + // - OR had the task submitted to them. + // - OR won the task at a join. + // + // An owner of a task owns the stack the task is on. + // + // As the worker who completed the child task this thread owns the stack the child task was on. + // + // Either: + // + // 1. The parent is on the same stack as the child. + // 2. OR the parent is on a different stack to the child. + // + // Case (1) implies: we owned the parent; forked the child task; then the parent was then stolen. + // Case (2) implies: we stole the parent task; then forked the child; then the parent was stolen. + // + // Case (2) implies that our stack is empty. + + // As soon as we do the `fetch_sub` below the parent task is no longer safe + // to access as it may be resumed and then destroyed by another thread. Hence + // we must make copies on-the-stack of any data we may need if we lose the + // join race. + bool const owner = parent->stack_ckpt == context.stack().checkpoint(); + + // As soon as we do the fetch_sub (if we loose) someone may acquire + // the stack so we must prepare it for release now. + auto release_key = context.stack().prepare_release(); + + // Register with parent we have completed this child task. + if (parent->atomic_joins().fetch_sub(1, std::memory_order_release) == 1) { + // Parent has reached join and we are the last child task to complete. We + // are the exclusive owner of the parent and therefore, we must continue + // parent. As we won the race, acquire all writes before resuming. 
+ std::atomic_thread_fence(std::memory_order_acquire); + + if (!owner) { + // In case of scenario (2) we must acquire the parent's stack. + context.stack().acquire(std::as_const(parent->stack_ckpt)); + } + + // Must reset parent's control block before resuming parent. + parent->reset_counters(); + + if (parent->stop_requested()) [[unlikely]] { + // Don't resume if stopped + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (parent->exception_bit) [[unlikely]] { + std::ignore = extract_exception(parent); + } + } + frame = parent; + continue; + } + + return parent->handle(); + } + + if (owner) { + // We were unable to resume the parent and we were its owner, as the + // resuming thread will take ownership of the parent's we must give it up. + context.stack().release(std::move(release_key)); + } + + // We did not win the join-race, we cannot dereference the parent pointer now + // as the frame may now be freed by the winner. Parent has not reached join + // or we are not the last child to complete. We are now out of jobs, we must + // yield to the executor. + + // Else, case (2), our stack has no allocations on it, it may be used later. 
+ return std::noop_coroutine(); + } +} + +template +[[nodiscard]] +constexpr auto final_suspend_trailing(Context &context, frame_t *parent) noexcept -> coro<> { + + bool const owner = parent->stack_ckpt == context.stack().checkpoint(); + + auto release_key = context.stack().prepare_release(); + + if (parent->atomic_joins().fetch_sub(1, std::memory_order_release) == 1) { + + std::atomic_thread_fence(std::memory_order_acquire); + + if (!owner) { + context.stack().acquire(std::as_const(parent->stack_ckpt)); + } + + parent->reset_counters(); + + if (parent->stop_requested()) [[unlikely]] { + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (parent->exception_bit) [[unlikely]] { + std::ignore = extract_exception(parent); + } + } + return final_suspend_full(context, parent); + } + + return parent->handle(); + } + + if (owner) { + context.stack().release(std::move(release_key)); + } + + return std::noop_coroutine(); +} + +template +[[nodiscard]] +constexpr auto final_suspend_leading(frame_t *frame) noexcept -> coro<> { + + LF_ASSUME(frame); + LF_ASSUME(frame->steals == 0); + LF_ASSUME(frame->joins == k_u16_max); + LF_ASSUME(frame->exception_bit == 0); + + category const kind = frame->kind; + + frame_t *parent = not_null(frame->parent); + + frame->handle().destroy(); + + if (kind == category::call) { + return parent->handle(); + } + + LF_ASSUME(kind == category::fork); + + Context &context = get_tls_context(); + + if (steal_handle last_pushed = context.pop()) { + LF_ASSUME(last_pushed == steal_handle{key(), parent}); + return parent->handle(); + } + + return final_suspend_trailing(context, parent); +} + +} // namespace lf diff --git a/src/core/frame.cxx b/src/core/frame.cxx new file mode 100644 index 000000000..3a12696e2 --- /dev/null +++ b/src/core/frame.cxx @@ -0,0 +1,80 @@ +module; +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/utils.hpp" +export module libfork.core:frame; + +import std; + +import libfork.utils; + +import :stop; + +namespace lf { + +enum 
class category : std::uint8_t { + call = 0, + fork, + root, +}; + +struct frame_base {}; + +// TODO: make everything (deque etc) allocator aware... + +template +struct frame_type : frame_base { + + // == Member variables == // + + // TODO: add checked accessors for all the things (including except etc) + + // Only set if an exception is thrown, otherwise uninit + uninitialized except; + + frame_type *parent; + stop_source::stop_token stop_token; + + [[no_unique_address]] + Checkpoint stack_ckpt; + + ATOMIC_ALIGN(std::uint32_t) joins = 0; // Atomic is 32 bits for speed + std::uint16_t steals = 0; // In debug do overflow checking + category kind = static_cast(0); // Fork/Call + ATOMIC_ALIGN(std::uint8_t) exception_bit = 0; // Atomically set + + // == Member functions == // + + // Explicitly post construction, this allows the compiler to emit a single + // instruction for the zero init then an instruction for the joins init, + // instead of three instructions. + explicit constexpr frame_type(Checkpoint &&ckpt) noexcept(std::is_nothrow_move_constructible_v) + : stack_ckpt(std::move(ckpt)) { + joins = k_u16_max; + } + + [[nodiscard]] + constexpr auto stop_requested() const noexcept -> bool { + // TODO: Should exception trigger stop? 
+ return stop_token.stop_requested(); + } + + [[nodiscard]] + constexpr auto handle() LF_HOF(std::coroutine_handle::from_promise(*this)) + + [[nodiscard]] + constexpr auto atomic_joins() noexcept -> std::atomic_ref { + return std::atomic_ref{joins}; + } + + [[nodiscard]] + constexpr auto atomic_except() noexcept -> std::atomic_ref { + return std::atomic_ref{exception_bit}; + } + + constexpr void reset_counters() noexcept { + joins = k_u16_max; + steals = 0; + } +}; + +} // namespace lf diff --git a/src/core/handles.cxx b/src/core/handles.cxx new file mode 100644 index 000000000..ef646354b --- /dev/null +++ b/src/core/handles.cxx @@ -0,0 +1,68 @@ +export module libfork.core:handles; + +import libfork.utils; + +import :frame; + +namespace lf { + +// =================== Untyped handles =================== // + +class handle { + public: + constexpr handle() = default; + constexpr handle(key_t, frame_base *ptr) noexcept + : m_ptr{ptr} {} + constexpr auto operator==(handle const &) const noexcept -> bool = default; + constexpr explicit operator bool() const noexcept { return m_ptr != nullptr; } + + private: + [[nodiscard]] + constexpr friend auto get(key_t, handle h) noexcept -> frame_base * { + return h.m_ptr; + } + + frame_base *m_ptr = nullptr; +}; + +/** + * @brief An untyped steal-handle. + * + * For use by context policies that need to store handles in an untyped manner. + */ +export struct unsafe_steal_handle : handle { + using handle::handle; +}; + +/** + * @brief An untyped schedule-handle. + * + * For use by context policies that need to store handles in an untyped manner. + */ +export struct unsafe_sched_handle : handle { + using handle::handle; +}; + +// =================== Tagged handles =================== // + +/** + * @brief A handle to a task that can be stolen and resumed with `execute`. + * + * The coroutine behind this task is always suspended at fork point. 
+ */ +export template +struct steal_handle : unsafe_steal_handle { + using unsafe_steal_handle::unsafe_steal_handle; +}; + +/** + * @brief A handle to a task that can be resumed with `execute`. + * + * The coroutine behind this task is either not-yet-started or suspended at a context-switch. + */ +export template +struct sched_handle : unsafe_sched_handle { + using unsafe_sched_handle::unsafe_sched_handle; +}; + +} // namespace lf diff --git a/src/core/ops.cxx b/src/core/ops.cxx new file mode 100644 index 000000000..d88a09724 --- /dev/null +++ b/src/core/ops.cxx @@ -0,0 +1,228 @@ +module; +#include "libfork/__impl/utils.hpp" +export module libfork.core:ops; + +import std; + +import libfork.utils; + +import :concepts_invocable; +import :frame; +import :stop; + +namespace lf { + +// Placeholder types for absent optional fields. +struct no_stop_t {}; +struct no_ret_t {}; + +// =============== Value-or-reference storage policy =============== // + +// For rvalue-reference arguments that are trivially copyable and fit in two +// pointer-sized words, store by value inside pkg instead of keeping a reference. +// This lets [[no_unique_address]] collapse empty functors to zero bytes and +// allows the compiler to treat the stored values as local data (no aliasing). +template +concept small_trivially_copyable = !std::is_reference_v // + && std::is_trivially_copyable_v // + && sizeof(T) <= 2 * sizeof(void *) // + && alignof(T) <= alignof(std::max_align_t); // + +// Only collapses rvalue refs; lvalue refs are kept as-is to preserve reference semantics. 
+template +using store_as_t = + std::conditional_t && small_trivially_copyable>, + std::remove_cvref_t, + T>; + +// clang-format off + +template +struct [[nodiscard("You should immediately co_await this!")]] pkg { + [[no_unique_address]] std::conditional_t stop_token; + [[no_unique_address]] std::conditional_t, no_ret_t, R *> return_addr; + [[no_unique_address]] Fn fn; + [[no_unique_address]] tuple args; +}; + +// clang-format on + +/** + * @brief Forward the function member of a pkg correctly. + * + * Handles three cases: + * - rvalue reference Fn: move it. + * - lvalue reference Fn: return by reference. + * - value type Fn (small trivially-copyable stored directly): return by value. + */ +template +constexpr auto fwd_fn(auto &&fn) noexcept -> Fn { + if constexpr (std::is_rvalue_reference_v) { + return std::move(fn); + } else if constexpr (std::is_lvalue_reference_v || small_trivially_copyable) { + return fn; + } else { + static_assert(false, "Invalid Fn type in fwd_fn"); + } +} + +// =============== Join =============== // + +struct join_type {}; + +/** + * @brief Base class shared by scope_ops and child_scope_ops. + * + * Provides a member `join()` so that `co_await sc.join()` works on any scope type. 
+ */ +struct scope_base { + [[nodiscard("You should immediately co_await this!")]] + static constexpr auto join() noexcept -> join_type { + return {}; + } +}; + +// =============== Scope ops (no embedded stop source) =============== // + +template +struct scope_ops : scope_base { + private: + template + using call_pkg = pkg, store_as_t...>; + + template + using fork_pkg = pkg, store_as_t...>; + + public: + // Default constructible + scope_ops() noexcept = default; + + // Immovable + scope_ops(const scope_ops &) = delete; + scope_ops(scope_ops &&) = delete; + auto operator=(const scope_ops &) -> scope_ops & = delete; + auto operator=(scope_ops &&) -> scope_ops & = delete; + + // === Fork === // + + template Fn> + static constexpr auto fork(R *ret, Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto fork_drop(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto fork(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + + // === Call === // + + template Fn> + static constexpr auto call(R *ret, Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto call_drop(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto call(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } +}; + +// ==== Scope awaitable ==== // + +template +struct scope_awaitable : std::suspend_never { + static constexpr auto await_resume() noexcept -> scope_ops { return {}; } +}; + +struct scope_type {}; + +export 
[[nodiscard("You should immediately co_await this!")]] +constexpr auto scope() noexcept -> scope_type { + return {}; +} + +// =============== Child scope ops (with embedded stop source) =============== // + +/** + * @brief A scope that is a stop_source. + */ +template +struct child_scope_ops : scope_base, stop_source { + private: + template + using call_pkg = pkg, store_as_t...>; + + template + using fork_pkg = pkg, store_as_t...>; + + public: + /** + * @brief Construct the scope, chaining its stop source onto the parent's token. + */ + explicit constexpr child_scope_ops(stop_source::stop_token parent) noexcept + : stop_source(parent) {} + + // Immovable (stop_source base is immovable) + child_scope_ops(const child_scope_ops &) = delete; + child_scope_ops(child_scope_ops &&) = delete; + auto operator=(const child_scope_ops &) -> child_scope_ops & = delete; + auto operator=(child_scope_ops &&) -> child_scope_ops & = delete; + + // === Fork (binds this scope's stop source as child stop source) === // + + template Fn> + constexpr auto fork(R *ret, Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.stop_token = token(), .return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto fork_drop(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.stop_token = token(), .return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto fork(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.stop_token = token(), .return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + + // === Call (binds this scope's stop source as child stop source) === // + + template Fn> + constexpr auto call(R *ret, Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.stop_token = token(), .return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto call_drop(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.stop_token = token(), 
.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto call(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.stop_token = token(), .return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } +}; + +// =============== child_scope_awaitable =============== // + +template +struct child_scope_awaitable : std::suspend_never { + + stop_source::stop_token parent_stop_token; + + constexpr auto await_resume(this child_scope_awaitable self) noexcept -> child_scope_ops { + return child_scope_ops{self.parent_stop_token}; + } +}; + +struct child_scope_type {}; + +export [[nodiscard("You should immediately co_await this!")]] +constexpr auto child_scope() noexcept -> child_scope_type { + return {}; +} + +} // namespace lf diff --git a/src/core/poly_context.cxx b/src/core/poly_context.cxx new file mode 100644 index 000000000..f651d11b0 --- /dev/null +++ b/src/core/poly_context.cxx @@ -0,0 +1,58 @@ +module; +#include "libfork/__impl/exception.hpp" +export module libfork.core:poly_context; + +import std; + +import :concepts_stack; +import :handles; +import :exception; + +namespace lf { + +export template +class base_context { + public: + auto stack() noexcept -> Stack & { return m_stack; } + + protected: + constexpr base_context() = default; + + template + requires std::constructible_from + explicit(sizeof...(Args) == + 1) constexpr base_context(Args &&...args) noexcept(std::is_nothrow_constructible_v) + : m_stack(std::forward(args)...) {} + + private: + Stack m_stack; +}; + +export struct post_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "derived context does not support posting tasks."; + } +}; + +/** + * @brief A worker context polymorphic in push/pop/post. + * + * This is the canonical/blessed base class in libfork for polymorphic uses + * cases. 
Although possible, libfork does not recommend contexts polymorphic + * in the `.stack` member + */ +export template +class poly_context : public base_context { + public: + using base_context::base_context; + + virtual void push(steal_handle) = 0; + virtual auto pop() noexcept -> steal_handle = 0; + + virtual void post([[maybe_unused]] sched_handle handle) { LF_THROW(post_error{}); } + + virtual ~poly_context() noexcept = default; +}; + +} // namespace lf diff --git a/src/core/promise.cxx b/src/core/promise.cxx new file mode 100644 index 000000000..3ed44def8 --- /dev/null +++ b/src/core/promise.cxx @@ -0,0 +1,228 @@ +module; +#include + +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +#include "libfork/__impl/utils.hpp" +export module libfork.core:promise; + +import std; + +import libfork.utils; + +import :concepts_context; +import :concepts_invocable; +import :frame; +import :stop; +import :task; +import :thread_locals; +import :ops; +import :handles; +import :final_suspend; +import :awaitables; + +// TODO: vet constexpr usage in the library + +namespace lf { + +// =============== Final awaitable =============== // + +struct final_awaitable : std::suspend_always { + template + constexpr static auto await_suspend(coro> handle) noexcept -> coro<> { + return final_suspend_leading(&handle.promise().frame); + } +}; + +// =============== Frame mixin =============== // + +template +struct mixin_frame { + + // === For internal use === // + + using enum category; + + template + requires (!std::is_const_v) + [[nodiscard]] + constexpr auto handle(this Self &self) + LF_HOF(coro::from_promise(self)) + + // === Called by the compiler === // + + // --- Allocation + + static auto operator new(std::size_t sz) noexcept(noexcept(get_tls_stack().push(sz))) -> void * { + void *ptr = get_tls_stack().push(sz); + LF_ASSUME(is_sufficiently_aligned(ptr)); + return std::assume_aligned(ptr); + } + + static auto operator delete(void *p, std::size_t sz) noexcept 
-> void { + get_tls_stack().pop(p, sz); + } + + // --- Await transformations + + template + constexpr auto + await_transform_pkg(this auto const &self, pkg &&pkg) noexcept( + async_nothrow_invocable) -> async_awaitable { + + using U = async_result_t; + + // clang-format off + + promise_type *child_promise = get(key(), std::move(pkg.args).apply( + [&](auto &&...args) LF_HOF(ctx_invoke_t{}(fwd_fn(pkg.fn), LF_FWD(args)...)) + )); + + // clang-format on + + LF_ASSUME(child_promise); + + // void can signal drop return. + static_assert(std::same_as || std::is_void_v); + + if constexpr (!std::is_void_v) { + child_promise->return_address = not_null(pkg.return_addr); + } else if constexpr (!std::is_void_v) { + // Set child's return address to null to inhibit the return + // TODO: add test for this + child_promise->return_address = nullptr; + } + + if constexpr (StopToken) { + // TODO: need some kind of API to launch an unstoppable task? + LF_ASSUME(pkg.stop_token.stop_possible()); + child_promise->frame.stop_token = pkg.stop_token; + } else { + child_promise->frame.stop_token = self.frame.stop_token; + } + + return {.child = &child_promise->frame}; + } + + template + constexpr auto await_transform(this auto &self, pkg &&pkg) noexcept + -> async_awaitable { + LF_TRY { + return self.await_transform_pkg(std::move(pkg)); + } LF_CATCH_ALL { + stash_current_exception(&self.frame); + } + return {.child = nullptr}; + } + + constexpr auto await_transform(this auto &self, join_type) noexcept -> join_awaitable { + return {.frame = &self.frame}; + } + + static constexpr auto await_transform(scope_type) noexcept -> scope_awaitable { return {}; } + + constexpr auto + await_transform(this auto const &self, child_scope_type) noexcept -> child_scope_awaitable { + return {.parent_stop_token = self.frame.stop_token}; + } + + constexpr static auto initial_suspend() noexcept -> std::suspend_always { return {}; } + + constexpr static auto final_suspend() noexcept -> final_awaitable { return {}; 
} + + constexpr void unhandled_exception(this auto &self) noexcept { + // Stash the exception in the parent which will rethrow at the join. + stash_current_exception(self.frame.parent); + } +}; + +// =============== Promise (void) =============== // + +template +struct promise_type : mixin_frame { + + // Putting init here allows: + // 1. Frame not to need to know about the checkpoint type + // 2. Compiler merge double read of thread local here and in allocator + frame_t frame{get_tls_stack().checkpoint()}; + + constexpr auto get_return_object() noexcept -> task { return {key(), this}; } + + constexpr static void return_void() noexcept {} +}; + +// =============== Promise (non-void) =============== // + +template +struct promise_type : mixin_frame { + + // Putting init here allows: + // 1. Frame not to need to know about the checkpoint type + // 2. Compiler merge double read of thread local here and in allocator + frame_t frame{get_tls_stack().checkpoint()}; + T *return_address; + + constexpr auto get_return_object() noexcept -> task { return {key(), this}; } + + template + requires std::assignable_from + constexpr void return_value(U &&value) noexcept(std::is_nothrow_assignable_v) { + if (return_address) { + *return_address = LF_FWD(value); + } + } +}; + +} // namespace lf + +// =============== std specialization =============== // + +template +struct std::coroutine_traits, Args...> { + using promise_type = ::lf::promise_type; +}; + +template +struct std::coroutine_traits, Self, Args...> { + using promise_type = ::lf::promise_type; +}; + +// =============== Layout invariants =============== // + +namespace { + +struct unit_checkpoint { + auto operator==(unit_checkpoint const &) const -> bool = default; +}; + +struct unit_stack { + static auto push(std::size_t) -> void *; + static auto pop(void *, std::size_t) noexcept -> void; + static auto checkpoint() noexcept -> unit_checkpoint; + static auto prepare_release() noexcept -> int; + static auto release(int) noexcept 
-> void; + static auto acquire(unit_checkpoint) noexcept -> void; +}; + +struct unit_context { + void push(lf::steal_handle); + auto pop() noexcept -> lf::steal_handle; + auto stack() noexcept -> unit_stack &; +}; + +static_assert(lf::worker_context); + +using frame_t = lf::frame_type; + +static_assert(std::is_standard_layout_v); +static_assert(alignof(lf::promise_type) == alignof(frame_t)); +static_assert(alignof(lf::promise_type) == alignof(frame_t)); +static_assert(std::is_standard_layout_v>); +static_assert(std::is_standard_layout_v>); + +#ifdef __cpp_lib_is_pointer_interconvertible +static_assert(std::is_pointer_interconvertible_with_class(&lf::promise_type::frame)); +static_assert(std::is_pointer_interconvertible_with_class(&lf::promise_type::frame)); +#endif + +} // namespace diff --git a/src/core/receiver.cxx b/src/core/receiver.cxx new file mode 100644 index 000000000..bfa0f42c7 --- /dev/null +++ b/src/core/receiver.cxx @@ -0,0 +1,214 @@ + +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:receiver; + +import std; + +import :stop; +import :exception; + +namespace lf { + +export struct broken_receiver_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "receiver is in invalid state"; + } +}; + +export struct operation_cancelled_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "operation was cancelled"; + } +}; + +/** + * @brief Shared state between a scheduled task and its receiver handle. 
+ */ +template +struct hidden_receiver_state { + + struct empty_1 {}; + struct empty_2 {}; + + alignas(k_new_align) std::array buffer{}; + + [[no_unique_address]] + std::conditional_t, empty_1, T> return_value{}; + + std::exception_ptr exception; + std::atomic_flag ready; + + [[no_unique_address]] + std::conditional_t stop; + + constexpr hidden_receiver_state() = default; + + template + requires std::constructible_from + constexpr explicit(sizeof...(Args) == 1) + hidden_receiver_state(Args &&...args) noexcept(std::is_nothrow_constructible_v) + : return_value(std::forward(args)...) {} +}; + +/// Convenience alias — used throughout the core partitions. +template +using state_handle = std::shared_ptr>; + +/** + * @brief Lightweight move-only handle owning a pre-allocated root task state. + * + * Construction allocates a `hidden_receiver_state` which embeds a + * 1 KiB aligned buffer; the root coroutine frame is placement-constructed + * into that buffer by `schedule`. + * + * Constructors mirror `make_shared` / `allocate_shared`: + * + * recv_state s; // default-init return value + * recv_state s{v1, v2}; // in-place init: T{v1, v2} + * recv_state s{allocator_arg, alloc}; // default-init, custom allocator + * recv_state s{allocator_arg, alloc, v1, v2}; // in-place init + custom allocator + */ +export template +class recv_state { + using state_type = hidden_receiver_state; + + public: + /// Default: value-initialise via `std::make_shared`. + constexpr recv_state() + : m_ptr(std::make_shared()) {} + + /// Value-init from args: forwards `args` to `hidden_receiver_state`'s constructor + /// (in-place construction of the return value) via `std::make_shared`. + template + requires std::constructible_from + constexpr explicit(sizeof...(Args) == 1) recv_state(Args &&...args) + : m_ptr(std::make_shared(std::forward(args)...)) {} + + /// Allocator-aware, default return value: allocate via `std::allocate_shared`. 
+ template + constexpr recv_state(std::allocator_arg_t, Alloc const &alloc) + : m_ptr(std::allocate_shared(alloc)) {} + + /// Allocator-aware with value-init args. + template + requires std::constructible_from + constexpr recv_state(std::allocator_arg_t, Alloc const &alloc, Args &&...args) + : m_ptr(std::allocate_shared(alloc, std::forward(args)...)) {} + + // Move-only. + constexpr recv_state(recv_state &&) noexcept = default; + constexpr auto operator=(recv_state &&) noexcept -> recv_state & = default; + constexpr recv_state(recv_state const &) = delete; + constexpr auto operator=(recv_state const &) -> recv_state & = delete; + + private: + [[nodiscard]] + friend constexpr auto get(key_t, recv_state &&self) noexcept -> state_handle { + return std::move(self.m_ptr); + } + + state_handle m_ptr; +}; + +export template +class receiver { + + using state_type = hidden_receiver_state; + + public: + constexpr receiver(key_t, state_handle state) noexcept + : m_state(std::move(state)) {} + + // Move only + constexpr receiver(receiver &&) noexcept = default; + constexpr receiver(const receiver &) = delete; + constexpr auto operator=(receiver &&) noexcept -> receiver & = default; + constexpr auto operator=(const receiver &) -> receiver & = delete; + + /** + * @brief Test if connected to a receiver state. + */ + [[nodiscard]] + constexpr auto valid() const noexcept -> bool { + return m_state != nullptr; + } + + /** + * @brief Test if the associated task has completed (either successfully or with an exception/cancellation). + */ + [[nodiscard]] + constexpr auto ready() const -> bool { + if (!valid()) { + LF_THROW(broken_receiver_error{}); + } + return m_state->ready.test(); + } + + /** + * @brief Wait for the associated task to complete (either successfully or with an exception/cancellation). + * + * May be called multiple times. 
+ */ + constexpr void wait() const { + if (!valid()) { + LF_THROW(broken_receiver_error{}); + } + m_state->ready.wait(false); + } + + /** + * @brief Get a reference to the stop_source for this task, allowing the caller to request cancellation. + * + * Only available when Stoppable=true. + */ + [[nodiscard]] + constexpr auto stop_source() -> stop_source & + requires Stoppable + { + if (!valid()) { + LF_THROW(broken_receiver_error{}); + } + return m_state->stop; + } + + /** + * @brief Wait for the associated task to complete and return its result, or rethrow. + * + * If the receiver was cancelled this will throw an exception. + * + * This may only be called once; the state is consumed and the receiver becomes invalid. + */ + [[nodiscard]] + constexpr auto get() && -> T { + + wait(); + + // State will be cleaned up on unwind + std::shared_ptr state = std::exchange(m_state, nullptr); + + LF_ASSUME(state != nullptr); + + if (state->exception) { + std::rethrow_exception(state->exception); + } + + if constexpr (Stoppable) { + if (state->stop.stop_requested()) { + LF_THROW(operation_cancelled_error{}); + } + } + + if constexpr (!std::is_void_v) { + return std::move(state->return_value); + } + } + + private: + state_handle m_state; +}; + +} // namespace lf diff --git a/src/core/root.cxx b/src/core/root.cxx new file mode 100644 index 000000000..12c9b955b --- /dev/null +++ b/src/core/root.cxx @@ -0,0 +1,204 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:root; + +import std; + +import :concepts_context; +import :concepts_invocable; +import :frame; +import :promise; +import :receiver; +import :thread_locals; +import :task; +import :exception; + +namespace lf { + +/** + * @brief Thrown if the root coroutine frame is too large for the embedded buffer. 
+ */ +export struct root_alloc_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "root coroutine frame exceeds hidden_receiver_state buffer size"; + } +}; + +struct get_frame_t {}; + +template +struct root_task { + struct promise_type { + + frame_type frame{Checkpoint{}}; + + /// Owns a ref to the hidden_receiver_state hosting this frame's buffer. + std::shared_ptr keep_alive; + + template + constexpr explicit promise_type(state_handle const &recv, Args const &...) noexcept + : keep_alive(recv) {} + + template + static auto + operator new(std::size_t size, state_handle const &recv, Args const &...) -> void * { + + LF_ASSUME(recv != nullptr); + + if (size > recv->buffer.size()) { + LF_THROW(root_alloc_error{}); + } + + return recv->buffer.data(); + } + + /// No-op: the buffer is owned by the hidden_receiver_state, not the frame. + static auto operator delete(void * /*ptr*/, std::size_t /*size*/) noexcept -> void {} + + struct frame_awaitable : std::suspend_never { + frame_type *frame; + [[nodiscard]] + constexpr auto await_resume() const noexcept -> frame_type * { + return frame; + } + }; + + constexpr auto await_transform([[maybe_unused]] get_frame_t tag) noexcept -> frame_awaitable { + return {.frame = &frame}; + } + + struct call_awaitable : std::suspend_always { + frame_type *child; + constexpr auto await_suspend([[maybe_unused]] coro root) const noexcept -> coro<> { + return child->handle(); + } + }; + + constexpr auto await_transform(frame_type *child) noexcept -> call_awaitable { + return {.child = child}; + } + + constexpr auto get_return_object() noexcept -> root_task { return {.promise = this}; } + + constexpr static auto initial_suspend() noexcept -> std::suspend_always { return {}; } + + /** + * @brief Custom final_suspend. + * + * The root coroutine frame lives inside the hidden_receiver_state's embedded + * buffer, so the hidden_receiver_state must outlive the frame teardown. 
+ * + * 1. `std::exchange` the keep-alive shared_ptr into a local on the + * host stack, leaving the promise member null. + * 2. `handle.destroy()` — runs parameter + promise destructors (including + * the now-null `keep_alive`) and our no-op `operator delete`. + * No frame-memory access occurs after the handle returns. + * 3. On return, the stack-local `shared_ptr` dies; if its ref + * was the last, it destroys the hidden_receiver_state cleanly — we are + * no longer executing inside the buffer. + */ + struct final_awaiter : std::suspend_always { + void await_suspend(std::coroutine_handle handle) const noexcept { + std::shared_ptr local = std::exchange(handle.promise().keep_alive, nullptr); + LF_ASSUME(local != nullptr); + handle.destroy(); + // `local` released here — possibly freeing hidden_receiver_state on return. + } + }; + + constexpr static auto final_suspend() noexcept -> final_awaiter { return {}; } + + constexpr static void return_void() noexcept {} + + [[noreturn]] + constexpr void unhandled_exception() noexcept { + // Any exceptions escaping the root task are a bug. + LF_UNREACHABLE(); + } + }; + + promise_type *promise; +}; + +template + requires async_invocable_to +[[nodiscard]] +auto // +root_pkg(state_handle recv, Fn fn, Args... args) -> root_task> { + + // This should be resumed on a valid context. + LF_ASSUME(thread_local_context != nullptr); + + using checkpoint = checkpoint_t; + + // Pointer to this root_task's own frame. + frame_type *root = not_null(co_await get_frame_t{}); + + // Manual "call" invocation of the user-supplied task. + + using result_type = async_result_t; + using promise_type = promise_type; + + promise_type *child = nullptr; + + if (root->stop_requested()) { + // The root task was cancelled before it even started, we can skip + // straight to cleanup. 
+ goto cleanup; + } + + LF_TRY { + // Potentially throwing + child = get(key(), ctx_invoke_t{}(std::move(fn), std::move(args)...)); + } LF_CATCH_ALL { + recv->exception = std::current_exception(); + goto cleanup; + } + + LF_ASSUME(child != nullptr); + + // Propagate parent/stop info to child + child->frame.parent = root; + child->frame.stop_token = root->stop_token; + + LF_ASSUME(child->frame.kind == category::call); + + if constexpr (!std::is_void_v>) { + child->return_address = std::addressof(recv->return_value); + } + + // Begin normal execution of the child task, it will clean itself + // up (i.e. .destroy()) at the final suspend + co_await &child->frame; + + // Now we have been resumed the child is done, it could have completed via: + // + // - Normal return + // - Exception + // - Cancellation (in which case it would have dropped any exceptions) + // + // For symmetry with a normal task we unconditionally propagate exceptions here, + // effectively this is an `await_resume`. + + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (root->exception_bit) { + // The child threw an exception, propagate it to the receiver. + recv->exception = extract_exception(root); + } + } + +cleanup: + // Notify the receiver that the task is done. 
+ recv->ready.test_and_set(); + recv->ready.notify_one(); + + LF_ASSUME(root->steals == 0); + LF_ASSUME(root->joins == k_u16_max); + LF_ASSUME(root->exception_bit == 0); + + co_return; +} + +} // namespace lf diff --git a/src/core/schedule.cxx b/src/core/schedule.cxx new file mode 100644 index 000000000..0a328063f --- /dev/null +++ b/src/core/schedule.cxx @@ -0,0 +1,114 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:schedule; + +import std; + +import :concepts_invocable; +import :concepts_scheduler; +import :frame; +import :stop; +import :thread_locals; +import :promise; +import :root; +import :handles; +import :receiver; +import :exception; + +namespace lf { + +export struct schedule_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "schedule called from within a worker thread!"; + } +}; + +template +concept decay_copyable = std::convertible_to>; + +/** + * @brief Schedule a function using a caller-provided `recv_state`. + * + * This will create a root task that stores decayed copies of `Fn` and + * `Args...` in its frame, then post it to the scheduler. The root task must + * then be resumed by a worker which will perform the invocation of `Fn`. + * + * The return address/exception and possibly stop token of the root task are + * bound to the provided `recv_state` and can be observed by the caller via the + * returned `receiver`. + * + * Strongly exception safe. 
+ */ +export template + requires async_invocable_to, R, context_t, std::decay_t...> +[[nodiscard("Fire and forget is an anti-pattern")]] +constexpr auto +schedule(Sch &&sch, recv_state state, Fn &&fn, Args &&...args) -> receiver { + + using context_type = context_t; + + if (thread_local_context != nullptr) { + LF_THROW(schedule_error{}); + } + + state_handle state_ptr = get(key(), std::move(state)); + + LF_ASSUME(state_ptr != nullptr); + + // root_pkg's operator new may throw root_alloc_error if the frame is + // too large; if so, `state_ptr` goes out of scope and destroys the state. + root_task task = root_pkg(state_ptr, std::forward(fn), std::forward(args)...); + + LF_ASSUME(task.promise != nullptr); + + task.promise->frame.kind = category::root; + task.promise->frame.parent = nullptr; + + if constexpr (Stoppable) { + task.promise->frame.stop_token = state_ptr->stop.token(); + } else { + task.promise->frame.stop_token = stop_source::stop_token{}; // non-cancellable root + } + + LF_TRY { + std::forward(sch).post(sched_handle{key(), &task.promise->frame}); + // If ^ didn't throw then the root_task will destroy itself at the final suspend. + } LF_CATCH_ALL { + // Otherwise, if it did throw, we must clean up + task.promise->frame.handle().destroy(); + LF_RETHROW; + } + + return {key(), std::move(state_ptr)}; +} + +template +concept schedulable_return = std::is_void_v || (std::default_initializable && std::movable); + +template +concept default_schedulable = + async_invocable && schedulable_return>; + +template +using async_decay_result_t = async_result_t, Context, std::decay_t...>; + +/** + * @brief Convenience overload: default-constructs a non-cancellable recv_state. + * + * Uses the default allocator (`make_shared`) for all allocations. 
+ */ +export template + requires default_schedulable, context_t, std::decay_t...> +[[nodiscard("Fire and forget is an anti-pattern")]] +constexpr auto +schedule(Sch &&sch, Fn &&fn, Args &&...args) -> receiver, Args...>> { + using result_type = async_decay_result_t, Args...>; + recv_state state; + return schedule( + std::forward(sch), std::move(state), std::forward(fn), std::forward(args)...); +} + +} // namespace lf diff --git a/src/core/stop.cxx b/src/core/stop.cxx new file mode 100644 index 000000000..149b0adde --- /dev/null +++ b/src/core/stop.cxx @@ -0,0 +1,126 @@ +export module libfork.core:stop; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief Similar to a linked-list of std::stop_source but with an embedded stop_state. + */ +export class stop_source { + public: + /** + * @brief Lightweight public handle to a stop_source chain. + * + * A stop_token is a non-owning pointer-sized wrapper around a stop_source. + */ + class stop_token { + public: + /** + * @brief Construct a null (unstoppable) token. + */ + constexpr stop_token() noexcept = default; + + /** + * @brief Returns true if a stop source is associated (stopping is possible). + */ + [[nodiscard]] + constexpr auto stop_possible() const noexcept -> bool { + return m_src != nullptr; + } + + /** + * @brief Returns true if any stop source in the ancestor chain has been stopped. + * + * A null token always returns false. + * + * Complexity: O(chain depth). Every task that creates a child_scope adds one + * node to the chain, so deeply-nested task hierarchies pay proportionally more + * per stop check. + */ + [[nodiscard]] + constexpr auto stop_requested() const noexcept -> bool { + return deep_stop_requested(m_src); + } + + private: + friend class stop_source; + + explicit constexpr stop_token(stop_source const *src) noexcept + : m_src(src) {} + + stop_source const *m_src = nullptr; + }; + + /** + * @brief Construct a root stop source with no parent. 
+ */ + constexpr stop_source() noexcept = default; + + /** + * @brief Construct a stop source chained onto the given parent token. + */ + constexpr explicit stop_source(stop_token parent) noexcept + : m_parent(parent.m_src) {} + + // Immovable + constexpr stop_source(const stop_source &) noexcept = delete; + constexpr stop_source(stop_source &&) noexcept = delete; + constexpr auto operator=(const stop_source &) noexcept -> stop_source & = delete; + constexpr auto operator=(stop_source &&) noexcept -> stop_source & = delete; + + /** + * @brief Get a handle to this stop source. + */ + [[nodiscard]] + constexpr auto token() const noexcept -> stop_token { + return stop_token{this}; + } + + /** + * @brief Returns true if any stop source in the ancestor chain has been stopped. + * + * Complexity: O(chain depth). Every task that creates a child_scope adds one + * node to the chain, so deeply-nested task hierarchies pay proportionally more + * per stop check. + */ + [[nodiscard]] + constexpr auto stop_requested() const noexcept -> bool { + return deep_stop_requested(this); + } + + /** + * @brief Request that this stop source (and all its children) stop. + */ + constexpr auto request_stop() noexcept -> void { m_stop.store(1, std::memory_order_release); } + + /** + * @brief Same as `request_stop`, but returns true if this is the first time stop has been requested. + */ + [[nodiscard("You can use request_stop() if you don't need the return value")]] + constexpr auto race_request_stop() noexcept -> bool { + return m_stop.exchange(1, std::memory_order_release) == 0; + } + + private: + /** + * @brief Test if any stop request has been made in the current chain. + * + * Safe to call with a null pointer, in which case it returns false. 
+ */ + [[nodiscard]] + friend constexpr auto deep_stop_requested(stop_source const *src) noexcept -> bool { + for (stop_source const *ptr = src; ptr != nullptr; ptr = ptr->m_parent) { + if (ptr->m_stop.load(std::memory_order_acquire) == 1) { + return true; + } + } + return false; + } + + stop_source const *m_parent = nullptr; + std::atomic m_stop = 0; +}; +} // namespace lf diff --git a/src/core/task.cxx b/src/core/task.cxx new file mode 100644 index 000000000..424910b4e --- /dev/null +++ b/src/core/task.cxx @@ -0,0 +1,59 @@ +export module libfork.core:task; + +import std; + +import libfork.utils; + +import :concepts_context; + +namespace lf { + +/** + * @brief A type returnable from libfork's async functions/coroutines. + * + * This requires that `T` is `void` or a `std::movable` type. + */ +export template +concept returnable = std::is_void_v || (plain_object && std::movable); + +export template +struct env { + explicit constexpr env(key_t) noexcept {} +}; + +// Forward-declare promise_type so task can reference it as a pointer. +template +struct promise_type; + +/** + * @brief The return type for libfork's async functions/coroutines. + * + * This predominantly exists to disambiguate `libfork`s coroutines from other + * coroutines and specify `T` the async function's return type which is + * required to be `void` or a `std::movable` type. + * + * \rst + * + * .. note:: + * + * No consumer of this library should ever touch an instance of this type, + * it is used for specifying the return type of an `async` function only. 
+ * + * \endrst + */ +export template +class task { + public: + using value_type = T; + using context_type = Context; + + constexpr task(key_t, promise_type *promise) noexcept + : m_promise(promise) {} + + private: + friend constexpr auto get(key_t, task t) noexcept -> promise_type * { return t.m_promise; } + + promise_type *m_promise; +}; + +} // namespace lf diff --git a/src/core/thread_locals.cxx b/src/core/thread_locals.cxx new file mode 100644 index 000000000..67a278378 --- /dev/null +++ b/src/core/thread_locals.cxx @@ -0,0 +1,33 @@ +export module libfork.core:thread_locals; + +import libfork.utils; + +import :concepts_context; + +namespace lf { + +/** + * @brief Thread-local pointer to the current worker context. + */ +template +constinit inline thread_local Context *thread_local_context = nullptr; + +// TODO: implications of thread local on constexpr + +/** + * @brief A getter for the current worker context, checks for null in debug. + */ +template +constexpr auto get_tls_context() noexcept -> Context & { + return *not_null(thread_local_context); +} + +/** + * @brief A getter for the current worker context's stack, checks for null in debug. + */ +template +constexpr auto get_tls_stack() noexcept -> stack_t & { + return get_tls_context().stack(); +} + +} // namespace lf diff --git a/src/exception.cpp b/src/exception.cpp new file mode 100644 index 000000000..a05e60c0e --- /dev/null +++ b/src/exception.cpp @@ -0,0 +1,20 @@ +#include + +#include "libfork/__impl/exception.hpp" + +import std; + +namespace lf::impl { + +[[noreturn]] +void terminate_with(char const *message, char const *file, int line) noexcept { + LF_TRY { + std::println(stderr, "{} {}:{}: {}", std::this_thread::get_id(), file, line, message); + } LF_CATCH_ALL { + // Drop exceptions during termination + } + // TODO: can we get a stack trace here? 
+ std::terminate(); +} + +} // namespace lf::impl diff --git a/src/libfork.cxx b/src/libfork.cxx new file mode 100644 index 000000000..25e255874 --- /dev/null +++ b/src/libfork.cxx @@ -0,0 +1,5 @@ +export module libfork; + +export import libfork.core; +export import libfork.batteries; +export import libfork.schedulers; diff --git a/src/schedulers/busy.cxx b/src/schedulers/busy.cxx new file mode 100644 index 000000000..f7bc92955 --- /dev/null +++ b/src/schedulers/busy.cxx @@ -0,0 +1,142 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +export module libfork.schedulers:basic_busy_pool; + +import std; + +import libfork.utils; +import libfork.core; +import libfork.batteries; + +namespace lf { + +struct invalid_workers_error : std::exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "A thread pool must have at least one worker."; + } +}; + +export enum class pool_kind { mono, poly }; + +export template , + simple_allocator Alloc = std::allocator> +class basic_busy_pool { + + using context = std::conditional_t< // + Kind == pool_kind::poly, // + derived_poly_context, // + mono_context // + >; + + public: + using context_type = context::context_type; + + // TODO: sleep when zero work + + explicit basic_busy_pool(std::size_t n = std::thread::hardware_concurrency(), Alloc const &alloc = Alloc()) + : m_contexts(n) { + + // TODO: propagate alloc to m_contexts, m_posted, etc. + (void)alloc; + + if (n < 1) { + LF_THROW(invalid_workers_error{}); + } + + LF_TRY{ + for (std::size_t id = 0; id < n; ++id) { + m_threads.emplace_back([this, id](std::stop_token stop) -> void { + worker(std::move(stop), id); + }); + } + } LF_CATCH_ALL { + // Force joins before members (which threads reference) are destroyed. 
+ join_all(); + LF_RETHROW; + } + } + + basic_busy_pool(basic_busy_pool const &) = delete; + basic_busy_pool(basic_busy_pool &&) = delete; + + auto operator=(basic_busy_pool const &) -> basic_busy_pool & = delete; + auto operator=(basic_busy_pool &&) -> basic_busy_pool & = delete; + + ~basic_busy_pool() { join_all(); } + + void post(sched_handle handle) { + // TODO: use a lock-free queue here + auto lock = std::unique_lock(m_mutex); + m_posted.push_back(handle); + } + + private: + void worker(std::stop_token stop, std::size_t id) { + + LF_ASSUME(id < m_contexts.size()); + + context &ctx = m_contexts[id]; + + std::size_t const n = m_contexts.size(); + + std::default_random_engine rng(safe_cast(id + 1)); + std::uniform_int_distribution dist(0, n - 2); + + constexpr int k_steal_attempts = 1024; + + while (!stop.stop_requested()) { + + if (auto lock = std::unique_lock(m_mutex); !m_posted.empty()) { + sched_handle task = m_posted.back(); + m_posted.pop_back(); + lock.unlock(); + execute(static_cast(ctx), task); + continue; + } + + if (n > 1) { + for (int i = 0; i < k_steal_attempts; ++i) { + + std::size_t victim = dist(rng); + + if (victim >= id) { + victim += 1; + } + + LF_ASSUME(victim < n); + LF_ASSUME(victim != id); + + if (auto result = m_contexts[victim].steal()) { + execute(static_cast(ctx), result); + continue; + } + } + } + } + } + + void join_all() { + m_threads.clear(); // jthread calls stop and joins in destructor + } + + std::vector m_contexts; + std::vector m_threads; + std::mutex m_mutex; + std::vector> m_posted; +}; + +export template , + simple_allocator Alloc = std::allocator> +using mono_busy_pool = basic_busy_pool; + +export template , + simple_allocator Alloc = std::allocator> +using poly_busy_pool = basic_busy_pool; + +} // namespace lf diff --git a/src/schedulers/inline.cxx b/src/schedulers/inline.cxx new file mode 100644 index 000000000..c0d58bd5b --- /dev/null +++ b/src/schedulers/inline.cxx @@ -0,0 +1,51 @@ +export module 
libfork.schedulers:inline_scheduler; + +import std; + +import libfork.core; + +import libfork.batteries; + +namespace lf { + +// TODO: think about initialization: +// - do we need default initializable on stack/context? +// - with allocators + +// TODO: Can we store the context directly in TLS? + +template +concept derived_context_from = worker_context && std::derived_from; + +export template +concept derived_worker_context = + has_context_typedef && derived_context_from>; + +export template +class inline_scheduler { + public: + using context_type = Context::context_type; + + inline_scheduler() = default; + + template + requires std::constructible_from + explicit(sizeof...(Args) == 1) + inline_scheduler(Args &&...args) noexcept(std::is_nothrow_constructible_v) + : m_context(std::forward(args)...) {} + + void post(lf::sched_handle handle) { + execute(static_cast(m_context), handle); + } + + private: + Context m_context; +}; + +export template +using mono_inline_scheduler = inline_scheduler>; + +export template +using poly_inline_scheduler = inline_scheduler>; + +} // namespace lf diff --git a/src/schedulers/schedulers.cxx b/src/schedulers/schedulers.cxx new file mode 100644 index 000000000..0ab5c0a27 --- /dev/null +++ b/src/schedulers/schedulers.cxx @@ -0,0 +1,4 @@ +export module libfork.schedulers; + +export import :inline_scheduler; +export import :basic_busy_pool; diff --git a/src/utils/concepts.cxx b/src/utils/concepts.cxx new file mode 100644 index 000000000..ad1d7a11b --- /dev/null +++ b/src/utils/concepts.cxx @@ -0,0 +1,65 @@ +export module libfork.utils:concepts; + +import std; + +namespace lf { + +// =========== Atomic related concepts =========== // + +export template +concept plain_object = std::is_object_v && std::same_as>; + +/** + * @brief Verify a type is suitable for use with `std::atomic` + * + * This requires a `TriviallyCopyable` type satisfying both `CopyConstructible` and `CopyAssignable`. 
+ */ +export template +concept atomicable = plain_object && // + std::is_trivially_copyable_v && // + std::is_copy_constructible_v && // + std::is_move_constructible_v && // + std::is_copy_assignable_v && // + std::is_move_assignable_v; // + +/** + * @brief A concept that verifies a type is lock-free when used with `std::atomic`. + */ +export template +concept lock_free = atomicable && std::atomic::is_always_lock_free; + +// ========== Specialization ========== // + +template typename Template> +struct is_specialization_of : std::false_type {}; + +template