diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..14dcdd1
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,315 @@
+name: CI
+
+on:
+  push:
+    branches: [branch_libev, master]
+  pull_request:
+    branches: [branch_libev, master]
+  workflow_dispatch:
+
+# Read-only by default; jobs that publish benchmark data opt in to write access.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    # The "Store benchmark results" step auto-pushes, so this job needs write access.
+    permissions:
+      contents: write
+      deployments: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run correctness tests
+        run: make test
+
+      - name: Build benchmarks
+        run: make bench
+
+      - name: Run benchmarks
+        run: ./bench_udpspeeder
+
+      - name: Run benchmarks (JSON)
+        run: taskset -c 0 ./bench_udpspeeder --json
+
+      - name: Build production binary
+        run: make all
+
+      - name: Throughput test (io_uring)
+        run: bash bench/throughput.sh ./speederv2 --iterations 3 --duration 5
+
+      - name: Throughput test (recvfrom baseline)
+        run: UDPSPEEDER_NO_URING=1 bash bench/throughput.sh ./speederv2 --iterations 3 --duration 5
+
+      - name: Store benchmark results
+        if: github.ref == 'refs/heads/branch_libev'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: UDPspeeder Benchmarks
+          tool: customSmallerIsBetter
+          output-file-path: bench_results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '115%'
+          comment-on-alert: true
+          fail-on-alert: false
+          benchmark-data-dir-path: dev/bench
+
+  build-static:
+    runs-on: ubuntu-latest
+    # The "Store PPC benchmark results" step auto-pushes, so this job needs write access.
+    permissions:
+      contents: write
+      deployments: write
+    strategy:
+      matrix:
+        include:
+          - name: x86_64
+            packages: ""
+            toolchain_url: ""
+            bench_target: bench-static
+            test_target: test-static
+            prod_target: all
+            make_args: ""
+            bench_bin: bench_udpspeeder_static
+            test_bin: test_udpspeeder_static
+            prod_bin: speederv2
+            qemu_cmd: ""
+          - name: aarch64
+            packages: g++-aarch64-linux-gnu qemu-user-static
+            toolchain_url: ""
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: "CC=aarch64-linux-gnu-g++"
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: qemu-aarch64-static
+          - name: mips
+            packages: g++-mips-linux-gnu qemu-user-static
+            toolchain_url: ""
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: "CC=mips-linux-gnu-g++"
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: qemu-mips-static
+          - name: powerpc
+            packages: qemu-user-static zstd
+            toolchain_url: "https://downloads.openwrt.org/releases/25.12.0-rc5/targets/mpc85xx/p1010/openwrt-toolchain-25.12.0-rc5-mpc85xx-p1010_gcc-14.3.0_musl.Linux-x86_64.tar.zst"
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: "SPE=1"
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: "qemu-ppc-static -cpu e500v2"
+          - name: riscv64
+            packages: qemu-user-static zstd
+            toolchain_url: "https://downloads.openwrt.org/releases/24.10.0/targets/sifiveu/generic/openwrt-toolchain-24.10.0-sifiveu-generic_gcc-13.3.0_musl.Linux-x86_64.tar.zst"
+            bench_target: bench-cross
+            test_target: test-cross
+            prod_target: all-cross
+            make_args: ""
+            bench_bin: bench_udpspeeder_cross
+            test_bin: test_udpspeeder_cross
+            prod_bin: speederv2_cross
+            qemu_cmd: qemu-riscv64-static
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install packages
+        if: matrix.packages != ''
+        run: sudo apt-get update && sudo apt-get install -y ${{ matrix.packages }}
+
+      - name: Download OpenWrt toolchain
+        if: matrix.toolchain_url != ''
+        env:
+          TOOLCHAIN_URL: ${{ matrix.toolchain_url }}
+        run: |
+          curl -fSL "$TOOLCHAIN_URL" -o toolchain.tar.zst
+          mkdir -p /tmp/openwrt-toolchain
+          tar --zstd -xf toolchain.tar.zst -C /tmp/openwrt-toolchain --strip-components=1
+          OPENWRT_GXX=$(find /tmp/openwrt-toolchain -name '*-g++' -path '*/bin/*' | head -1)
+          # An empty result would make the build steps silently drop the CC= override.
+          if [ -z "${OPENWRT_GXX}" ]; then
+            echo "::error::No *-g++ compiler found in the OpenWrt toolchain"
+            exit 1
+          fi
+          echo "OPENWRT_GXX=${OPENWRT_GXX}" >> "$GITHUB_ENV"
+          echo "STAGING_DIR=/tmp/openwrt-toolchain" >> "$GITHUB_ENV"
+          echo "Found toolchain: ${OPENWRT_GXX}"
+
+      - name: Build bench (${{ matrix.name }})
+        run: |
+          ARGS="${{ matrix.make_args }}"
+          if [ -n "${OPENWRT_GXX:-}" ]; then
+            ARGS="CC=${OPENWRT_GXX} ${ARGS}"
+          fi
+          make ${{ matrix.bench_target }} ${ARGS}
+
+      - name: Build test (${{ matrix.name }})
+        run: |
+          ARGS="${{ matrix.make_args }}"
+          if [ -n "${OPENWRT_GXX:-}" ]; then
+            ARGS="CC=${OPENWRT_GXX} ${ARGS}"
+          fi
+          make ${{ matrix.test_target }} ${ARGS}
+
+      - name: Build production (${{ matrix.name }})
+        run: |
+          ARGS="${{ matrix.make_args }}"
+          if [ -n "${OPENWRT_GXX:-}" ]; then
+            ARGS="CC=${OPENWRT_GXX} ${ARGS}"
+          fi
+          make ${{ matrix.prod_target }} ${ARGS}
+
+      - name: Verify binaries
+        run: file ${{ matrix.bench_bin }} ${{ matrix.test_bin }} ${{ matrix.prod_bin }}
+
+      - name: Run tests (QEMU)
+        if: matrix.qemu_cmd != ''
+        run: ${{ matrix.qemu_cmd }} ./${{ matrix.test_bin }}
+
+      - name: Run benchmarks (QEMU)
+        if: matrix.qemu_cmd != ''
+        run: ${{ matrix.qemu_cmd }} ./${{ matrix.bench_bin }}
+
+      - name: Run benchmarks JSON (QEMU)
+        if: matrix.qemu_cmd != '' && matrix.name == 'powerpc'
+        run: ${{ matrix.qemu_cmd }} ./${{ matrix.bench_bin }} --json
+
+      - name: Store PPC benchmark results
+        if: matrix.name == 'powerpc' && github.ref == 'refs/heads/branch_libev'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: UDPspeeder Benchmarks (PowerPC e500v2 via QEMU)
+          tool: customSmallerIsBetter
+          output-file-path: bench_results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '200%'
+          comment-on-alert: true
+          fail-on-alert: false
+          benchmark-data-dir-path: dev/bench-powerpc
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: udpspeeder-${{ matrix.name }}
+          path: |
+            ${{ matrix.bench_bin }}
+            ${{ matrix.test_bin }}
+            ${{ matrix.prod_bin }}
+            bench/profile.sh
+
+  interop:
+    runs-on: ubuntu-latest
+    needs: [build-static]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install QEMU
+        run: sudo apt-get update && sudo apt-get install -y qemu-user-static
+
+      - name: Download x86_64 artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-x86_64
+          path: bin/x86_64
+
+      - name: Download aarch64 artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-aarch64
+          path: bin/aarch64
+
+      - name: Download mips artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-mips
+          path: bin/mips
+
+      - name: Download powerpc artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-powerpc
+          path: bin/powerpc
+
+      - name: Download riscv64 artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: udpspeeder-riscv64
+          path: bin/riscv64
+
+      - name: Set executable permissions
+        run: chmod +x bin/*/speederv2 bin/*/speederv2_cross
+
+      - name: Run cross-architecture interop tests
+        run: |
+          set -e
+          PASS=0
+          FAIL=0
+
+          X86="bin/x86_64/speederv2"
+          ARM="qemu-aarch64-static bin/aarch64/speederv2_cross"
+          MIPS="qemu-mips-static bin/mips/speederv2_cross"
+          PPC="qemu-ppc-static -cpu e500v2 bin/powerpc/speederv2_cross"
+          RV64="qemu-riscv64-static bin/riscv64/speederv2_cross"
+
+          TESTS=(
+            "x86-server_arm-client|$X86|$ARM"
+            "arm-server_x86-client|$ARM|$X86"
+            "x86-server_mips-client|$X86|$MIPS"
+            "mips-server_x86-client|$MIPS|$X86"
+            "x86-server_ppc-client|$X86|$PPC"
+            "ppc-server_x86-client|$PPC|$X86"
+            "x86-server_rv64-client|$X86|$RV64"
+            "rv64-server_x86-client|$RV64|$X86"
+          )
+
+          CONFIGS=(
+            "--disable-fec|no-fec"
+            "--disable-fec --key testkey123|no-fec-key"
+            "--fec 20:10|fec-20-10"
+            "--fec 20:10 --key testkey123|fec-20-10-key"
+          )
+
+          for entry in "${TESTS[@]}"; do
+            IFS='|' read -r pair_name server_cmd client_cmd <<< "$entry"
+            for cfg in "${CONFIGS[@]}"; do
+              IFS='|' read -r cfg_args cfg_label <<< "$cfg"
+              label="${pair_name}/${cfg_label}"
+
+              echo ""
+              echo "=========================================="
+              echo " TESTING: $label"
+              echo "=========================================="
+
+              if bash bench/interop.sh \
+                --server-cmd "$server_cmd" \
+                --client-cmd "$client_cmd" \
+                $cfg_args \
+                --label "$label" \
+                --packets 200; then
+                PASS=$((PASS + 1))
+              else
+                FAIL=$((FAIL + 1))
+                echo "^^^ FAILED: $label ^^^"
+              fi
+            done
+          done
+
+          echo ""
+          echo "=========================================="
+          echo " INTEROP RESULTS: $PASS passed, $FAIL failed"
+          echo "=========================================="
+
+          if [[ $FAIL -ne 0 ]]; then
+            exit 1
+          fi
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..194d7d6
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,76 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run tests
+        run: make test
+
+  build:
+    needs: test
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - name: x86_64
+            packages: ""
+            target: all
+            make_args: ""
+            src_bin: speederv2
+            release_bin: speederv2_linux_x86_64
+          - name: aarch64
+            packages: g++-aarch64-linux-gnu
+            target: all-cross
+            make_args: "CC=aarch64-linux-gnu-g++"
+            src_bin: speederv2_cross
+            release_bin: speederv2_linux_aarch64
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install toolchain
+        if: matrix.packages != ''
+        run: sudo apt-get update && sudo apt-get install -y ${{ matrix.packages }}
+
+      - name: Build production binary
+        run: make ${{ matrix.target }} ${{ matrix.make_args }}
+
+      - name: Verify binary
+        run: file ${{ matrix.src_bin }}
+
+      - name: Rename binary
+        run: mv ${{ matrix.src_bin }} ${{ matrix.release_bin }}
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.release_bin }}
+          path: ${{ matrix.release_bin }}
+
+  release:
+    needs: build
+    runs-on: ubuntu-latest
+    # Only this job publishes; creating the GitHub release needs write access.
+    permissions:
+      contents: write
+    steps:
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: artifacts/**/*
+          generate_release_notes: true
diff --git a/.github/workflows/throughput.yml b/.github/workflows/throughput.yml
new file mode 100644
index 0000000..3f9a9e9
--- /dev/null
+++ b/.github/workflows/throughput.yml
@@ -0,0 +1,67 @@
+name: Throughput
+
+on:
+  push:
+    branches: [branch_libev, master]
+  pull_request:
+    branches: [branch_libev, master]
+
+permissions:
+  contents: write
+
+jobs:
+  throughput:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout current branch
+        uses: actions/checkout@v4
+
+      - name: Fix script permissions
+        run: chmod +x bench/throughput.sh
+
+      - name: Build current binary
+        run: make all
+
+      - name: Rename current binary
+        run: mv speederv2 speederv2_current
+
+      - name: Checkout baseline
+        uses: actions/checkout@v4
+        with:
+          ref: baseline
+          path: baseline
+
+      - name: Build baseline binary
+        run: make -C baseline all
+
+      - name: Rename baseline binary
+        run: mv baseline/speederv2 speederv2_baseline
+
+      - name: Generate JSON results (current + baseline)
+        run: |
+          cur_nofec=$(./bench/throughput.sh ./speederv2_current --disable-fec --json)
+          cur_fec=$(./bench/throughput.sh ./speederv2_current --fec 20:10 --json)
+          base_nofec=$(./bench/throughput.sh ./speederv2_baseline --disable-fec --json)
+          base_fec=$(./bench/throughput.sh ./speederv2_baseline --fec 20:10 --json)
+
+          # Rename baseline entries to include "baseline/" prefix
+          base_nofec=$(echo "$base_nofec" | sed 's|"throughput/|"baseline/throughput/|')
+          base_fec=$(echo "$base_fec" | sed 's|"throughput/|"baseline/throughput/|')
+
+          printf '[%s, %s, %s, %s]\n' "$cur_nofec" "$cur_fec" "$base_nofec" "$base_fec" > throughput_results.json
+          cat throughput_results.json
+          python3 -c "import json; d=json.load(open('throughput_results.json')); [print(f\" {e['name']}: {e['value']} {e['unit']}\") for e in d]"
+
+      - name: Store throughput results
+        if: github.ref == 'refs/heads/branch_libev'
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: UDPspeeder Throughput
+          tool: customBiggerIsBetter
+          output-file-path: throughput_results.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
+          alert-threshold: '115%'
+          comment-on-alert: true
+          fail-on-alert: false
+          benchmark-data-dir-path: dev/throughput
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dff2fb6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.claude/
+.gitmodules
diff --git a/OPTIMIZATION.md b/OPTIMIZATION.md
new file mode 100644
index 0000000..0e7a918
--- /dev/null
+++ b/OPTIMIZATION.md
@@ -0,0 +1,433 @@
+# UDPspeeder Optimization Results
+
+Benchmarked on Intel Core i5-7300U (Kaby Lake, 2C/4T, SSE4.2 + AVX2).
+Target platforms: Intel N150 (Alder Lake-N), Mediatek Filogic (ARMv8),
+TP-Link TL-WDR4900 (Freescale P1014 e500v2, PowerPC SPE).
+
+## End-to-end throughput (GitHub Actions, 1400B UDP, loopback)
+
+| Config | Baseline | Current | Improvement |
+|---|---|---|---|
+| no-fec | 618-861 Mbps | 942-1517 Mbps | **+48-76%** |
+| fec 20:10 | 364-509 Mbps | 657-1082 Mbps | **+81-113%** |
+
+Ranges reflect CI host variance between runs. Current/baseline measured on
+the same host within each run.
+
+FEC overhead dropped from ~41% of throughput (baseline) to ~30% (current).
+
+## Microbenchmark summary at 1500B
+
+| Path | Before | After | Speedup |
+|---|---|---|---|
+| addmul1 (GF multiply-accumulate) | 665 ns | 58 ns | 11.5x |
+| rs_encode k=10 n=15 | 34,000 ns | 3,100 ns | 11x |
+| rs_decode k=10 n=15 | 36,000 ns | 4,500 ns | 8x |
+| do_cook (CRC32C + obscure + XOR) | 3,000 ns | 713 ns | 4.2x |
+| cook XOR encryption only | 1,826 ns | 131 ns | 14x |
+
+## Commits
+
+### 1. Benchmark and test harness
+
+Nanobench-based microbenchmarks and correctness tests for FEC, CRC32,
+and packet cooking. Enables data-driven optimization and CI regression
+detection.
+
+### 2. CRC32C (Castagnoli) replacing CRC32 (zlib)
+
+Switched packet checksum from software CRC32 to CRC32C with hardware
+acceleration via SSE4.2 (`_mm_crc32_u64`) and ARMv8-CRC (`__crc32cb`).
+Software fallback for CPUs without hardware support. + +### 3. SSSE3 and NEON addmul1 vectorization + +The GF(2^8) multiply-accumulate (`addmul1`) is the inner loop of Reed-Solomon +encode and decode. The scalar implementation uses a 64KB lookup table — one +byte at a time. + +SSSE3/NEON use nibble decomposition: split each input byte into low/high +nibbles, use `PSHUFB`/`TBL` as a 16-entry parallel lookup, XOR the results. +Processes 16 bytes per iteration. Scalar tail for remainder. + +### 4. AVX2 addmul1 with runtime CPUID dispatch + +Same nibble-decomposition approach widened to 256-bit registers. `VPSHUFB` +operates on two independent 128-bit lanes, so the 16-byte lookup table is +duplicated into both lanes via `_mm256_broadcastsi128_si256`. + +Runtime dispatch: 3-phase CPUID check (OSXSAVE + XCR0 AVX state + leaf 7 +bit 5). Function pointer `addmul1_x86_fn` resolved in `init_fec()`. +SSE tail handles the 16-31 byte remainder. + +### 5. Packet cook refactor into context struct + +Extracted the cook pipeline (CRC32C + XOR obfuscation + XOR encryption) +from `packet.cpp` into self-contained `packet_cook.cpp` with a `cook_ctx_t` +context struct. Eliminates 6 global variables. No dependency on common.h, +libev, or networking code — enables benchmarking and testing in isolation. + +### 6. SSE2 and NEON XOR vectorization for cook pipeline + +The cook XOR loops (key encryption and IV obfuscation) were byte-at-a-time +with data-dependent branches (`if (key[j] == 0) j = 0`). Compilers cannot +auto-vectorize these. + +Pre-expand the repeating key/IV pattern into a tile whose length is +`lcm(pattern_len, 16)`, then XOR 16 bytes at a time with SSE2 or NEON. +Key tile is computed once at startup (`cook_ctx_prepare_key`), IV tile is +built per-packet on the stack (4-32 bytes, tile at most 496 bytes). + +### 7. Eliminate per-call malloc in fec_decode + +`fec_decode` performed 7+ malloc/free pairs per call. 
Replaced with: +- `invert_mat`: 5 heap buffers replaced with stack VLAs (max 3.6 KB) +- `build_decode_matrix`: k*k matrix pre-allocated in `fec_parms` struct +- `fec_decode`: per-row data buffers replaced with contiguous scratch + in `fec_parms`, lazily grown on first use + +Eliminates allocation jitter on the real-time decode path. + +### 8. io_uring multishot receive with provided buffer rings + +Replaced per-packet `recvfrom()` / `recv()` syscalls with io_uring +multishot receive using kernel-managed provided buffer rings. The kernel +fills pre-registered buffers and posts completions to a shared ring — +userspace drains batches of completions without syscalls per packet. + +Key implementation details: +- **Multishot recvmsg** for unconnected sockets (server local, client local) + with `IORING_OP_RECVMSG` + `IORING_RECV_MULTISHOT` + `IOSQE_BUFFER_SELECT` +- **Multishot recv** for connected sockets (server remote, client remote) +- **Provided buffer ring** (`IORING_REGISTER_PBUF_RING`): 256 buffers, + power-of-2 ring, kernel picks buffers without userspace involvement +- **Batched CQ drain**: single `acquire` load on CQ tail, process all ready + CQEs, single `release` store to advance CQ head +- **Batched buffer recycling**: deferred ring entries with single atomic + tail commit per batch +- **Combined submit+flush**: `io_uring_enter(IORING_ENTER_SQ_WAKEUP | + IORING_ENTER_GETEVENTS)` — one syscall for SQE submission and CQE + materialization +- **Zero-copy paths**: all four socket paths process directly from provided + buffers. recvmsg paths (unconnected) have 140+ bytes natural headroom. + recv paths (connected) use `URING_RECV_HEADROOM` (4 bytes) reserved before + each buffer for in-place conv header insertion, eliminating per-packet memcpy. 
+- **COOP_TASKRUN + SINGLE_ISSUER** flags with fallback for older kernels +- **CQ ring sized 4x buffer count** to avoid multishot stalls +- Graceful fallback to `recvfrom()` on older kernels or non-Linux + +Bugs fixed during development: +- `io_uring_recvmsg_out` payload offset must use template `msg_namelen` + (128 bytes for `sockaddr_storage`), not `hdr->namelen` (actual, e.g. 16) +- CQ tail read requires `acquire` barrier (correctness on weakly ordered ARM targets) +- Ring fd notification gap after CQ drain: explicit `IORING_ENTER_GETEVENTS` + flush needed to materialize deferred completions + +GitHub Actions throughput (no-fec, 1400B UDP, loopback): + +| Path | Median Mbps | Runs | +|---|---|---| +| io_uring multishot | 798.5 | 784.8, 798.5, 837.8 | +| recvfrom baseline | 629.9 | 627.6, 629.9, 654.0 | +| **Improvement** | **+27%** | | + +### 9. sendmmsg batching for FEC output + +Replaced per-packet `sendto()` calls after FEC encoding with a single +`sendmmsg()` call per batch. When the delay manager detects all output +delays are zero (the common case), it routes through `my_send_batch()` +which cooks all packets then issues one `sendmmsg()` for the entire batch. + +Typically 20-30 packets per FEC batch → 20-30 syscalls reduced to 1. + +### 10. Flat array replacing std::map in FEC decode + +Replaced `std::map` shard index in `fec_decode_manager_t` +with a flat pre-allocated array indexed directly by shard position. Eliminates +per-shard tree traversal (O(log n) → O(1)) and per-node heap allocation. + +### 11. Zero-copy io_uring recv for conv header + +The conv header (4 bytes) was previously inserted via `memmove` after receive. +Reserved `URING_RECV_HEADROOM` (4 bytes) before each provided buffer at +registration time. Recv paths now write the conv header directly into the +headroom, avoiding any per-packet memcpy/memmove. + +### 12. 
Anti-replay direct-mapped table + +Replaced `unordered_map` + `u64_t[30000]` ring buffer (~2 MB +scattered) with a `u32_t[32768]` direct-mapped table (128 KB contiguous). + +Design: `table[seq & (SIZE-1)] = seq` to mark seen, `table[seq & (SIZE-1)] != seq` +to check validity. Power-of-2 size for bitwise modulo. Old entries naturally +evicted when new seqs map to the same slot. Effective replay window ~32K groups, +comparable to the old 30K ring buffer. No timeout logic needed. + +Per-shard cost: single array access + compare vs hash computation + pointer chase. + +### 13. Flat group table in FEC decode with bitmap shard tracking + +Replaced `unordered_map` with a pre-allocated direct-mapped +array sized `next_pow2(fec_buff_num * 2)`. Safe because FEC sequence numbers +are monotonically increasing — consecutive seqs map to distinct slots, so no +two concurrent groups collide when table size exceeds max concurrent groups. + +Shard tracking uses a 32-byte bitmap (`u32_t[8]`, 256 bits) instead of +`memset(shard_idx, -1, 1024)`. Only the bitmap is cleared on group creation +(32 bytes vs 1024 bytes). The `shard_idx[]` array is only accessed through +`has_shard()`/`set_shard()` which check the bitmap first. + +Eliminates ~50K malloc/free pairs per second at line rate (one allocation per +FEC group for the map node containing the ~1 KB `fec_group_t`). + +### 14. Final hot-path micro-optimizations + +Replaced per-struct `memset(&msgs[i], 0, sizeof(msgs[i]))` (~64 bytes) +in the sendmmsg loop with targeted field writes (6 fields, ~48 bytes +skipped per struct × 20-30 packets per FEC batch). Merged two separate +loops in type==1 FEC decode (pre-init + populate) into one. Added early +`pad > 0` check to skip shard padding memset when the shard is already +at max_len. + +Each individually below CI noise floor. Combined: 1-3%. + +### 15. 
PowerPC e500v2 SPE XOR for cook pipeline + +Added SPE (Signal Processing Extension) assembly for the XOR cook stage on +PowerPC e500v2 (Freescale P1014, used in TP-Link TL-WDR4900 running OpenWrt). + +The e500v2 has no AltiVec/VMX. SPE provides 64-bit operations: `evldd`/`evstdd` +(8-byte aligned load/store) and `evxor` (64-bit XOR). GCC 9+ removed SPE +intrinsics, so this is standalone assembly (`xor_spe.S`) following the Linux +kernel pattern (`arch/powerpc/crypto/aes-spe-core.S`). + +Implementation details: +- **4x unrolled main loop**: 32 bytes/iteration with `evldd`/`evxor`/`evstdd` +- **Alignment handling**: scalar head loop until data is 8-byte aligned, + 8-byte tail loop, then byte tail for remainder +- **Tile wrap**: offset tracking with compare-and-reset per doubleword +- **Build guard**: `HAVE_PPC_SPE` define, set via `make SPE=1` +- **Word-width generic fallback**: non-SPE generic platforms (MIPS, RISC-V, + ARMv7) now use `sizeof(unsigned long)` XOR instead of byte-at-a-time + +PPC assembly gotchas fixed during development: +- OpenWrt binutils 2.44 requires `%r` register prefix (`%r0`, `%r3`); bare + `r0` is treated as a symbol reference ("unsupported relocation" errors) +- PPC r0-as-zero: `addi rD, r0, imm` treats r0 as literal 0, not the + register. Fixed by using `addic` (no r0 special case, but clobbers XER[CA]) +- `evldd` reads 8 bytes at tile+offset; after unaligned head loop, offset + can be 1-7, straddling tile boundary. Fixed with `COOK_VEC_WIDTH` padding + bytes at end of tile buffers + +SPE only helps the XOR stage of cook. It cannot help `addmul1` (requires +byte-level shuffle, absent on SPE) or CRC32C (no hardware CRC on e500v2). 
+ +PowerPC e500v2 microbenchmarks (QEMU, GitHub Actions): + +| Path | Baseline | Current | Speedup | +|---|---|---|---| +| crc32c/1500B (sw slicing-by-8) | 2,609 ns | 1,804 ns | **1.4x** | +| rs_encode k=10 n=15 | 182,166 ns | 123,385 ns | **1.5x** | +| rs_decode k=10 n=15 | 169,905 ns | 133,963 ns | **1.3x** | +| addmul1/1500B (scalar) | 2,453 ns | 2,447 ns | 1.0x | + +RS encode/decode improvement is from pre-allocated decode buffers (#7), not +SPE. CRC32C improvement is from switching CRC32-zlib to CRC32C-Castagnoli +(software slicing-by-8 table, #2). addmul1 is identical (both scalar). + +Cook pipeline numbers (current only, no baseline cook tests): + +| Path | PPC (QEMU) ns | +|---|---| +| do_cook/1500B | 4,090 | +| de_cook/1500B | 3,971 | +| cook_xor_only/1500B | 1,091 | +| cook_obscure_only/1500B | 1,415 | +| cook_crc32_only/1500B | 1,983 | + +Files: `xor_spe.S` (new), `packet_cook.cpp`, `makefile`, `.github/workflows/ci.yml` + +## Analysis and diminishing returns + +After 15 optimizations, the codebase is within 10% of the theoretical +floor (see below). Remaining overhead is irreducible: + +**Syscall overhead**: Eliminated. io_uring multishot recv batches receives +without per-packet syscalls. sendmmsg batches sends. + +**Compute hotspots**: SIMD-vectorized. GF(2^8) multiply-accumulate uses +AVX2/SSSE3/NEON. CRC32C uses hardware instructions. XOR cook uses +SSE2/NEON/SPE (PPC e500v2). Word-width fallback for generic platforms. + +**Allocation overhead**: Eliminated from hot paths. FEC decode uses pre-allocated +buffers, flat arrays, and direct-mapped tables. No `malloc`/`free` per packet +or per group. + +**Memory copies**: Two per-packet memcpy remain and are architecturally necessary: +1. `blob_encode_t::input()` — packets must be packed contiguously before RS can + slice them into equal-length shards. Shard boundaries aren't known until the + batch is complete (depends on total data size and optimal data_num selection). +2. 
`fec_decode_manager_t::input()` — received shards must be copied into owned + buffers because RS decode modifies data in-place. + +Each copies ~1400 bytes per packet. At 1.5M packets/sec, that's ~4 GB/sec of +memcpy bandwidth — real but fundamental to the FEC architecture. + +## Theoretical FEC overhead floor + +For a given FEC config k:r (k data shards, r redundant, n=k+r total), the +minimum per-packet overhead has three irreducible components: + +### 1. Wire amplification + +Every k application packets produce n=k+r packets on the wire. Goodput +cannot exceed k/n of wire capacity regardless of CPU speed. + +| Config | k | r | n | Wire overhead | +|---|---|---|---|---| +| fec 20:10 | 20 | 10 | 30 | **33% lost** (goodput ≤ 67% of no-fec) | +| fec 10:5 | 10 | 5 | 15 | **33% lost** | +| fec 5:3 | 5 | 3 | 8 | **38% lost** (goodput ≤ 63%) | + +This is information-theoretic: you must transmit r/k extra data. + +### 2. RS encode compute (per application packet) + +Encode generates r parity shards. Each parity shard requires k addmul1 +calls over shard_len bytes (`lib/fec.cpp:940-944`). Per batch of k packets: + + total_addmul1 = r × k calls at shard_len ≈ 1400 bytes + per_app_packet = r addmul1(shard_len) calls + +| Config | addmul1 per pkt | ns/pkt (x86 AVX2) | ns/pkt (PPC scalar) | +|---|---|---|---| +| fec 20:10 | 10 | 10 × 38 = **380** | 10 × 2447 = **24,470** | +| fec 10:5 | 5 | 5 × 38 = **190** | 5 × 2447 = **12,235** | +| fec 5:3 | 3 | 3 × 38 = **114** | 3 × 2447 = **7,341** | + +These are the pure addmul1 cost; each call also includes a bzero of the +shard buffer (first iteration). + +### 3. Cook amplification + +Every shard (data + parity) is cooked before send and de-cooked after +receive. Per application packet: n/k cook + n/k de_cook calls. 
+ +| Config | cook+de_cook/pkt | ns/pkt (x86 AVX2) | +|---|---|---| +| fec 20:10 | 1.5 × (351 + 230) = **872** | (no-fec: 351 + 230 = 581) | +| fec 10:5 | 1.5 × (351 + 230) = **872** | | +| fec 5:3 | 1.6 × (351 + 230) = **929** | | + +### Combined floor (x86_64 AVX2, fec 20:10, no loss) + +| Component | Per-app-pkt (ns) | Notes | +|---|---|---| +| RS encode | 380 | 10 × addmul1(1400B) | +| Cook amplification | +291 | 0.5 extra cook+de_cook | +| memcpy (blob input) | ~35 | 1400B at ~40 GB/s L1 | +| memcpy (decode input) | ~53 | 1.5 × 1400B | +| bzero (parity init) | ~18 | 0.5 × 1400B | +| **Total overhead** | **~777** | on top of no-fec cost | + +No-fec per-packet cost: ~581 ns (cook + de_cook). +FEC 20:10 per-packet cost: ~1358 ns (581 + 777). +**Minimum FEC throughput ratio: 581 / 1358 = 43% of no-fec** (compute-bound). + +But wire amplification caps at 67% of no-fec, which is less restrictive. +At low throughput (CPU-bound), the compute floor dominates. At high +throughput (bandwidth-bound), the wire floor dominates. + +CI measured ~70% of no-fec (30% overhead), better than the compute floor +predicts. This is because the throughput test is bandwidth-limited on +loopback before hitting CPU saturation — the wire amplification floor +(67%) is the binding constraint, and measured overhead (30%) is close to +the theoretical 33%. + +### Decode worst case + +When r data shards are lost, decode reconstructs each via k addmul1 calls +(`lib/fec.cpp:1060-1065`). Per batch: r × k addmul1 = same as encode. +Per app packet: r addmul1(shard_len) — identical to encode cost. + +Worst-case round-trip (all r lost): encode + decode = 2r addmul1 per +app packet = 760 ns/pkt on x86 AVX2 for fec 20:10. + +### Implication + +The current ~30% FEC overhead on CI loopback is within 10% of the +information-theoretic floor (33% wire amplification). No further software +optimization can meaningfully close this gap. 
On real networks with actual +packet loss, the wire amplification is the cost of redundancy by design. + +## Not done (deliberately) + +**Scatter-gather RS encoder to eliminate blob_encode memcpy**: The +`blob_encode_t::input()` memcpy (~1400B per packet) packs variable-length +application packets into a contiguous buffer with interleaved 2-byte length +headers, then slices the result into k equal-length shards for RS encode. +This copy exists because shard boundaries depend on total batch size, which +isn't known until the last packet arrives. Eliminating it would require the +RS encoder to accept a scatter-gather (iovec-style) input instead of flat +`char *data[k]` pointers. That means rewriting `fec_encode`'s inner loop +(`lib/fec.cpp:940-945`) and `addmul1` to iterate over discontiguous chunks, +adding branch overhead per chunk boundary inside the tightest SIMD loop in +the system. The alternative — pre-positioning packets into a shard grid as +they arrive — fails because the grid layout depends on the final batch size. +Net effect: replaces a 1400B L1-resident memcpy (~35 ns) with scatter-gather +bookkeeping of comparable cost, while adding complexity to the FEC core. + +**Zero-copy RS decode to eliminate fec_data memcpy**: The +`fec_decode_manager_t::input()` memcpy (~1400B per shard) copies received +shards into owned `fec_data[].buf` buffers because `fec_decode` modifies +data in-place — it overwrites redundancy shard buffers with recovered data +(`lib/fec.cpp:1061-1067`), then copies results back (`lib/fec.cpp:1072-1075`). +Pointing RS decode directly at io_uring provided buffers would corrupt the +kernel buffer ring (buffers must be recycled promptly or ring starvation +occurs). For the recvfrom path, the receive buffer is stack-local and reused +per callback. Making `fec_decode` write to separate output buffers instead +of in-place would eliminate the input copy but add an identical output copy +(the recovered data must still go somewhere). 
Net: zero gain, additional +complexity in the 1997-era Vandermonde matrix math. + +**Auto-vectorization of scalar GF(2^8) fallback**: The scalar `addmul1` +uses a 64KB lookup table indexed by runtime byte values. No compiler can +auto-vectorize arbitrary table lookups — the SSSE3/NEON `PSHUFB`/`TBL` +approach requires algebraic insight (nibble decomposition of GF multiplication) +that is beyond compiler analysis. Documented in `lib/fec.cpp`. + +**-O3**: Tested, no measurable improvement. All hot paths are hand-written +SIMD intrinsics or hardware CRC32C — the compiler's extra optimization +passes have nothing to improve. + +**Alignment audit**: Unaligned SIMD loads/stores used throughout (correct +for arbitrary buffer pointers). On both target architectures, unaligned +accesses that don't cross cache line boundaries are free. Estimated +impact of forced alignment: <1%. + +## Cross-architecture notes + +**x86_64** (N150, CI runners): Full SIMD coverage. AVX2 addmul1, SSE4.2 +CRC32C, SSE2/AVX2 XOR cook. io_uring multishot recv, sendmmsg batching. +All optimizations apply. + +**ARMv8/AArch64** (Mediatek Filogic): NEON addmul1 (TBL), ARMv8-CRC +CRC32C, NEON XOR cook. All three compute paths are vectorized. io_uring +available if kernel 6.0+. Cross-compiled and QEMU-tested in CI; untested +on real Filogic hardware. + +**PowerPC e500v2** (TL-WDR4900): SPE XOR only. addmul1 is scalar +(SPE has no byte-level shuffle/permute equivalent to PSHUFB/TBL). +CRC32C is software slicing-by-8 (no hardware CRC). No io_uring +(older kernel). Expected real-hardware throughput: 50-150 Mbps no-fec, +15-40 Mbps fec-20:10, limited by scalar addmul1. + +**MIPS 24Kc** (AR71xx OpenWrt targets): No useful SIMD. MIPS SIMD +Architecture (MSA) is only on MIPS32r5+ (P5600, I6400), not 24Kc. +All paths would be scalar. Build targets exist in makefile but are +untested with current optimization work. + +**RISC-V RV64GCV**: Hypothetical future target. 
The V extension has +`vrgather` which can implement GF(2^8) nibble-decomposition lookup +(equivalent to PSHUFB/TBL), potentially vectorizing addmul1. This +is the only other ISA besides x86/ARM that could accelerate FEC. diff --git a/bench/bench_common.h b/bench/bench_common.h new file mode 100644 index 0000000..5746227 --- /dev/null +++ b/bench/bench_common.h @@ -0,0 +1,34 @@ +#ifndef BENCH_COMMON_H +#define BENCH_COMMON_H + +#include + +/* gf type matches lib/fec.cpp for GF_BITS=8 */ +typedef unsigned char gf; + +/* Exposed by lib/fec.cpp when compiled with -DBENCH_EXPOSE_INTERNALS */ +extern "C++" void bench_addmul1(gf *dst, gf *src, gf c, int sz); + +/* Exposed by packet_cook.cpp when compiled with -DBENCH_EXPOSE_INTERNALS */ +extern "C++" void bench_xor_tile(char *data, int len, const char *tile, int tile_len); +extern "C++" int bench_cook_vec_width(); +extern "C++" const char *bench_xor_tile_impl(); + +/* Exposed by lib/fec.cpp when compiled with -DBENCH_EXPOSE_INTERNALS */ +extern "C++" const char *bench_addmul1_impl(); + +/* Packet sizes (bytes) representative of real traffic; every per-size benchmark iterates over this table */ +static const size_t bench_sizes[] = { 64, 256, 1024, 1500 }; +static const int bench_sizes_count = sizeof(bench_sizes) / sizeof(bench_sizes[0]); + +/* Registration functions called from bench_main.cpp; bench_ptr is the address of its ankerl::nanobench::Bench, passed as void* */ +void register_fec_benchmarks(void *bench_ptr); +void register_crc32_benchmarks(void *bench_ptr); +void register_packet_benchmarks(void *bench_ptr); + +/* Test-suite entry points called from test_main.cpp (these run tests; they do not register benchmarks) */ +int run_fec_tests(); +int run_crc32_tests(); +int run_packet_tests(); + +#endif diff --git a/bench/bench_crc32.cpp b/bench/bench_crc32.cpp new file mode 100644 index 0000000..2a307b2 --- /dev/null +++ b/bench/bench_crc32.cpp @@ -0,0 +1,61 @@ +#include "nanobench.h" +#include "bench_common.h" +#include "crc32c.h" +#include "crc32/Crc32.h" +#include +#include + +void register_crc32_benchmarks(void *bench_ptr) { + auto &bench = *static_cast(bench_ptr); + + /* Fill a buffer with
pseudo-random data */ + static char buf[1500]; + for (int i = 0; i < 1500; i++) + buf[i] = (char)(rand() & 0xFF); + + /* --- Old CRC32 (zlib polynomial) baseline --- */ + for (int i = 0; i < bench_sizes_count; i++) { + size_t sz = bench_sizes[i]; + std::string name = "crc32_old/" + std::to_string(sz) + "B"; + + bench.run(name, [sz]() { + auto r = crc32_fast(buf, sz); + ankerl::nanobench::doNotOptimizeAway(r); + }); + } + + /* --- CRC32C software --- */ + for (int i = 0; i < bench_sizes_count; i++) { + size_t sz = bench_sizes[i]; + std::string name = "crc32c_sw/" + std::to_string(sz) + "B"; + + bench.run(name, [sz]() { + auto r = crc32c_sw(buf, sz); + ankerl::nanobench::doNotOptimizeAway(r); + }); + } + + /* --- CRC32C hardware (skipped entirely when crc32c_has_hw() reports no support, so no crc32c_hw/* results are emitted) --- */ + if (crc32c_has_hw()) { + for (int i = 0; i < bench_sizes_count; i++) { + size_t sz = bench_sizes[i]; + std::string name = "crc32c_hw/" + std::to_string(sz) + "B"; + + bench.run(name, [sz]() { + auto r = crc32c_hw(buf, sz); + ankerl::nanobench::doNotOptimizeAway(r); + }); + } + } + + /* --- CRC32C dispatched (production path) --- */ + for (int i = 0; i < bench_sizes_count; i++) { + size_t sz = bench_sizes[i]; + std::string name = "crc32c/" + std::to_string(sz) + "B"; + + bench.run(name, [sz]() { + auto r = crc32c(buf, sz); + ankerl::nanobench::doNotOptimizeAway(r); + }); + } +} diff --git a/bench/bench_fec.cpp b/bench/bench_fec.cpp new file mode 100644 index 0000000..6316956 --- /dev/null +++ b/bench/bench_fec.cpp @@ -0,0 +1,93 @@ +#include "nanobench.h" +#include "bench_common.h" +#include "lib/rs.h" +#include +#include +#include + +static void fill_random(char *buf, int sz) { + for (int i = 0; i < sz; i++) + buf[i] = (char)(rand() & 0xFF); +} + +void register_fec_benchmarks(void *bench_ptr) { + auto &bench = *static_cast(bench_ptr); + + /* GF tables are initialized inside fec_new; force init via a dummy allocation */ + { void *d = fec_new(2, 3); fec_free(d); } + + /* --- addmul1 microbenchmarks
--- */ + for (int i = 0; i < bench_sizes_count; i++) { + int sz = (int)bench_sizes[i]; + std::string name = "addmul1/" + std::to_string(sz) + "B"; + static gf dst[1500], src[1500]; /* shared by all sizes; dst accumulates across iterations */ + if (i == 0) fill_random((char *)src, 1500); /* random source like the RS benchmarks below; an all-zero source is not representative input */ + bench.run(name, [sz]() { + bench_addmul1(dst, src, 0x53, sz); + ankerl::nanobench::doNotOptimizeAway(dst[0]); + }); + } + + /* --- rs_encode2 --- */ + struct { int k; int n; const char *label; } encode_configs[] = { + {5, 8, "5/8"}, {10, 15, "10/15"} + }; + + for (auto &cfg : encode_configs) { + std::string name = std::string("rs_encode/k") + cfg.label + "/1500B"; + int k = cfg.k, n = cfg.n; + + /* Pre-allocate outside the timed loop */ + char **data = (char **)calloc(n, sizeof(char *)); + for (int j = 0; j < n; j++) { + data[j] = (char *)calloc(1, 1500); + } + for (int j = 0; j < k; j++) + fill_random(data[j], 1500); + + bench.run(name, [k, n, data]() { + rs_encode2(k, n, data, 1500); + ankerl::nanobench::doNotOptimizeAway(data[k][0]); + }); + + for (int j = 0; j < n; j++) free(data[j]); + free(data); + } + + /* --- rs_decode2 --- */ + for (auto &cfg : encode_configs) { + std::string name = std::string("rs_decode/k") + cfg.label + "/1500B"; + int k = cfg.k, n = cfg.n; + int redundant = n - k; + + /* Prepare encoded data once */ + char **orig = (char **)calloc(n, sizeof(char *)); + for (int j = 0; j < n; j++) + orig[j] = (char *)calloc(1, 1500); + for (int j = 0; j < k; j++) + fill_random(orig[j], 1500); + rs_encode2(k, n, orig, 1500); + + /* Working copy for each decode iteration */ + char **data = (char **)calloc(n, sizeof(char *)); + char **bufs = (char **)calloc(n, sizeof(char *)); + for (int j = 0; j < n; j++) + bufs[j] = (char *)calloc(1, 1500); + + bench.run(name, [k, n, redundant, orig, data, bufs]() { + /* Reset working copy from originals */ + for (int j = 0; j < n; j++) + memcpy(bufs[j], orig[j], 1500); + + /* Simulate losing the first 'redundant' data packets */ + for (int j = 0; j < n; j++) + data[j] = (j < redundant) ?
NULL : bufs[j]; + + rs_decode2(k, n, data, 1500); + ankerl::nanobench::doNotOptimizeAway(data[0][0]); + }); + + for (int j = 0; j < n; j++) { free(orig[j]); free(bufs[j]); } + free(orig); free(data); free(bufs); + } +} diff --git a/bench/bench_main.cpp b/bench/bench_main.cpp new file mode 100644 index 0000000..e8b8bc9 --- /dev/null +++ b/bench/bench_main.cpp @@ -0,0 +1,63 @@ +#define ANKERL_NANOBENCH_IMPLEMENT +#include "nanobench.h" +#include "bench_common.h" +#include "lib/rs.h" +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + bool json_output = false; + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--json") == 0) + json_output = true; + } + + /* Force FEC init so addmul1 dispatch is resolved */ + { void *d = fec_new(2, 3); fec_free(d); } + + printf("SIMD: addmul1=%s xor_cook=%s vec_width=%d\n", + bench_addmul1_impl(), bench_xor_tile_impl(), bench_cook_vec_width()); + + ankerl::nanobench::Bench bench; + bench.title("UDPspeeder").warmup(3).epochs(21).relative(false); + + register_fec_benchmarks(&bench); + register_crc32_benchmarks(&bench); + register_packet_benchmarks(&bench); + + /* Emit stability warnings for noisy benchmarks */ + { + auto results = bench.results(); + for (size_t i = 0; i < results.size(); i++) { + double mdape = results[i].medianAbsolutePercentError( + ankerl::nanobench::Result::Measure::elapsed); + if (mdape > 0.05) { + fprintf(stderr, "WARNING: %s has MdAPE %.1f%% (>5%%)\n", + results[i].config().mBenchmarkName.c_str(), mdape * 100.0); + } + } + } + + if (json_output) { + /* github-action-benchmark customSmallerIsBetter format + * Mustache templates can't do math, so we extract results manually */ + std::ofstream out("bench_results.json"); + auto results = bench.results(); + out << "[\n"; + for (size_t i = 0; i < results.size(); i++) { + double ns = results[i].median(ankerl::nanobench::Result::Measure::elapsed) * 1e9; + out << " {\n" + << " \"name\": \"" << results[i].config().mBenchmarkName << "\",\n" 
+ << " \"unit\": \"ns/op\",\n" + << " \"value\": " << ns << "\n" + << " }"; + if (i + 1 < results.size()) out << ","; + out << "\n"; + } + out << "]\n"; + } + + return 0; +} diff --git a/bench/bench_packet.cpp b/bench/bench_packet.cpp new file mode 100644 index 0000000..27a1809 --- /dev/null +++ b/bench/bench_packet.cpp @@ -0,0 +1,106 @@ +#include "nanobench.h" +#include "bench_common.h" +#include "packet_cook.h" +#include +#include +#include + +/* Stubs for packet_cook.cpp dependencies — production uses common.cpp */ +void get_fake_random_chars(char *s, int len) { + for (int i = 0; i < len; i++) + s[i] = (char)(rand() & 0xFF); +} + +int random_between(unsigned int a, unsigned int b) { + if (a == b) return (int)a; + return (int)(a + (unsigned int)rand() % (b + 1 - a)); +} + +static cook_ctx_t make_ctx(int checksum, int obscure, int xor_enc) { + cook_ctx_t ctx = {}; + strcpy(ctx.key, "benchmarkkey1234"); + cook_ctx_prepare_key(&ctx); + ctx.iv_min = 16; + ctx.iv_max = 16; + ctx.disable_checksum = !checksum; + ctx.disable_obscure = !obscure; + ctx.disable_xor = !xor_enc; + return ctx; +} + +void register_packet_benchmarks(void *bench_ptr) { + auto &bench = *static_cast(bench_ptr); + + /* Full pipeline: do_cook at all sizes */ + for (int i = 0; i < bench_sizes_count; i++) { + int sz = (int)bench_sizes[i]; + std::string name = "do_cook/" + std::to_string(sz) + "B"; + cook_ctx_t ctx = make_ctx(1, 1, 1); + + bench.run(name, [sz, &ctx]() { + static char buf[4096]; + memset(buf, 0xAB, sz); + int len = sz; + do_cook(&ctx, buf, len); + ankerl::nanobench::doNotOptimizeAway(buf[0]); + }); + } + + /* Full pipeline: de_cook at all sizes */ + for (int i = 0; i < bench_sizes_count; i++) { + int sz = (int)bench_sizes[i]; + std::string name = "de_cook/" + std::to_string(sz) + "B"; + cook_ctx_t ctx = make_ctx(1, 1, 1); + + /* Prepare a cooked buffer */ + static char cooked[4096]; + memset(cooked, 0xAB, sz); + int cooked_len = sz; + do_cook(&ctx, cooked, cooked_len); + int saved_len = 
cooked_len; + + bench.run(name, [saved_len, &ctx]() { + static char buf[4096]; + memcpy(buf, cooked, saved_len); + int len = saved_len; + de_cook(&ctx, buf, len); + ankerl::nanobench::doNotOptimizeAway(buf[0]); + }); + } + + /* Component: checksum only at 1500B */ + { + cook_ctx_t ctx = make_ctx(1, 0, 0); + bench.run("cook_crc32_only/1500B", [&ctx]() { + static char buf[4096]; + memset(buf, 0xAB, 1500); + int len = 1500; + do_cook(&ctx, buf, len); + ankerl::nanobench::doNotOptimizeAway(buf[0]); + }); + } + + /* Component: obscure only at 1500B */ + { + cook_ctx_t ctx = make_ctx(0, 1, 0); + bench.run("cook_obscure_only/1500B", [&ctx]() { + static char buf[4096]; + memset(buf, 0xAB, 1500); + int len = 1500; + do_cook(&ctx, buf, len); + ankerl::nanobench::doNotOptimizeAway(buf[0]); + }); + } + + /* Component: xor only at 1500B */ + { + cook_ctx_t ctx = make_ctx(0, 0, 1); + bench.run("cook_xor_only/1500B", [&ctx]() { + static char buf[4096]; + memset(buf, 0xAB, 1500); + int len = 1500; + do_cook(&ctx, buf, len); + ankerl::nanobench::doNotOptimizeAway(buf[0]); + }); + } +} diff --git a/bench/interop.sh b/bench/interop.sh new file mode 100755 index 0000000..803d114 --- /dev/null +++ b/bench/interop.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# bench/interop.sh — Cross-architecture interop test +# +# Runs a UDPspeeder tunnel between two (possibly different-arch) binaries +# and verifies data integrity. Both binaries can be prefixed with QEMU. 
+# +# Usage: ./bench/interop.sh --server-cmd CMD --client-cmd CMD [options] +# --server-cmd CMD Command to run server (may include QEMU prefix) +# --client-cmd CMD Command to run client (may include QEMU prefix) +# --fec X:Y FEC parameter (default: disabled) +# --disable-fec Explicitly disable FEC (default) +# --key KEY Encryption key +# --packets N Number of packets to send (default: 200) +# --label LABEL Label for output (default: "interop") + +set -euo pipefail + +SERVER_CMD="" +CLIENT_CMD="" +FEC_ARGS="--disable-fec" +KEY_ARGS="" +PACKETS=200 +LABEL="interop" +LOG_LEVEL=4 + +while [[ $# -gt 0 ]]; do + case "$1" in + --server-cmd) SERVER_CMD="$2"; shift 2 ;; + --client-cmd) CLIENT_CMD="$2"; shift 2 ;; + --fec) FEC_ARGS="-f $2"; shift 2 ;; + --disable-fec) FEC_ARGS="--disable-fec"; shift ;; + --key) KEY_ARGS="-k $2"; shift 2 ;; + --packets) PACKETS="$2"; shift 2 ;; + --label) LABEL="$2"; shift 2 ;; + --log-level) LOG_LEVEL="$2"; shift 2 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$SERVER_CMD" || -z "$CLIENT_CMD" ]]; then + echo "Error: --server-cmd and --client-cmd are required" >&2 + exit 1 +fi + +PORT_TUNNEL=20010 +PORT_APP=20011 +PORT_CLIENT=20012 + +RECV_RESULT=$(mktemp) +SERVER_LOG=$(mktemp) +CLIENT_LOG=$(mktemp) + +cleanup() { + local pids + pids=$(jobs -p 2>/dev/null) || true + if [[ -n "$pids" ]]; then + kill $pids 2>/dev/null || true + wait $pids 2>/dev/null || true + fi + rm -f "$RECV_RESULT" "$SERVER_LOG" "$CLIENT_LOG" +} +trap cleanup EXIT + +dump_logs() { + echo " --- SERVER LOG (last 80 lines) ---" >&2 + tail -80 "$SERVER_LOG" >&2 2>/dev/null || true + echo " --- CLIENT LOG (last 80 lines) ---" >&2 + tail -80 "$CLIENT_LOG" >&2 2>/dev/null || true + echo " --- END LOGS ---" >&2 +} + +# Receiver: validate each packet's content +# Packet format: 4-byte big-endian seq + 1396 bytes of (seq & 0xFF) +python3 -c " +import socket, struct, sys + +sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 
+sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +sock.bind(('127.0.0.1', $PORT_APP)) +sock.settimeout(10) + +valid = 0 +invalid = 0 + +try: + while True: + data = sock.recv(65535) + sock.settimeout(3) # shorter timeout after first packet + if len(data) < 4: + invalid += 1 + continue + seq = struct.unpack('>I', data[:4])[0] + fill = seq & 0xFF + expected = data[:4] + bytes([fill]) * (len(data) - 4) + if data == expected: + valid += 1 + else: + invalid += 1 + sys.stderr.write('CORRUPT seq=%d len=%d\n' % (seq, len(data))) + sys.stderr.flush() +except socket.timeout: + pass + +print('%d %d' % (valid, invalid)) +" > "$RECV_RESULT" 2>&1 & +RECV_PID=$! + +# Start tunnel (io_uring disabled — QEMU can't translate those syscalls) +UDPSPEEDER_NO_URING=1 $SERVER_CMD \ + -s -l 127.0.0.1:$PORT_TUNNEL -r 127.0.0.1:$PORT_APP \ + $FEC_ARGS $KEY_ARGS --log-level $LOG_LEVEL >"$SERVER_LOG" 2>&1 & + +UDPSPEEDER_NO_URING=1 $CLIENT_CMD \ + -c -l 127.0.0.1:$PORT_CLIENT -r 127.0.0.1:$PORT_TUNNEL \ + $FEC_ARGS $KEY_ARGS --log-level $LOG_LEVEL >"$CLIENT_LOG" 2>&1 & + +sleep 2 # let QEMU-emulated binaries start + +# Sender: N packets, each 1400 bytes with verifiable content +python3 -c " +import socket, struct, time + +sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +for seq in range($PACKETS): + header = struct.pack('>I', seq) + fill = bytes([seq & 0xFF]) * 1396 + sock.sendto(header + fill, ('127.0.0.1', $PORT_CLIENT)) + time.sleep(0.001) +" + +echo " [$LABEL] sender done ($PACKETS packets), waiting for receiver..." 
>&2 + +wait $RECV_PID 2>/dev/null || true + +# Parse results +RESULT=$(cat "$RECV_RESULT") + +VALID=$(echo "$RESULT" | tail -1 | awk '{print $1}') +INVALID=$(echo "$RESULT" | tail -1 | awk '{print $2}') +VALID=${VALID:-0} +INVALID=${INVALID:-0} + +echo " [$LABEL] valid=$VALID invalid=$INVALID sent=$PACKETS" >&2 + +if [[ "$INVALID" -ne 0 ]]; then + echo "FAIL [$LABEL]: $INVALID corrupted packets" >&2 + dump_logs + exit 1 +fi + +if [[ "$VALID" -eq 0 ]]; then + echo "FAIL [$LABEL]: no packets received" >&2 + dump_logs + exit 1 +fi + +MIN_EXPECTED=$(( PACKETS / 2 )) +if [[ "$VALID" -lt "$MIN_EXPECTED" ]]; then + echo "FAIL [$LABEL]: only $VALID/$PACKETS packets (expected >=$MIN_EXPECTED)" >&2 + dump_logs + exit 1 +fi + +echo "PASS [$LABEL]: $VALID/$PACKETS packets, 0 corrupt" diff --git a/bench/nanobench.h b/bench/nanobench.h new file mode 100644 index 0000000..127240d --- /dev/null +++ b/bench/nanobench.h @@ -0,0 +1,3484 @@ +// __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _ +// | \ | |_____| | \ | | | |_____] |______ | \ | | |_____| +// | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | | +// +// Microbenchmark framework for C++11/14/17/20 +// https://github.com/martinus/nanobench +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2019-2023 Martin Leitner-Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ANKERL_NANOBENCH_H_INCLUDED +#define ANKERL_NANOBENCH_H_INCLUDED + +// see https://semver.org/ +#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes +#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes +#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// public facing api - as minimal as possible +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include // high_resolution_clock +#include // memcpy +#include // for std::ostream* custom output target in Config +#include // all names +#include // holds context information of results +#include // holds all results + +#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x() + +#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus +#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L +#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L +#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L +#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L + +#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17) +# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]] +#else +# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() +#endif + +#if defined(__clang__) +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \ + _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"") +# define 
ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop") +#else +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() +#endif + +#if defined(__GNUC__) +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"") +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop") +#else +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() +# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() +#endif + +#if defined(ANKERL_NANOBENCH_LOG_ENABLED) +# include +# define ANKERL_NANOBENCH_LOG(x) \ + do { \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \ + } while (0) +#else +# define ANKERL_NANOBENCH_LOG(x) \ + do { \ + } while (0) +#endif + +#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0 +#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS) +# include +# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0) +// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3 +// PERF_FLAG_FD_CLOEXEC since kernel 3.14 +# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS +# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1 +# endif +#endif + +#if defined(__clang__) +# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__))) +#else +# define ANKERL_NANOBENCH_NO_SANITIZE(...) +#endif + +#if defined(_MSC_VER) +# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline) +#else +# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline)) +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) 
std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// noexcept may be missing for std::string. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265 +#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable::value + +// declarations /////////////////////////////////////////////////////////////////////////////////// + +namespace ankerl { +namespace nanobench { + +using Clock = std::conditional::type; +class Bench; +struct Config; +class Result; +class Rng; +class BigO; + +/** + * @brief Renders output from a mustache-like template and benchmark results. + * + * The templating facility here is heavily inspired by [mustache - logic-less templates](https://mustache.github.io/). + * It adds a few more features that are necessary to get all of the captured data out of nanobench. Please read the + * excellent [mustache manual](https://mustache.github.io/mustache.5.html) to see what this is all about. + * + * nanobench output has two nested layers, *result* and *measurement*. Here is a hierarchy of the allowed tags: + * + * * `{{#result}}` Marks the begin of the result layer. Whatever comes after this will be instantiated as often as + * a benchmark result is available. Within it, you can use these tags: + * + * * `{{title}}` See Bench::title. + * + * * `{{name}}` Benchmark name, usually directly provided with Bench::run, but can also be set with Bench::name. + * + * * `{{unit}}` Unit, e.g. `byte`. Defaults to `op`, see Bench::unit. + * + * * `{{batch}}` Batch size, see Bench::batch. + * + * * `{{complexityN}}` Value used for asymptotic complexity calculation. See Bench::complexityN. + * + * * `{{epochs}}` Number of epochs, see Bench::epochs. + * + * * `{{clockResolution}}` Accuracy of the clock, i.e. what's the smallest time possible to measure with the clock. + * For modern systems, this can be around 20 ns. 
This value is automatically determined by nanobench at the first + * benchmark that is run, and used as a static variable throughout the application's runtime. + * + * * `{{clockResolutionMultiple}}` Configuration multiplier for `clockResolution`. See Bench::clockResolutionMultiple. + * This is the target runtime for each measurement (epoch). That means the more accurate your clock is, the faster + * will be the benchmark. Basing the measurement's runtime on the clock resolution is the main reason why nanobench is so fast. + * + * * `{{maxEpochTime}}` Configuration for a maximum time each measurement (epoch) is allowed to take. Note that at least + * a single iteration will be performed, even when that takes longer than maxEpochTime. See Bench::maxEpochTime. + * + * * `{{minEpochTime}}` Minimum epoch time, defaults to 1ms. See Bench::minEpochTime. + * + * * `{{minEpochIterations}}` See Bench::minEpochIterations. + * + * * `{{epochIterations}}` See Bench::epochIterations. + * + * * `{{warmup}}` Number of iterations used before measuring starts. See Bench::warmup. + * + * * `{{relative}}` True or false, depending on the setting you have used. See Bench::relative. + * + * * `{{context(variableName)}}` See Bench::context. + * + * Apart from these tags, it is also possible to use some mathematical operations on the measurement data. The operations + * are of the form `{{command(name)}}`. Currently `name` can be one of `elapsed`, `iterations`. If performance counters + * are available (currently only on current Linux systems), you also have `pagefaults`, `cpucycles`, + * `contextswitches`, `instructions`, `branchinstructions`, and `branchmisses`. All the measures (except `iterations`) are + * provided for a single iteration (so `elapsed` is the time a single iteration took). The following tags are available: + * + * * `{{median()}}` Calculate median of a measurement data set, e.g. `{{median(elapsed)}}`. + * + * * `{{average()}}` Average (mean) calculation. 
+ * + * * `{{medianAbsolutePercentError()}}` Calculates MdAPE, the Median Absolute Percentage Error. The MdAPE is an excellent + * metric for the variation of measurements. It is more robust to outliers than the + * [Mean absolute percentage error (M-APE)](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error). + * @f[ + * \mathrm{MdAPE}(e) = \mathrm{med}\{| \frac{e_i - \mathrm{med}\{e\}}{e_i}| \} + * @f] + * E.g. for *elapsed*: First, @f$ \mathrm{med}\{e\} @f$ calculates the median by sorting and then taking the middle element + * of all *elapsed* measurements. This is used to calculate the absolute percentage + * error to this median for each measurement, as in @f$ | \frac{e_i - \mathrm{med}\{e\}}{e_i}| @f$. All these results + * are sorted, and the middle value is chosen as the median absolute percent error. + * + * This measurement is a bit hard to interpret, but it is very robust against outliers. E.g. a value of 5% means that half of the + * measurements deviate less than 5% from the median, and the other deviate more than 5% from the median. + * + * * `{{sum()}}` Sum of all the measurements. E.g. `{{sum(iterations)}}` will give you the total number of iterations +* measured in this benchmark. + * + * * `{{minimum()}}` Minimum of all measurements. + * + * * `{{maximum()}}` Maximum of all measurements. + * + * * `{{sumProduct(, )}}` Calculates the sum of the products of corresponding measures: + * @f[ + * \mathrm{sumProduct}(a,b) = \sum_{i=1}^{n}a_i\cdot b_i + * @f] + * E.g. to calculate total runtime of the benchmark, you multiply iterations with elapsed time for each measurement, and + * sum these results up: + * `{{sumProduct(iterations, elapsed)}}`. + * + * * `{{#measurement}}` To access individual measurement results, open the begin tag for measurements. + * + * * `{{elapsed}}` Average elapsed wall clock time per iteration, in seconds. + * + * * `{{iterations}}` Number of iterations in the measurement. 
The number of iterations will fluctuate due + * to some applied randomness, to enhance accuracy. + * + * * `{{pagefaults}}` Average number of pagefaults per iteration. + * + * * `{{cpucycles}}` Average number of CPU cycles processed per iteration. + * + * * `{{contextswitches}}` Average number of context switches per iteration. + * + * * `{{instructions}}` Average number of retired instructions per iteration. + * + * * `{{branchinstructions}}` Average number of branches executed per iteration. + * + * * `{{branchmisses}}` Average number of branches that were missed per iteration. + * + * * `{{/measurement}}` Ends the measurement tag. + * + * * `{{/result}}` Marks the end of the result layer. This is the end marker for the template part that will be instantiated + * for each benchmark result. + * + * + * For the layer tags *result* and *measurement* you additionally can use these special markers: + * + * * ``{{#-first}}`` - Begin marker of a template that will be instantiated *only for the first* entry in the layer. Use is only + * allowed between the begin and end marker of the layer. So between ``{{#result}}`` and ``{{/result}}``, or between + * ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-first}}``. + * + * * ``{{^-first}}`` - Begin marker of a template that will be instantiated *for each except the first* entry in the layer. This, + * this is basically the inversion of ``{{#-first}}``. Use is only allowed between the begin and end marker of the layer. + * So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``. + * + * * ``{{/-first}}`` - End marker for either ``{{#-first}}`` or ``{{^-first}}``. + * + * * ``{{#-last}}`` - Begin marker of a template that will be instantiated *only for the last* entry in the layer. Use is only + * allowed between the begin and end marker of the layer. 
So between ``{{#result}}`` and ``{{/result}}``, or between + * ``{{#measurement}}`` and ``{{/measurement}}``. Finish the template with ``{{/-last}}``. + * + * * ``{{^-last}}`` - Begin marker of a template that will be instantiated *for each except the last* entry in the layer. This, + * this is basically the inversion of ``{{#-last}}``. Use is only allowed between the begin and end marker of the layer. + * So between ``{{#result}}`` and ``{{/result}}``, or between ``{{#measurement}}`` and ``{{/measurement}}``. + * + * * ``{{/-last}}`` - End marker for either ``{{#-last}}`` or ``{{^-last}}``. + * + @verbatim embed:rst + + For an overview of all the possible data you can get out of nanobench, please see the tutorial at :ref:`tutorial-template-json`. + + The templates that ship with nanobench are: + + * :cpp:func:`templates::csv() ` + * :cpp:func:`templates::json() ` + * :cpp:func:`templates::htmlBoxplot() ` + * :cpp:func:`templates::pyperf() ` + + @endverbatim + * + * @param mustacheTemplate The template. + * @param bench Benchmark, containing all the results. + * @param out Output for the generated output. + */ +void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out); +void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out); + +/** + * Same as render(char const* mustacheTemplate, Bench const& bench, std::ostream& out), but for when + * you only have results available. + * + * @param mustacheTemplate The template. + * @param results All the results to be used for rendering. + * @param out Output for the generated output. + */ +void render(char const* mustacheTemplate, std::vector const& results, std::ostream& out); +void render(std::string const& mustacheTemplate, std::vector const& results, std::ostream& out); + +// Contains mustache-like templates +namespace templates { + +/*! + @brief CSV data for the benchmark results. + + Generates a comma-separated values dataset. 
First line is the header, each following line is a summary of each benchmark run. + + @verbatim embed:rst + See the tutorial at :ref:`tutorial-template-csv` for an example. + @endverbatim + */ +char const* csv() noexcept; + +/*! + @brief HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example output. + + The output uses only the elapsed wall clock time, and displays each epoch as a single dot. + @verbatim embed:rst + See the tutorial at :ref:`tutorial-template-html` for an example. + @endverbatim + + @see also ankerl::nanobench::render() + */ +char const* htmlBoxplot() noexcept; + +/*! + @brief Output in pyperf compatible JSON format, which can be used for more analyzation. + @verbatim embed:rst + See the tutorial at :ref:`tutorial-template-pyperf` for an example how to further analyze the output. + @endverbatim + */ +char const* pyperf() noexcept; + +/*! + @brief Template to generate JSON data. + + The generated JSON data contains *all* data that has been generated. All times are as double values, in seconds. The output can get + quite large. + @verbatim embed:rst + See the tutorial at :ref:`tutorial-template-json` for an example. 
+ @endverbatim + */ +char const* json() noexcept; + +} // namespace templates + +namespace detail { + +template +struct PerfCountSet; + +class IterationLogic; +class PerformanceCounters; + +#if ANKERL_NANOBENCH(PERF_COUNTERS) +class LinuxPerformanceCounters; +#endif + +} // namespace detail +} // namespace nanobench +} // namespace ankerl + +// definitions //////////////////////////////////////////////////////////////////////////////////// + +namespace ankerl { +namespace nanobench { +namespace detail { + +template +struct PerfCountSet { + T pageFaults{}; + T cpuCycles{}; + T contextSwitches{}; + T instructions{}; + T branchInstructions{}; + T branchMisses{}; +}; + +} // namespace detail + +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +struct Config { + // actual benchmark config + std::string mBenchmarkTitle = "benchmark"; // NOLINT(misc-non-private-member-variables-in-classes) + std::string mBenchmarkName = "noname"; // NOLINT(misc-non-private-member-variables-in-classes) + std::string mUnit = "op"; // NOLINT(misc-non-private-member-variables-in-classes) + double mBatch = 1.0; // NOLINT(misc-non-private-member-variables-in-classes) + double mComplexityN = -1.0; // NOLINT(misc-non-private-member-variables-in-classes) + size_t mNumEpochs = 11; // NOLINT(misc-non-private-member-variables-in-classes) + size_t mClockResolutionMultiple = static_cast(1000); // NOLINT(misc-non-private-member-variables-in-classes) + std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes) + std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1); // NOLINT(misc-non-private-member-variables-in-classes) + uint64_t mMinEpochIterations{1}; // NOLINT(misc-non-private-member-variables-in-classes) + // If not 0, run *exactly* these number of iterations per epoch. 
+ uint64_t mEpochIterations{0}; // NOLINT(misc-non-private-member-variables-in-classes) + uint64_t mWarmup = 0; // NOLINT(misc-non-private-member-variables-in-classes) + std::ostream* mOut = nullptr; // NOLINT(misc-non-private-member-variables-in-classes) + std::chrono::duration mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes) + std::string mTimeUnitName = "ns"; // NOLINT(misc-non-private-member-variables-in-classes) + bool mShowPerformanceCounters = true; // NOLINT(misc-non-private-member-variables-in-classes) + bool mIsRelative = false; // NOLINT(misc-non-private-member-variables-in-classes) + std::unordered_map mContext{}; // NOLINT(misc-non-private-member-variables-in-classes) + + Config(); + ~Config(); + Config& operator=(Config const& other); + Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)); + Config(Config const& other); + Config(Config&& other) noexcept; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +// Result returned after a benchmark has finished. Can be used as a baseline for relative(). +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +class Result { +public: + enum class Measure : size_t { + elapsed, + iterations, + pagefaults, + cpucycles, + contextswitches, + instructions, + branchinstructions, + branchmisses, + _size + }; + + explicit Result(Config benchmarkConfig); + + ~Result(); + Result& operator=(Result const& other); + Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)); + Result(Result const& other); + Result(Result&& other) noexcept; + + // adds new measurement results + // all values are scaled by iters (except iters...) 
+ void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc); + + ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept; + + ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const; + ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const; + ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const; + ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept; + ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept; + ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept; + ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept; + ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const; + ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const; + + ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept; + ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const; + ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept; + ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept; + + // Finds string, if not found, returns _size. + static Measure fromString(std::string const& str); + +private: + Config mConfig{}; + std::vector> mNameToMeasurements{}; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +/** + * An extremely fast random generator. Currently, this implements *RomuDuoJr*, developed by Mark Overton. Source: + * http://www.romu-random.org/ + * + * RomuDuoJr is extremely fast and provides reasonable good randomness. Not enough for large jobs, but definitely + * good enough for a benchmarking framework. + * + * * Estimated capacity: @f$ 2^{51} @f$ bytes + * * Register pressure: 4 + * * State size: 128 bits + * + * This random generator is a drop-in replacement for the generators supplied by ````. It is not + * cryptographically secure. 
It's intended purpose is to be very fast so that benchmarks that make use + * of randomness are not distorted too much by the random generator. + * + * Rng also provides a few non-standard helpers, optimized for speed. + */ +class Rng final { +public: + /** + * @brief This RNG provides 64bit randomness. + */ + using result_type = uint64_t; + + static constexpr uint64_t(min)(); + static constexpr uint64_t(max)(); + + /** + * As a safety precaution, we don't allow copying. Copying a PRNG would mean you would have two random generators that produce the + * same sequence, which is generally not what one wants. Instead create a new rng with the default constructor Rng(), which is + * automatically seeded from `std::random_device`. If you really need a copy, use `copy()`. + */ + Rng(Rng const&) = delete; + + /** + * Same as Rng(Rng const&), we don't allow assignment. If you need a new Rng create one with the default constructor Rng(). + */ + Rng& operator=(Rng const&) = delete; + + // moving is ok + Rng(Rng&&) noexcept = default; + Rng& operator=(Rng&&) noexcept = default; + ~Rng() noexcept = default; + + /** + * @brief Creates a new Random generator with random seed. + * + * Instead of a default seed (as the random generators from the STD), this properly seeds the random generator from + * `std::random_device`. It guarantees correct seeding. Note that seeding can be relatively slow, depending on the source of + * randomness used. So it is best to create a Rng once and use it for all your randomness purposes. + */ + Rng(); + + /*! + Creates a new Rng that is seeded with a specific seed. Each Rng created from the same seed will produce the same randomness + sequence. This can be useful for deterministic behavior. + + @verbatim embed:rst + .. note:: + + The random algorithm might change between nanobench releases. Whenever a faster and/or better random + generator becomes available, I will switch the implementation. 
+ @endverbatim + + As per the Romu paper, this seeds the Rng with splitMix64 algorithm and performs 10 initial rounds for further mixing up of the + internal state. + + @param seed The 64bit seed. All values are allowed, even 0. + */ + explicit Rng(uint64_t seed) noexcept; + Rng(uint64_t x, uint64_t y) noexcept; + explicit Rng(std::vector const& data); + + /** + * Creates a copy of the Rng, thus the copy provides exactly the same random sequence as the original. + */ + ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept; + + /** + * @brief Produces a 64bit random value. This should be very fast, thus it is marked as inline. In my benchmark, this is ~46 times + * faster than `std::default_random_engine` for producing 64bit random values. It seems that the fastest std contender is + * `std::mt19937_64`. Still, this RNG is 2-3 times as fast. + * + * @return uint64_t The next 64 bit random value. + */ + inline uint64_t operator()() noexcept; + + // bounded() below is slightly biased; see its documentation. + + /** + * Generates a random number between 0 and range (excluding range). + * + * The algorithm only produces 32bit numbers, and is slightly biased. The effect is quite small unless your range is close to the + * maximum value of an integer. It is possible to correct the bias with rejection sampling (see + * [here](https://lemire.me/blog/2016/06/30/fast-random-shuffling/)), but this is most likely irrelevant in practice for the + * purposes of this Rng. + * + * See Daniel Lemire's blog post [A fast alternative to the modulo + * reduction](https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/) + * + * @param range Upper exclusive range. E.g. a value of 3 will generate random numbers 0, 1, 2. + * @return uint32_t Generated random values in range [0, range(. + */ + inline uint32_t bounded(uint32_t range) noexcept; + + // random double in range [0, 1( + // see http://prng.di.unimi.it/ + + /** + * Provides a random uniform double value between 0 and 1. 
This uses the method described in [Generating uniform doubles in the + * unit interval](http://prng.di.unimi.it/), and is extremely fast. + * + * @return double Uniformly distributed double value in range [0,1(, excluding 1. + */ + inline double uniform01() noexcept; + + /** + * Shuffles all entries in the given container. Although this has a slight bias due to the implementation of bounded(), this is + * preferable to `std::shuffle` because it is over 5 times faster. See Daniel Lemire's blog post [Fast random + * shuffling](https://lemire.me/blog/2016/06/30/fast-random-shuffling/). + * + * @param container The whole container will be shuffled. + */ + template + void shuffle(Container& container) noexcept; + + /** + * Extracts the full state of the generator, e.g. for serialization. For this RNG this is just 2 values, but to stay API compatible + * with future implementations that potentially use more state, we use a vector. + * + * @return Vector containing the full state: + */ + ANKERL_NANOBENCH(NODISCARD) std::vector state() const; + +private: + static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept; + + uint64_t mX; + uint64_t mY; +}; + +/** + * @brief Main entry point to nanobench's benchmarking facility. + * + * It holds configuration and results from one or more benchmark runs. Usually it is used in a single line, where the object is + * constructed, configured, and then a benchmark is run. E.g. like this: + * + * ankerl::nanobench::Bench().unit("byte").batch(1000).run("random fluctuations", [&] { + * // here be the benchmark code + * }); + * + * In that example Bench() constructs the benchmark, it is then configured with unit() and batch(), and after configuration a + * benchmark is executed with run(). Once run() has finished, it prints the result to `std::cout`. It would also store the results + * in the Bench instance, but in this case the object is immediately destroyed so it's not available any more. 
+ */ +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +class Bench { +public: + /** + * @brief Creates a new benchmark for configuration and running of benchmarks. + */ + Bench(); + + Bench(Bench&& other) noexcept; + Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)); + Bench(Bench const& other); + Bench& operator=(Bench const& other); + ~Bench() noexcept; + + /*! + @brief Repeatedly calls `op()` based on the configuration, and performs measurements. + + This call is marked with `noinline` to prevent the compiler to optimize beyond different benchmarks. This can have quite a big + effect on benchmark accuracy. + + @verbatim embed:rst + .. note:: + + Each call to your lambda must have a side effect that the compiler can't possibly optimize it away. E.g. add a result to an + externally defined number (like `x` in the above example), and finally call `doNotOptimizeAway` on the variables the compiler + must not remove. You can also use :cpp:func:`ankerl::nanobench::doNotOptimizeAway` directly in the lambda, but be aware that + this has a small overhead. + + @endverbatim + + @tparam Op The code to benchmark. + */ + template + ANKERL_NANOBENCH(NOINLINE) + Bench& run(char const* benchmarkName, Op&& op); + + template + ANKERL_NANOBENCH(NOINLINE) + Bench& run(std::string const& benchmarkName, Op&& op); + + /** + * @brief Same as run(char const* benchmarkName, Op op), but instead uses the previously set name. + * @tparam Op The code to benchmark. + */ + template + ANKERL_NANOBENCH(NOINLINE) + Bench& run(Op&& op); + + /** + * @brief Title of the benchmark, will be shown in the table header. Changing the title will start a new markdown table. + * + * @param benchmarkTitle The title of the benchmark. 
+ */ + Bench& title(char const* benchmarkTitle); + Bench& title(std::string const& benchmarkTitle); + + /** + * @brief Gets the title of the benchmark + */ + ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept; + + /// Name of the benchmark, will be shown in the table row. + Bench& name(char const* benchmarkName); + Bench& name(std::string const& benchmarkName); + ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept; + + /** + * @brief Set context information. + * + * The information can be accessed using custom render templates via `{{context(variableName)}}`. + * Trying to render a variable that hasn't been set before raises an exception. + * Not included in (default) markdown table. + * + * @see clearContext, render + * + * @param variableName The name of the context variable. + * @param variableValue The value of the context variable. + */ + Bench& context(char const* variableName, char const* variableValue); + Bench& context(std::string const& variableName, std::string const& variableValue); + + /** + * @brief Reset context information. + * + * This may improve efficiency when using many context entries, + * or improve robustness by removing spurious context entries. + * + * @see context + */ + Bench& clearContext(); + + /** + * @brief Sets the batch size. + * + * E.g. number of processed byte, or some other metric for the size of the processed data in each iteration. If you benchmark + * hashing of a 1000 byte long string and want byte/sec as a result, you can specify 1000 as the batch size. + * + * @tparam T Any input type is internally cast to `double`. + * @param b batch size + */ + template + Bench& batch(T b) noexcept; + ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept; + + /** + * @brief Sets the operation unit. + * + * Defaults to "op". Could be e.g. "byte" for string processing. This is used for the table header, e.g. to show `ns/byte`. Use + * singular (*byte*, not *bytes*). 
A change clears the currently collected results. + * + * @param unit The unit name. + */ + Bench& unit(char const* unit); + Bench& unit(std::string const& unit); + ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept; + + /** + * @brief Sets the time unit to be used for the default output. + * + * Nanobench defaults to using ns (nanoseconds) as output in the markdown. For some benchmarks this is too fine-grained, so it is + * possible to configure this. E.g. use `timeUnit(1ms, "ms")` to show `ms/op` instead of `ns/op`. + * + * @param tu Time unit to display the results in, default is 1ns. + * @param tuName Name for the time unit, default is "ns" + */ + Bench& timeUnit(std::chrono::duration const& tu, std::string const& tuName); + ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept; + ANKERL_NANOBENCH(NODISCARD) std::chrono::duration const& timeUnit() const noexcept; + + /** + * @brief Set the output stream where the resulting markdown table will be printed to. + * + * The default is `&std::cout`. You can disable all output by setting `nullptr`. + * + * @param outstream Pointer to output stream, can be `nullptr`. + */ + Bench& output(std::ostream* outstream) noexcept; + ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept; + + /** + * Modern processors have a very accurate clock, being able to measure as low as 20 nanoseconds. This is the main trick that allows nanobench to + * be so fast: we find out how accurate the clock is, then run the benchmark only so often that the clock's accuracy is good enough + * for accurate measurements. + * + * The default is to run one epoch for 1000 times the clock resolution. So for 20ns resolution and 11 epochs, this gives a total + * runtime of + * + * @f[ + * 20ns * 1000 * 11 \approx 0.2ms + * @f] + * + * To be precise, nanobench adds 0-20% random noise to each evaluation. This is to prevent any aliasing effects, and further + * improves accuracy. 
+ * + * Total runtime will be higher though: Some initial time is needed to find out the target number of iterations for each epoch, and + * there is some overhead involved to start & stop timers and calculate resulting statistics and writing the output. + * + * @param multiple Target number of times of clock resolution. Usually 1000 is a good compromise between runtime and accuracy. + */ + Bench& clockResolutionMultiple(size_t multiple) noexcept; + ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept; + + /** + * @brief Controls number of epochs, the number of measurements to perform. + * + * The reported result will be the median of evaluation of each epoch. The higher you choose this, the more + * deterministic the result be and outliers will be more easily removed. Also the `err%` will be more accurate the higher this + * number is. Note that the `err%` will not necessarily decrease when number of epochs is increased. But it will be a more accurate + * representation of the benchmarked code's runtime stability. + * + * Choose the value wisely. In practice, 11 has been shown to be a reasonable choice between runtime performance and accuracy. + * This setting goes hand in hand with minEpochIterations() (or minEpochTime()). If you are more interested in *median* runtime, + * you might want to increase epochs(). If you are more interested in *mean* runtime, you might want to increase + * minEpochIterations() instead. + * + * @param numEpochs Number of epochs. + */ + Bench& epochs(size_t numEpochs) noexcept; + ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept; + + /** + * @brief Upper limit for the runtime of each epoch. + * + * As a safety precaution if the clock is not very accurate, we can set an upper limit for the maximum evaluation time per + * epoch. Default is 100ms. At least a single evaluation of the benchmark is performed. 
+ * + * @see minEpochTime, minEpochIterations + * + * @param t Maximum target runtime for a single epoch. + */ + Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept; + ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept; + + /** + * @brief Minimum time each epoch should take. + * + * Default is 1ms (the initial value of Config::mMinEpochTime). In most cases this is exactly what you want. If you see + * that the evaluation is unreliable with a high `err%`, you can increase either minEpochTime() or minEpochIterations(). + * + * @see maxEpochTime, minEpochIterations + * + * @param t Minimum time each epoch should take. + */ + Bench& minEpochTime(std::chrono::nanoseconds t) noexcept; + ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept; + + /** + * @brief Sets the minimum number of iterations each epoch should take. + * + * Default is 1, and we rely on clockResolutionMultiple(). If the `err%` is high and you want a smoother result, you might want + * to increase the minimum number of iterations, or increase the minEpochTime(). + * + * @see minEpochTime, maxEpochTime, epochIterations + * + * @param numIters Minimum number of iterations per epoch. + */ + Bench& minEpochIterations(uint64_t numIters) noexcept; + ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept; + + /** + * Sets exactly the number of iterations for each epoch. Ignores all other epoch limits. This forces nanobench to use exactly + * the given number of iterations for each epoch, not more and not less. Default is 0 (disabled). + * + * @param numIters Exact number of iterations to use. Set to 0 to disable. + */ + Bench& epochIterations(uint64_t numIters) noexcept; + ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept; + + /** + * @brief Sets a number of iterations that are initially performed without any measurements. 
+ * + * Some benchmarks need a few evaluations to warm up caches / database / whatever access. Normally this should not be needed, since + * we show the median result so initial outliers will be filtered away automatically. If the warmup effect is large though, you + * might want to set it. Default is 0. + * + * @param numWarmupIters Number of warmup iterations. + */ + Bench& warmup(uint64_t numWarmupIters) noexcept; + ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept; + + /** + * @brief Marks the next run as the baseline. + * + * Call `relative(true)` to mark the run as the baseline. Successive runs will be compared to this run. It is calculated by + * + * @f[ + * 100\% * \frac{baseline}{runtime} + * @f] + * + * * 100% means it is exactly as fast as the baseline + * * >100% means it is faster than the baseline. E.g. 200% means the current run is twice as fast as the baseline. + * * <100% means it is slower than the baseline. E.g. 50% means it is twice as slow as the baseline. + * + * See the tutorial section "Comparing Results" for example usage. + * + * @param isRelativeEnabled True to enable processing + */ + Bench& relative(bool isRelativeEnabled) noexcept; + ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept; + + /** + * @brief Enables/disables performance counters. + * + * On Linux nanobench has a powerful feature to use performance counters. This enables counting of retired instructions, count + * number of branches, missed branches, etc. On default this is enabled, but you can disable it if you don't need that feature. + * + * @param showPerformanceCounters True to enable, false to disable. + */ + Bench& performanceCounters(bool showPerformanceCounters) noexcept; + ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept; + + /** + * @brief Retrieves all benchmark results collected by the bench object so far. + * + * Each call to run() generates a Result that is stored within the Bench instance. 
This is mostly for advanced users who want to + * see all the nitty gritty details. + * + * @return All results collected so far. + */ + ANKERL_NANOBENCH(NODISCARD) std::vector const& results() const noexcept; + + /*! + @verbatim embed:rst + + Convenience shortcut to :cpp:func:`ankerl::nanobench::doNotOptimizeAway`. + + @endverbatim + */ + template + Bench& doNotOptimizeAway(Arg&& arg); + + /*! + @verbatim embed:rst + + Sets N for asymptotic complexity calculation, so it becomes possible to calculate `Big O + <https://en.wikipedia.org/wiki/Big_O_notation>`_ from multiple benchmark evaluations. + + Use :cpp:func:`ankerl::nanobench::Bench::complexityBigO` when the evaluation has finished. See the tutorial + :ref:`asymptotic-complexity` for details. + + @endverbatim + + @tparam T Any type is cast to `double`. + @param n Length of N for the next benchmark run, so it is possible to calculate `bigO`. + */ + template + Bench& complexityN(T n) noexcept; + ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept; + + /*! + Calculates [Big O](https://en.wikipedia.org/wiki/Big_O_notation) of the results with all preconfigured complexity functions. + Currently these complexity functions are fitted into the benchmark results: + + @f$ \mathcal{O}(1) @f$, + @f$ \mathcal{O}(n) @f$, + @f$ \mathcal{O}(\log{}n) @f$, + @f$ \mathcal{O}(n\log{}n) @f$, + @f$ \mathcal{O}(n^2) @f$, + @f$ \mathcal{O}(n^3) @f$. + + If we e.g. evaluate the complexity of `std::sort`, this is the result of `std::cout << bench.complexityBigO()`: + + ``` + | coefficient | err% | complexity + |--------------:|-------:|------------ + | 5.08935e-09 | 2.6% | O(n log n) + | 6.10608e-08 | 8.0% | O(n) + | 1.29307e-11 | 47.2% | O(n^2) + | 2.48677e-15 | 69.6% | O(n^3) + | 9.88133e-06 | 132.3% | O(log n) + | 5.98793e-05 | 162.5% | O(1) + ``` + + So in this case @f$ \mathcal{O}(n\log{}n) @f$ provides the best approximation. + + @verbatim embed:rst + See the tutorial :ref:`asymptotic-complexity` for details. 
+ @endverbatim + @return Evaluation results, which can be printed or otherwise inspected. + */ + std::vector complexityBigO() const; + + /** + * @brief Calculates bigO for a custom function. + * + * E.g. to calculate the mean squared error for @f$ \mathcal{O}(\log{}\log{}n) @f$, which is not part of the default set of + * complexityBigO(), you can do this: + * + * ``` + * auto logLogN = bench.complexityBigO("O(log log n)", [](double n) { + * return std::log2(std::log2(n)); + * }); + * ``` + * + * The resulting mean squared error can be printed with `std::cout << logLogN`. E.g. it prints something like this: + * + * ```text + * 2.46985e-05 * O(log log n), rms=1.48121 + * ``` + * + * @tparam Op Type of mapping operation. + * @param name Name for the function, e.g. "O(log log n)" + * @param op Op's operator() maps a `double` with the desired complexity function, e.g. `log2(log2(n))`. + * @return BigO Error calculation, which is streamable to std::cout. + */ + template + BigO complexityBigO(char const* name, Op op) const; + + template + BigO complexityBigO(std::string const& name, Op op) const; + + /*! + @verbatim embed:rst + + Convenience shortcut to :cpp:func:`ankerl::nanobench::render`. + + @endverbatim + */ + Bench& render(char const* templateContent, std::ostream& os); + Bench& render(std::string const& templateContent, std::ostream& os); + + Bench& config(Config const& benchmarkConfig); + ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept; + +private: + Config mConfig{}; + std::vector mResults{}; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +/** + * @brief Makes sure none of the given arguments are optimized away by the compiler. + * + * @tparam Arg Type of the argument that shouldn't be optimized away. + * @param arg The input that we mark as being used, even though we don't do anything with it. 
+ */ +template +void doNotOptimizeAway(Arg&& arg); + +namespace detail { + +#if defined(_MSC_VER) +void doNotOptimizeAwaySink(void const*); + +template +void doNotOptimizeAway(T const& val); + +#else + +// This assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but +// this seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyway. +// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446 +template +void doNotOptimizeAway(T const& val) { + // NOLINTNEXTLINE(hicpp-no-assembler) + asm volatile("" : : "r,m"(val) : "memory"); +} + +template +void doNotOptimizeAway(T& val) { +# if defined(__clang__) + // NOLINTNEXTLINE(hicpp-no-assembler) + asm volatile("" : "+r,m"(val) : : "memory"); +# else + // NOLINTNEXTLINE(hicpp-no-assembler) + asm volatile("" : "+m,r"(val) : : "memory"); +# endif +} +#endif + +// internally used, but visible because run() is templated. +// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from +// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant. 
+ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH) +class IterationLogic { +public: + explicit IterationLogic(Bench const& bench); + IterationLogic(IterationLogic&&) = delete; + IterationLogic& operator=(IterationLogic&&) = delete; + IterationLogic(IterationLogic const&) = delete; + IterationLogic& operator=(IterationLogic const&) = delete; + ~IterationLogic(); + + ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept; + void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept; + void moveResultTo(std::vector& results) noexcept; + +private: + struct Impl; + Impl* mPimpl; +}; +ANKERL_NANOBENCH(IGNORE_EFFCPP_POP) + +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +class PerformanceCounters { +public: + PerformanceCounters(PerformanceCounters const&) = delete; + PerformanceCounters(PerformanceCounters&&) = delete; + PerformanceCounters& operator=(PerformanceCounters const&) = delete; + PerformanceCounters& operator=(PerformanceCounters&&) = delete; + + PerformanceCounters(); + ~PerformanceCounters(); + + void beginMeasure(); + void endMeasure(); + void updateResults(uint64_t numIters); + + ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& val() const noexcept; + ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& has() const noexcept; + +private: +#if ANKERL_NANOBENCH(PERF_COUNTERS) + LinuxPerformanceCounters* mPc = nullptr; +#endif + PerfCountSet mVal{}; + PerfCountSet mHas{}; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +// Gets the singleton +PerformanceCounters& performanceCounters(); + +} // namespace detail + +class BigO { +public: + using RangeMeasure = std::vector>; + + template + static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) { + for (auto& rangeMeasure : data) { + rangeMeasure.first = op(rangeMeasure.first); + } + return data; + } + + static RangeMeasure collectRangeMeasure(std::vector const& results); + + template + BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN) + : BigO(bigOName, mapRangeMeasure(rangeMeasure, 
rangeToN)) {} + + template + BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN) + : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {} + + BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure); + BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure); + ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept; + ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept; + ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept; + ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept; + +private: + std::string mName{}; + double mConstant{}; + double mNormalizedRootMeanSquare{}; +}; +std::ostream& operator<<(std::ostream& os, BigO const& bigO); +std::ostream& operator<<(std::ostream& os, std::vector const& bigOs); + +} // namespace nanobench +} // namespace ankerl + +// implementation ///////////////////////////////////////////////////////////////////////////////// + +namespace ankerl { +namespace nanobench { + +constexpr uint64_t(Rng::min)() { + return 0; +} + +constexpr uint64_t(Rng::max)() { + return (std::numeric_limits::max)(); +} + +ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") +uint64_t Rng::operator()() noexcept { + auto x = mX; + + mX = UINT64_C(15241094284759029579) * mY; + mY = rotl(mY - x, 27); + + return x; +} + +ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") +uint32_t Rng::bounded(uint32_t range) noexcept { + uint64_t const r32 = static_cast(operator()()); + auto multiresult = r32 * range; + return static_cast(multiresult >> 32U); +} + +double Rng::uniform01() noexcept { + auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U); + // can't use union in c++ here for type puning, it's undefined behavior. + // std::memcpy is optimized anyways. 
+ double d{}; + std::memcpy(&d, &i, sizeof(double)); + return d - 1.0; +} + +template +void Rng::shuffle(Container& container) noexcept { + auto i = container.size(); + while (i > 1U) { + using std::swap; + auto n = operator()(); + // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80) + auto b1 = static_cast((static_cast(n) * static_cast(i)) >> 32U); + swap(container[--i], container[b1]); + + auto b2 = static_cast(((n >> 32U) * static_cast(i)) >> 32U); + swap(container[--i], container[b2]); + } +} + +ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") +constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept { + return (x << k) | (x >> (64U - k)); +} + +template +ANKERL_NANOBENCH_NO_SANITIZE("integer") +Bench& Bench::run(Op&& op) { + // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op() + detail::IterationLogic iterationLogic(*this); + auto& pc = detail::performanceCounters(); + + while (auto n = iterationLogic.numIters()) { + pc.beginMeasure(); + Clock::time_point const before = Clock::now(); + while (n-- > 0) { + op(); + } + Clock::time_point const after = Clock::now(); + pc.endMeasure(); + pc.updateResults(iterationLogic.numIters()); + iterationLogic.add(after - before, pc); + } + iterationLogic.moveResultTo(mResults); + return *this; +} + +// Performs all evaluations. 
+template +Bench& Bench::run(char const* benchmarkName, Op&& op) { + name(benchmarkName); + return run(std::forward(op)); +} + +template +Bench& Bench::run(std::string const& benchmarkName, Op&& op) { + name(benchmarkName); + return run(std::forward(op)); +} + +template +BigO Bench::complexityBigO(char const* benchmarkName, Op op) const { + return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op); +} + +template +BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const { + return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op); +} + +// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration. +// Any argument is cast to double. +template +Bench& Bench::batch(T b) noexcept { + mConfig.mBatch = static_cast(b); + return *this; +} + +// Sets the computation complexity of the next run. Any argument is cast to double. +template +Bench& Bench::complexityN(T n) noexcept { + mConfig.mComplexityN = static_cast(n); + return *this; +} + +// Convenience: makes sure none of the given arguments are optimized away by the compiler. +template +Bench& Bench::doNotOptimizeAway(Arg&& arg) { + detail::doNotOptimizeAway(std::forward(arg)); + return *this; +} + +// Makes sure none of the given arguments are optimized away by the compiler. 
+template +void doNotOptimizeAway(Arg&& arg) { + detail::doNotOptimizeAway(std::forward(arg)); +} + +namespace detail { + +#if defined(_MSC_VER) +template +void doNotOptimizeAway(T const& val) { + doNotOptimizeAwaySink(&val); +} + +#endif + +} // namespace detail +} // namespace nanobench +} // namespace ankerl + +#if defined(ANKERL_NANOBENCH_IMPLEMENT) + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// implementation part - only visible in .cpp +/////////////////////////////////////////////////////////////////////////////////////////////////// + +# include // sort, reverse +# include // compare_exchange_strong in loop overhead +# include // getenv +# include // strstr, strncmp +# include // ifstream to parse proc files +# include // setw, setprecision +# include // cout +# include // accumulate +# include // random_device +# include // to_s in Number +# include // throw for rendering templates +# include // std::tie +# if defined(__linux__) +# include //sysconf +# endif +# if ANKERL_NANOBENCH(PERF_COUNTERS) +# include // map + +# include +# include +# include +# endif + +// declarations /////////////////////////////////////////////////////////////////////////////////// + +namespace ankerl { +namespace nanobench { + +// helper stuff that is only intended to be used internally +namespace detail { + +struct TableInfo; + +// formatting utilities +namespace fmt { + +class NumSep; +class StreamStateRestorer; +class Number; +class MarkDownColumn; +class MarkDownCode; + +} // namespace fmt +} // namespace detail +} // namespace nanobench +} // namespace ankerl + +// definitions //////////////////////////////////////////////////////////////////////////////////// + +namespace ankerl { +namespace nanobench { + +uint64_t splitMix64(uint64_t& state) noexcept; + +namespace detail { + +// helpers to get double values +template +inline double d(T t) noexcept { + return static_cast(t); +} +inline double d(Clock::duration 
duration) noexcept { + return std::chrono::duration_cast>(duration).count(); +} + +// Calculates clock resolution once, and remembers the result +inline Clock::duration clockResolution() noexcept; + +} // namespace detail + +namespace templates { + +char const* csv() noexcept { + return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total" +{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}} +{{/result}})DELIM"; +} + +char const* htmlBoxplot() noexcept { + return R"DELIM( + + + + + + +
+ + + +)DELIM"; +} + +char const* pyperf() noexcept { + return R"DELIM({ + "benchmarks": [ + { + "runs": [ + { + "values": [ +{{#measurement}} {{elapsed}}{{^-last}}, +{{/last}}{{/measurement}} + ] + } + ] + } + ], + "metadata": { + "loops": {{sum(iterations)}}, + "inner_loops": {{batch}}, + "name": "{{title}}", + "unit": "second" + }, + "version": "1.0" +})DELIM"; +} + +char const* json() noexcept { + return R"DELIM({ + "results": [ +{{#result}} { + "title": "{{title}}", + "name": "{{name}}", + "unit": "{{unit}}", + "batch": {{batch}}, + "complexityN": {{complexityN}}, + "epochs": {{epochs}}, + "clockResolution": {{clockResolution}}, + "clockResolutionMultiple": {{clockResolutionMultiple}}, + "maxEpochTime": {{maxEpochTime}}, + "minEpochTime": {{minEpochTime}}, + "minEpochIterations": {{minEpochIterations}}, + "epochIterations": {{epochIterations}}, + "warmup": {{warmup}}, + "relative": {{relative}}, + "median(elapsed)": {{median(elapsed)}}, + "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}}, + "median(instructions)": {{median(instructions)}}, + "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}}, + "median(cpucycles)": {{median(cpucycles)}}, + "median(contextswitches)": {{median(contextswitches)}}, + "median(pagefaults)": {{median(pagefaults)}}, + "median(branchinstructions)": {{median(branchinstructions)}}, + "median(branchmisses)": {{median(branchmisses)}}, + "totalTime": {{sumProduct(iterations, elapsed)}}, + "measurements": [ +{{#measurement}} { + "iterations": {{iterations}}, + "elapsed": {{elapsed}}, + "pagefaults": {{pagefaults}}, + "cpucycles": {{cpucycles}}, + "contextswitches": {{contextswitches}}, + "instructions": {{instructions}}, + "branchinstructions": {{branchinstructions}}, + "branchmisses": {{branchmisses}} + }{{^-last}},{{/-last}} +{{/measurement}} ] + }{{^-last}},{{/-last}} +{{/result}} ] +})DELIM"; +} + +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +struct Node { + enum class Type { 
tag, content, section, inverted_section }; + + char const* begin; + char const* end; + std::vector children; + Type type; + + template + // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + bool operator==(char const (&str)[N]) const noexcept { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + return static_cast(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1); + } +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +// NOLINTNEXTLINE(misc-no-recursion) +static std::vector parseMustacheTemplate(char const** tpl) { + std::vector nodes; + + while (true) { + auto const* begin = std::strstr(*tpl, "{{"); + auto const* end = begin; + if (begin != nullptr) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + begin += 2; + end = std::strstr(begin, "}}"); + } + + if (begin == nullptr || end == nullptr) { + // nothing found, finish node + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector{}, Node::Type::content}); + return nodes; + } + + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + nodes.emplace_back(Node{*tpl, begin - 2, std::vector{}, Node::Type::content}); + + // we found a tag + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + *tpl = end + 2; + switch (*begin) { + case '/': + // finished! 
bail out + return nodes; + + case '#': + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section}); + break; + + case '^': + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section}); + break; + + default: + nodes.emplace_back(Node{begin, end, std::vector{}, Node::Type::tag}); + break; + } + } +} + +static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) { + ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); + bool const matchFirst = n == "-first"; + bool const matchLast = n == "-last"; + if (!matchFirst && !matchLast) { + return false; + } + + bool doWrite = false; + if (n.type == Node::Type::section) { + doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1); + } else if (n.type == Node::Type::inverted_section) { + doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1); + } + + if (doWrite) { + for (auto const& child : n.children) { + if (child.type == Node::Type::content) { + out.write(child.begin, std::distance(child.begin, child.end)); + } + } + } + return true; +} + +static bool matchCmdArgs(std::string const& str, std::vector& matchResult) { + matchResult.clear(); + auto idxOpen = str.find('('); + auto idxClose = str.find(')', idxOpen); + if (idxClose == std::string::npos) { + return false; + } + + matchResult.emplace_back(str.substr(0, idxOpen)); + + // split by comma + matchResult.emplace_back(); + for (size_t i = idxOpen + 1; i != idxClose; ++i) { + if (str[i] == ' ' || str[i] == '\t') { + // skip whitespace + continue; + } + if (str[i] == ',') { + // got a comma => new string + matchResult.emplace_back(); + continue; + } + // no whitespace no comma, append + matchResult.back() += str[i]; + } + return true; +} + +static bool generateConfigTag(Node const& n, Config const& config, 
std::ostream& out) { + using detail::d; + + if (n == "title") { + out << config.mBenchmarkTitle; + return true; + } + if (n == "name") { + out << config.mBenchmarkName; + return true; + } + if (n == "unit") { + out << config.mUnit; + return true; + } + if (n == "batch") { + out << config.mBatch; + return true; + } + if (n == "complexityN") { + out << config.mComplexityN; + return true; + } + if (n == "epochs") { + out << config.mNumEpochs; + return true; + } + if (n == "clockResolution") { + out << d(detail::clockResolution()); + return true; + } + if (n == "clockResolutionMultiple") { + out << config.mClockResolutionMultiple; + return true; + } + if (n == "maxEpochTime") { + out << d(config.mMaxEpochTime); + return true; + } + if (n == "minEpochTime") { + out << d(config.mMinEpochTime); + return true; + } + if (n == "minEpochIterations") { + out << config.mMinEpochIterations; + return true; + } + if (n == "epochIterations") { + out << config.mEpochIterations; + return true; + } + if (n == "warmup") { + out << config.mWarmup; + return true; + } + if (n == "relative") { + out << config.mIsRelative; + return true; + } + return false; +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) { + if (generateConfigTag(n, r.config(), out)) { + return out; + } + // match e.g. 
"median(elapsed)" + // g++ 4.8 doesn't implement std::regex :( + // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$"); + // std::cmatch matchResult; + // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) { + std::vector matchResult; + if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) { + if (matchResult.size() == 2) { + if (matchResult[0] == "context") { + return out << r.context(matchResult[1]); + } + + auto m = Result::fromString(matchResult[1]); + if (m == Result::Measure::_size) { + return out << 0.0; + } + + if (matchResult[0] == "median") { + return out << r.median(m); + } + if (matchResult[0] == "average") { + return out << r.average(m); + } + if (matchResult[0] == "medianAbsolutePercentError") { + return out << r.medianAbsolutePercentError(m); + } + if (matchResult[0] == "sum") { + return out << r.sum(m); + } + if (matchResult[0] == "minimum") { + return out << r.minimum(m); + } + if (matchResult[0] == "maximum") { + return out << r.maximum(m); + } + } else if (matchResult.size() == 3) { + auto m1 = Result::fromString(matchResult[1]); + auto m2 = Result::fromString(matchResult[2]); + if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) { + return out << 0.0; + } + + if (matchResult[0] == "sumProduct") { + return out << r.sumProduct(m1, m2); + } + } + } + + // match e.g. 
"sumProduct(elapsed, iterations)" + // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$"); + + // nothing matches :( + throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood"); +} + +static void generateResultMeasurement(std::vector const& nodes, size_t idx, Result const& r, std::ostream& out) { + for (auto const& n : nodes) { + if (!generateFirstLast(n, idx, r.size(), out)) { + ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); + switch (n.type) { + case Node::Type::content: + out.write(n.begin, std::distance(n.begin, n.end)); + break; + + case Node::Type::inverted_section: + throw std::runtime_error("got a inverted section inside measurement"); + + case Node::Type::section: + throw std::runtime_error("got a section inside measurement"); + + case Node::Type::tag: { + auto m = Result::fromString(std::string(n.begin, n.end)); + if (m == Result::Measure::_size || !r.has(m)) { + out << 0.0; + } else { + out << r.get(idx, m); + } + break; + } + } + } + } +} + +static void generateResult(std::vector const& nodes, size_t idx, std::vector const& results, std::ostream& out) { + auto const& r = results[idx]; + for (auto const& n : nodes) { + if (!generateFirstLast(n, idx, results.size(), out)) { + ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); + switch (n.type) { + case Node::Type::content: + out.write(n.begin, std::distance(n.begin, n.end)); + break; + + case Node::Type::inverted_section: + throw std::runtime_error("got a inverted section inside result"); + + case Node::Type::section: + if (n == "measurement") { + for (size_t i = 0; i < r.size(); ++i) { + generateResultMeasurement(n.children, i, r, out); + } + } else { + throw std::runtime_error("got a section inside result"); + } + break; + + case Node::Type::tag: + generateResultTag(n, r, out); + break; + } + } + } +} + +} // namespace templates + +// helper stuff that only intended to be used internally +namespace detail { + +char const* 
getEnv(char const* name); +bool isEndlessRunning(std::string const& name); +bool isWarningsEnabled(); + +template +T parseFile(std::string const& filename, bool* fail); + +void gatherStabilityInformation(std::vector& warnings, std::vector& recommendations); +void printStabilityInformationOnce(std::ostream* outStream); + +// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry. +uint64_t& singletonHeaderHash() noexcept; + +// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference. +Clock::duration calcClockResolution(size_t numEvaluations) noexcept; + +// formatting utilities +namespace fmt { + +// adds thousands separator to numbers +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +class NumSep : public std::numpunct { +public: + explicit NumSep(char sep); + char do_thousands_sep() const override; + std::string do_grouping() const override; + +private: + char mSep; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +// RAII to save & restore a stream's state +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +class StreamStateRestorer { +public: + explicit StreamStateRestorer(std::ostream& s); + ~StreamStateRestorer(); + + // sets back all stream info that we remembered at construction + void restore(); + + // don't allow copying / moving + StreamStateRestorer(StreamStateRestorer const&) = delete; + StreamStateRestorer& operator=(StreamStateRestorer const&) = delete; + StreamStateRestorer(StreamStateRestorer&&) = delete; + StreamStateRestorer& operator=(StreamStateRestorer&&) = delete; + +private: + std::ostream& mStream; + std::locale mLocale; + std::streamsize const mPrecision; + std::streamsize const mWidth; + std::ostream::char_type const mFill; + std::ostream::fmtflags const mFmtFlags; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +// Number formatter +class Number { +public: + Number(int width, int precision, double value); + Number(int width, int precision, 
int64_t value); + ANKERL_NANOBENCH(NODISCARD) std::string to_s() const; + +private: + friend std::ostream& operator<<(std::ostream& os, Number const& n); + std::ostream& write(std::ostream& os) const; + + int mWidth; + int mPrecision; + double mValue; +}; + +// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent +std::string to_s(uint64_t n); + +std::ostream& operator<<(std::ostream& os, Number const& n); + +class MarkDownColumn { +public: + MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept; + ANKERL_NANOBENCH(NODISCARD) std::string title() const; + ANKERL_NANOBENCH(NODISCARD) std::string separator() const; + ANKERL_NANOBENCH(NODISCARD) std::string invalid() const; + ANKERL_NANOBENCH(NODISCARD) std::string value() const; + +private: + int mWidth; + int mPrecision; + std::string mTitle; + std::string mSuffix; + double mValue; +}; + +// Formats any text as markdown code, escaping backticks. +class MarkDownCode { +public: + explicit MarkDownCode(std::string const& what); + +private: + friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode); + std::ostream& write(std::ostream& os) const; + + std::string mWhat{}; +}; + +std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode); + +} // namespace fmt +} // namespace detail +} // namespace nanobench +} // namespace ankerl + +// implementation ///////////////////////////////////////////////////////////////////////////////// + +namespace ankerl { +namespace nanobench { + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +void render(char const* mustacheTemplate, std::vector const& results, std::ostream& out) { + detail::fmt::StreamStateRestorer const restorer(out); + + out.precision(std::numeric_limits::digits10); + auto nodes = templates::parseMustacheTemplate(&mustacheTemplate); + + for (auto const& n : nodes) { + ANKERL_NANOBENCH_LOG("n.type=" << static_cast(n.type)); + switch (n.type) { + case 
templates::Node::Type::content: + out.write(n.begin, std::distance(n.begin, n.end)); + break; + + case templates::Node::Type::inverted_section: + throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'"); + + case templates::Node::Type::section: + if (n == "result") { + const size_t nbResults = results.size(); + for (size_t i = 0; i < nbResults; ++i) { + generateResult(n.children, i, results, out); + } + } else if (n == "measurement") { + if (results.size() != 1) { + throw std::runtime_error( + "render: can only use section 'measurement' here if there is a single result, but there are " + + detail::fmt::to_s(results.size())); + } + // when we only have a single result, we can immediately go into its measurement. + auto const& r = results.front(); + for (size_t i = 0; i < r.size(); ++i) { + generateResultMeasurement(n.children, i, r, out); + } + } else { + throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'"); + } + break; + + case templates::Node::Type::tag: + if (results.size() == 1) { + // result & config are both supported there + generateResultTag(n, results.front(), out); + } else { + // This just uses the last result's config. 
+ if (!generateConfigTag(n, results.back().config(), out)) { + throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'"); + } + } + break; + } + } +} + +void render(std::string const& mustacheTemplate, std::vector const& results, std::ostream& out) { + render(mustacheTemplate.c_str(), results, out); +} + +void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) { + render(mustacheTemplate, bench.results(), out); +} + +void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) { + render(mustacheTemplate.c_str(), bench.results(), out); +} + +namespace detail { + +PerformanceCounters& performanceCounters() { +# if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wexit-time-destructors" +# endif + static PerformanceCounters pc; +# if defined(__clang__) +# pragma clang diagnostic pop +# endif + return pc; +} + +// Windows version of doNotOptimizeAway +// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514 +// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58 +// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize +# if defined(_MSC_VER) +# pragma optimize("", off) +void doNotOptimizeAwaySink(void const*) {} +# pragma optimize("", on) +# endif + +template +T parseFile(std::string const& filename, bool* fail) { + std::ifstream fin(filename); // NOLINT(misc-const-correctness) + T num{}; + fin >> num; + if (fail != nullptr) { + *fail = fin.fail(); + } + return num; +} + +char const* getEnv(char const* name) { +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4996) // getenv': This function or variable may be unsafe. 
+# endif + return std::getenv(name); // NOLINT(concurrency-mt-unsafe) +# if defined(_MSC_VER) +# pragma warning(pop) +# endif +} + +bool isEndlessRunning(std::string const& name) { + auto const* const endless = getEnv("NANOBENCH_ENDLESS"); + return nullptr != endless && endless == name; +} + +// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0" +bool isWarningsEnabled() { + auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS"); + return nullptr == suppression || suppression == std::string("0"); +} + +void gatherStabilityInformation(std::vector& warnings, std::vector& recommendations) { + warnings.clear(); + recommendations.clear(); + +# if defined(DEBUG) + warnings.emplace_back("DEBUG defined"); + bool const recommendCheckFlags = true; +# else + bool const recommendCheckFlags = false; +# endif + + bool recommendPyPerf = false; +# if defined(__linux__) + auto nprocs = sysconf(_SC_NPROCESSORS_CONF); + if (nprocs <= 0) { + warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible"); + } else { + // check frequency scaling + for (long id = 0; id < nprocs; ++id) { + auto idStr = detail::fmt::to_s(static_cast(id)); + auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr; + auto minFreq = parseFile(sysCpu + "/cpufreq/scaling_min_freq", nullptr); + auto maxFreq = parseFile(sysCpu + "/cpufreq/scaling_max_freq", nullptr); + if (minFreq != maxFreq) { + auto minMHz = d(minFreq) / 1000.0; + auto maxMHz = d(maxFreq) / 1000.0; + warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " + + detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() + + " MHz"); + recommendPyPerf = true; + break; + } + } + + auto fail = false; + auto currentGovernor = parseFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail); + if (!fail && "performance" != currentGovernor) { + warnings.emplace_back("CPU governor is '" + 
currentGovernor + "' but should be 'performance'"); + recommendPyPerf = true; + } + + auto noTurbo = parseFile("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail); + if (!fail && noTurbo == 0) { + warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate"); + recommendPyPerf = true; + } + } +# endif + + if (recommendCheckFlags) { + recommendations.emplace_back("Make sure you compile for Release"); + } + if (recommendPyPerf) { + recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf"); + } +} + +void printStabilityInformationOnce(std::ostream* outStream) { + static bool shouldPrint = true; + if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) { + auto& os = *outStream; + shouldPrint = false; + std::vector warnings; + std::vector recommendations; + gatherStabilityInformation(warnings, recommendations); + if (warnings.empty()) { + return; + } + + os << "Warning, results might be unstable:" << std::endl; + for (auto const& w : warnings) { + os << "* " << w << std::endl; + } + + os << std::endl << "Recommendations" << std::endl; + for (auto const& r : recommendations) { + os << "* " << r << std::endl; + } + } +} + +// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry. +uint64_t& singletonHeaderHash() noexcept { + static uint64_t sHeaderHash{}; + return sHeaderHash; +} + +ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") +inline uint64_t hash_combine(uint64_t seed, uint64_t val) { + return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U)); +} + +// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference. 
+Clock::duration calcClockResolution(size_t numEvaluations) noexcept { + auto bestDuration = Clock::duration::max(); + Clock::time_point tBegin; + Clock::time_point tEnd; + for (size_t i = 0; i < numEvaluations; ++i) { + tBegin = Clock::now(); + do { + tEnd = Clock::now(); + } while (tBegin == tEnd); + bestDuration = (std::min)(bestDuration, tEnd - tBegin); + } + return bestDuration; +} + +// Calculates clock resolution once, and remembers the result +Clock::duration clockResolution() noexcept { + static Clock::duration const sResolution = calcClockResolution(20); + return sResolution; +} + +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +struct IterationLogic::Impl { + enum class State { warmup, upscaling_runtime, measuring, endless }; + + explicit Impl(Bench const& bench) + : mBench(bench) + , mResult(bench.config()) { + printStabilityInformationOnce(mBench.output()); + + // determine target runtime per epoch + mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple(); + if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) { + mTargetRuntimePerEpoch = mBench.maxEpochTime(); + } + if (mTargetRuntimePerEpoch < mBench.minEpochTime()) { + mTargetRuntimePerEpoch = mBench.minEpochTime(); + } + + if (isEndlessRunning(mBench.name())) { + std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl; + mNumIters = (std::numeric_limits::max)(); + mState = State::endless; + } else if (0 != mBench.warmup()) { + mNumIters = mBench.warmup(); + mState = State::warmup; + } else if (0 != mBench.epochIterations()) { + // exact number of iterations + mNumIters = mBench.epochIterations(); + mState = State::measuring; + } else { + mNumIters = mBench.minEpochIterations(); + mState = State::upscaling_runtime; + } + } + + // directly calculates new iters based on elapsed&iters, and adds a 10% noise. Makes sure we don't underflow. 
+ ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept { + auto doubleElapsed = d(elapsed); + auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch); + auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters); + + auto doubleMinEpochIters = d(mBench.minEpochIterations()); + if (doubleNewIters < doubleMinEpochIters) { + doubleNewIters = doubleMinEpochIters; + } + doubleNewIters *= 1.0 + 0.2 * mRng.uniform01(); + + // +0.5 for correct rounding when casting + // NOLINTNEXTLINE(bugprone-incorrect-roundings) + return static_cast(doubleNewIters + 0.5); + } + + ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) { + if (elapsed * 10 < mTargetRuntimePerEpoch) { + // we are far below the target runtime. Multiply iterations by 10 (with overflow check) + if (mNumIters * 10 < mNumIters) { + // overflow :-( + showResult("iterations overflow. Maybe your code got optimized away?"); + mNumIters = 0; + return; + } + mNumIters *= 10; + } else { + mNumIters = calcBestNumIters(elapsed, mNumIters); + } + } + + void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept { +# if defined(ANKERL_NANOBENCH_LOG_ENABLED) + auto oldIters = mNumIters; +# endif + + switch (mState) { + case State::warmup: + if (isCloseEnoughForMeasurements(elapsed)) { + // if elapsed is close enough, we can skip upscaling and go right to measurements + // still, we don't add the result to the measurements. 
+ mState = State::measuring; + mNumIters = calcBestNumIters(elapsed, mNumIters); + } else { + // not close enough: switch to upscaling + mState = State::upscaling_runtime; + upscale(elapsed); + } + break; + + case State::upscaling_runtime: + if (isCloseEnoughForMeasurements(elapsed)) { + // if we are close enough, add measurement and switch to always measuring + mState = State::measuring; + mTotalElapsed += elapsed; + mTotalNumIters += mNumIters; + mResult.add(elapsed, mNumIters, pc); + mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters); + } else { + upscale(elapsed); + } + break; + + case State::measuring: + // just add measurements - no questions asked. Even when runtime is low. But we can't ignore + // that fluctuation, or else we would bias the result + mTotalElapsed += elapsed; + mTotalNumIters += mNumIters; + mResult.add(elapsed, mNumIters, pc); + if (0 != mBench.epochIterations()) { + mNumIters = mBench.epochIterations(); + } else { + mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters); + } + break; + + case State::endless: + mNumIters = (std::numeric_limits::max)(); + break; + } + + if (static_cast(mResult.size()) == mBench.epochs()) { + // we got all the results that we need, finish it + showResult(""); + mNumIters = 0; + } + + ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, " + << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters=" + << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast(mState)); + } + + // NOLINTNEXTLINE(readability-function-cognitive-complexity) + void showResult(std::string const& errorMessage) const { + ANKERL_NANOBENCH_LOG(errorMessage); + + if (mBench.output() != nullptr) { + // prepare column data /////// + std::vector columns; + + auto rMedian = mResult.median(Result::Measure::elapsed); + + if (mBench.relative()) { + double d = 100.0; + if (!mBench.results().empty()) { + d = rMedian <= 0.0 ? 
0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0; + } + columns.emplace_back(11, 1, "relative", "%", d); + } + + if (mBench.complexityN() > 0) { + columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN()); + } + + columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "", + rMedian / (mBench.timeUnit().count() * mBench.batch())); + columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian); + + double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed); + columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0); + + double rInsMedian = -1.0; + if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) { + rInsMedian = mResult.median(Result::Measure::instructions); + columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch()); + } + + double rCycMedian = -1.0; + if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) { + rCycMedian = mResult.median(Result::Measure::cpucycles); + columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch()); + } + if (rInsMedian > 0.0 && rCycMedian > 0.0) { + columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 
0.0 : rInsMedian / rCycMedian); + } + if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) { + double const rBraMedian = mResult.median(Result::Measure::branchinstructions); + columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch()); + if (mResult.has(Result::Measure::branchmisses)) { + double p = 0.0; + if (rBraMedian >= 1e-9) { + p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian; + } + columns.emplace_back(10, 1, "miss%", "%", p); + } + } + + columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed)); + + // write everything + auto& os = *mBench.output(); + + // combine all elements that are relevant for printing the header + uint64_t hash = 0; + hash = hash_combine(std::hash{}(mBench.unit()), hash); + hash = hash_combine(std::hash{}(mBench.title()), hash); + hash = hash_combine(std::hash{}(mBench.timeUnitName()), hash); + hash = hash_combine(std::hash{}(mBench.timeUnit().count()), hash); + hash = hash_combine(std::hash{}(mBench.relative()), hash); + hash = hash_combine(std::hash{}(mBench.performanceCounters()), hash); + + if (hash != singletonHeaderHash()) { + singletonHeaderHash() = hash; + + // no result yet, print header + os << std::endl; + for (auto const& col : columns) { + os << col.title(); + } + os << "| " << mBench.title() << std::endl; + + for (auto const& col : columns) { + os << col.separator(); + } + os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl; + } + + if (!errorMessage.empty()) { + for (auto const& col : columns) { + os << col.invalid(); + } + os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl; + } else { + for (auto const& col : columns) { + os << col.value(); + } + os << "| "; + auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05; + if (showUnstable) { + os << ":wavy_dash: "; + } + os << fmt::MarkDownCode(mBench.name()); + 
if (showUnstable) { + auto avgIters = d(mTotalNumIters) / d(mBench.epochs()); + // NOLINTNEXTLINE(bugprone-incorrect-roundings) + auto suggestedIters = static_cast(avgIters * 10 + 0.5); + + os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters) + << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")"; + } + os << std::endl; + } + } + } + + ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept { + return elapsed * 3 >= mTargetRuntimePerEpoch * 2; + } + + uint64_t mNumIters = 1; // NOLINT(misc-non-private-member-variables-in-classes) + Bench const& mBench; // NOLINT(misc-non-private-member-variables-in-classes) + std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes) + Result mResult; // NOLINT(misc-non-private-member-variables-in-classes) + Rng mRng{123}; // NOLINT(misc-non-private-member-variables-in-classes) + std::chrono::nanoseconds mTotalElapsed{}; // NOLINT(misc-non-private-member-variables-in-classes) + uint64_t mTotalNumIters = 0; // NOLINT(misc-non-private-member-variables-in-classes) + State mState = State::upscaling_runtime; // NOLINT(misc-non-private-member-variables-in-classes) +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +IterationLogic::IterationLogic(Bench const& bench) + : mPimpl(new Impl(bench)) {} + +IterationLogic::~IterationLogic() { + delete mPimpl; +} + +uint64_t IterationLogic::numIters() const noexcept { + ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters); + return mPimpl->mNumIters; +} + +void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept { + mPimpl->add(elapsed, pc); +} + +void IterationLogic::moveResultTo(std::vector& results) noexcept { + results.emplace_back(std::move(mPimpl->mResult)); +} + +# if ANKERL_NANOBENCH(PERF_COUNTERS) + +ANKERL_NANOBENCH(IGNORE_PADDED_PUSH) +class LinuxPerformanceCounters { +public: + struct 
Target { + Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_) + : targetValue(targetValue_) + , correctMeasuringOverhead(correctMeasuringOverhead_) + , correctLoopOverhead(correctLoopOverhead_) {} + + uint64_t* targetValue{}; // NOLINT(misc-non-private-member-variables-in-classes) + bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes) + bool correctLoopOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes) + }; + + LinuxPerformanceCounters() = default; + LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete; + LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete; + LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete; + LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete; + ~LinuxPerformanceCounters(); + + // quick operation + inline void start() {} + + inline void stop() {} + + bool monitor(perf_sw_ids swId, Target target); + bool monitor(perf_hw_id hwId, Target target); + + ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept { + return mHasError; + } + + // Just reading data is faster than enable & disabling. + // we subtract data ourselves. 
+ inline void beginMeasure() { + if (mHasError) { + return; + } + + // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg) + mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + if (mHasError) { + return; + } + + // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg) + mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); + } + + inline void endMeasure() { + if (mHasError) { + return; + } + + // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg) + mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP)); + if (mHasError) { + return; + } + + auto const numBytes = sizeof(uint64_t) * mCounters.size(); + auto ret = read(mFd, mCounters.data(), numBytes); + mHasError = ret != static_cast(numBytes); + } + + void updateResults(uint64_t numIters); + + // rounded integer division + template + static inline T divRounded(T a, T divisor) { + return (a + divisor / 2) / divisor; + } + + ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") + static inline uint32_t mix(uint32_t x) noexcept { + x ^= x << 13U; + x ^= x >> 17U; + x ^= x << 5U; + return x; + } + + template + ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") + void calibrate(Op&& op) { + // clear current calibration data, + for (auto& v : mCalibratedOverhead) { + v = UINT64_C(0); + } + + // create new calibration data + auto newCalibration = mCalibratedOverhead; + for (auto& v : newCalibration) { + v = (std::numeric_limits::max)(); + } + for (size_t iter = 0; iter < 100; ++iter) { + beginMeasure(); + op(); + endMeasure(); + if (mHasError) { + return; + } + + for (size_t i = 0; i < newCalibration.size(); ++i) { + auto diff = mCounters[i]; + if (newCalibration[i] > diff) { + newCalibration[i] = diff; + } + } + } + + mCalibratedOverhead = std::move(newCalibration); + + { + // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles. 
+ // marsaglia's xorshift: mov, sal/shr, xor. Times 3. + // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further. + // see https://godbolt.org/z/49RVQ5 + uint64_t const numIters = 100000U + (std::random_device{}() & 3U); + uint64_t n = numIters; + uint32_t x = 1234567; + + beginMeasure(); + while (n-- > 0) { + x = mix(x); + } + endMeasure(); + detail::doNotOptimizeAway(x); + auto measure1 = mCounters; + + n = numIters; + beginMeasure(); + while (n-- > 0) { + // we now run *twice* so we can easily calculate the overhead + x = mix(x); + x = mix(x); + } + endMeasure(); + detail::doNotOptimizeAway(x); + auto measure2 = mCounters; + + for (size_t i = 0; i < mCounters.size(); ++i) { + // factor 2 because we have two instructions per loop + auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0; + auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0; + auto overhead = m1 * 2 > m2 ? 
m1 * 2 - m2 : 0; + + mLoopOverhead[i] = divRounded(overhead, numIters); + } + } + } + +private: + bool monitor(uint32_t type, uint64_t eventid, Target target); + + std::map mIdToTarget{}; + + // start with minimum size of 3 for read_format + std::vector mCounters{3}; + std::vector mCalibratedOverhead{3}; + std::vector mLoopOverhead{3}; + + uint64_t mTimeEnabledNanos = 0; + uint64_t mTimeRunningNanos = 0; + int mFd = -1; + bool mHasError = false; +}; +ANKERL_NANOBENCH(IGNORE_PADDED_POP) + +LinuxPerformanceCounters::~LinuxPerformanceCounters() { + if (-1 != mFd) { + close(mFd); + } +} + +bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) { + return monitor(PERF_TYPE_SOFTWARE, swId, target); +} + +bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) { + return monitor(PERF_TYPE_HARDWARE, hwId, target); +} + +// overflow is ok, it's checked +ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") +void LinuxPerformanceCounters::updateResults(uint64_t numIters) { + // clear old data + for (auto& id_value : mIdToTarget) { + *id_value.second.targetValue = UINT64_C(0); + } + + if (mHasError) { + return; + } + + mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1]; + mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2]; + + for (uint64_t i = 0; i < mCounters[0]; ++i) { + auto idx = static_cast(3 + i * 2 + 0); + auto id = mCounters[idx + 1U]; + + auto it = mIdToTarget.find(id); + if (it != mIdToTarget.end()) { + + auto& tgt = it->second; + *tgt.targetValue = mCounters[idx]; + if (tgt.correctMeasuringOverhead) { + if (*tgt.targetValue >= mCalibratedOverhead[idx]) { + *tgt.targetValue -= mCalibratedOverhead[idx]; + } else { + *tgt.targetValue = 0U; + } + } + if (tgt.correctLoopOverhead) { + auto correctionVal = mLoopOverhead[idx] * numIters; + if (*tgt.targetValue >= correctionVal) { + *tgt.targetValue -= correctionVal; + } else { + *tgt.targetValue = 0U; + } + } + } + } +} + 
+bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) { + *target.targetValue = (std::numeric_limits::max)(); + if (mHasError) { + return false; + } + + auto pea = perf_event_attr(); + std::memset(&pea, 0, sizeof(perf_event_attr)); + pea.type = type; + pea.size = sizeof(perf_event_attr); + pea.config = eventid; + pea.disabled = 1; // start counter as disabled + pea.exclude_kernel = 1; + pea.exclude_hv = 1; + + // NOLINTNEXTLINE(hicpp-signed-bitwise) + pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + + const int pid = 0; // the current process + const int cpu = -1; // all CPUs +# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14 + const unsigned long flags = PERF_FLAG_FD_CLOEXEC; +# else + const unsigned long flags = 0; +# endif + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + auto fd = static_cast(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags)); + if (-1 == fd) { + return false; + } + if (-1 == mFd) { + // first call: set to fd, and use this from now on + mFd = fd; + } + uint64_t id = 0; + // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg) + if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) { + // couldn't get id + return false; + } + + // insert into map, rely on the fact that map's references are constant. + mIdToTarget.emplace(id, target); + + // prepare readformat with the correct size (after the insert) + auto size = 3 + 2 * mIdToTarget.size(); + mCounters.resize(size); + mCalibratedOverhead.resize(size); + mLoopOverhead.resize(size); + + return true; +} + +PerformanceCounters::PerformanceCounters() + : mPc(new LinuxPerformanceCounters()) + , mVal() + , mHas() { + + // HW events + mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false)); + if (!mHas.cpuCycles) { + // Fallback to cycles counter, reference cycles not available in many systems. 
+ mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false)); + } + mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true)); + mHas.branchInstructions = + mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false)); + mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false)); + // mHas.branchMisses = false; + + // SW events + mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false)); + mHas.contextSwitches = + mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false)); + + mPc->start(); + mPc->calibrate([] { + auto before = ankerl::nanobench::Clock::now(); + auto after = ankerl::nanobench::Clock::now(); + (void)before; + (void)after; + }); + + if (mPc->hasError()) { + // something failed, don't monitor anything. 
+ mHas = PerfCountSet{}; + } +} + +PerformanceCounters::~PerformanceCounters() { + // no need to check for nullptr, delete nullptr has no effect + delete mPc; +} + +void PerformanceCounters::beginMeasure() { + mPc->beginMeasure(); +} + +void PerformanceCounters::endMeasure() { + mPc->endMeasure(); +} + +void PerformanceCounters::updateResults(uint64_t numIters) { + mPc->updateResults(numIters); +} + +# else + +PerformanceCounters::PerformanceCounters() = default; +PerformanceCounters::~PerformanceCounters() = default; +void PerformanceCounters::beginMeasure() {} +void PerformanceCounters::endMeasure() {} +void PerformanceCounters::updateResults(uint64_t) {} + +# endif + +ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& PerformanceCounters::val() const noexcept { + return mVal; +} +ANKERL_NANOBENCH(NODISCARD) PerfCountSet const& PerformanceCounters::has() const noexcept { + return mHas; +} + +// formatting utilities +namespace fmt { + +// adds thousands separator to numbers +NumSep::NumSep(char sep) + : mSep(sep) {} + +char NumSep::do_thousands_sep() const { + return mSep; +} + +std::string NumSep::do_grouping() const { + return "\003"; +} + +// RAII to save & restore a stream's state +StreamStateRestorer::StreamStateRestorer(std::ostream& s) + : mStream(s) + , mLocale(s.getloc()) + , mPrecision(s.precision()) + , mWidth(s.width()) + , mFill(s.fill()) + , mFmtFlags(s.flags()) {} + +StreamStateRestorer::~StreamStateRestorer() { + restore(); +} + +// sets back all stream info that we remembered at construction +void StreamStateRestorer::restore() { + mStream.imbue(mLocale); + mStream.precision(mPrecision); + mStream.width(mWidth); + mStream.fill(mFill); + mStream.flags(mFmtFlags); +} + +Number::Number(int width, int precision, int64_t value) + : mWidth(width) + , mPrecision(precision) + , mValue(d(value)) {} + +Number::Number(int width, int precision, double value) + : mWidth(width) + , mPrecision(precision) + , mValue(value) {} + +std::ostream& 
Number::write(std::ostream& os) const { + StreamStateRestorer const restorer(os); + os.imbue(std::locale(os.getloc(), new NumSep(','))); + os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue; + return os; +} + +std::string Number::to_s() const { + std::stringstream ss; + write(ss); + return ss.str(); +} + +std::string to_s(uint64_t n) { + std::string str; + do { + str += static_cast('0' + static_cast(n % 10)); + n /= 10; + } while (n != 0); + std::reverse(str.begin(), str.end()); + return str; +} + +std::ostream& operator<<(std::ostream& os, Number const& n) { + return n.write(os); +} + +MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept + : mWidth(w) + , mPrecision(prec) + , mTitle(std::move(tit)) + , mSuffix(std::move(suff)) + , mValue(val) {} + +std::string MarkDownColumn::title() const { + std::stringstream ss; + ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' '; + return ss.str(); +} + +std::string MarkDownColumn::separator() const { + std::string sep(static_cast(mWidth), '-'); + sep.front() = '|'; + sep.back() = ':'; + return sep; +} + +std::string MarkDownColumn::invalid() const { + std::string sep(static_cast(mWidth), ' '); + sep.front() = '|'; + sep[sep.size() - 2] = '-'; + return sep; +} + +std::string MarkDownColumn::value() const { + std::stringstream ss; + auto width = mWidth - 2 - static_cast(mSuffix.size()); + ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' '; + return ss.str(); +} + +// Formats any text as markdown code, escaping backticks. 
+MarkDownCode::MarkDownCode(std::string const& what) { + mWhat.reserve(what.size() + 2); + mWhat.push_back('`'); + for (char const c : what) { + mWhat.push_back(c); + if ('`' == c) { + mWhat.push_back('`'); + } + } + mWhat.push_back('`'); +} + +std::ostream& MarkDownCode::write(std::ostream& os) const { + return os << mWhat; +} + +std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) { + return mdCode.write(os); +} +} // namespace fmt +} // namespace detail + +// provide implementation here so it's only generated once +Config::Config() = default; +Config::~Config() = default; +Config& Config::operator=(Config const&) = default; +Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default; +Config::Config(Config const&) = default; +Config::Config(Config&&) noexcept = default; + +// provide implementation here so it's only generated once +Result::~Result() = default; +Result& Result::operator=(Result const&) = default; +Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default; +Result::Result(Result const&) = default; +Result::Result(Result&&) noexcept = default; + +namespace detail { +template +inline constexpr typename std::underlying_type::type u(T val) noexcept { + return static_cast::type>(val); +} +} // namespace detail + +// Result returned after a benchmark has finished. Can be used as a baseline for relative(). 
+Result::Result(Config benchmarkConfig) + : mConfig(std::move(benchmarkConfig)) + , mNameToMeasurements{detail::u(Result::Measure::_size)} {} + +void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) { + using detail::d; + using detail::u; + + double const dIters = d(iters); + mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters); + + mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters); + if (pc.has().pageFaults) { + mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters); + } + if (pc.has().cpuCycles) { + mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters); + } + if (pc.has().contextSwitches) { + mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters); + } + if (pc.has().instructions) { + mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters); + } + if (pc.has().branchInstructions) { + double branchInstructions = 0.0; + // correcting branches: remove branch introduced by the while (...) loop for each iteration. + if (pc.val().branchInstructions > iters + 1U) { + branchInstructions = d(pc.val().branchInstructions - (iters + 1U)); + } + mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters); + + if (pc.has().branchMisses) { + // correcting branch misses + double branchMisses = d(pc.val().branchMisses); + if (branchMisses > branchInstructions) { + // can't have branch misses when there were branches... 
+ branchMisses = branchInstructions; + } + + // assuming at least one missed branch for the loop + branchMisses -= 1.0; + if (branchMisses < 1.0) { + branchMisses = 1.0; + } + mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters); + } + } +} + +Config const& Result::config() const noexcept { + return mConfig; +} + +inline double calcMedian(std::vector& data) { + if (data.empty()) { + return 0.0; + } + std::sort(data.begin(), data.end()); + + auto midIdx = data.size() / 2U; + if (1U == (data.size() & 1U)) { + return data[midIdx]; + } + return (data[midIdx - 1U] + data[midIdx]) / 2U; +} + +double Result::median(Measure m) const { + // create a copy so we can sort + auto data = mNameToMeasurements[detail::u(m)]; + return calcMedian(data); +} + +double Result::average(Measure m) const { + using detail::d; + auto const& data = mNameToMeasurements[detail::u(m)]; + if (data.empty()) { + return 0.0; + } + + // create a copy so we can sort + return sum(m) / d(data.size()); +} + +double Result::medianAbsolutePercentError(Measure m) const { + // create copy + auto data = mNameToMeasurements[detail::u(m)]; + + // calculates MdAPE which is the median of percentage error + // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error + auto med = calcMedian(data); + + // transform the data to absolute error + for (auto& x : data) { + x = (x - med) / x; + if (x < 0) { + x = -x; + } + } + return calcMedian(data); +} + +double Result::sum(Measure m) const noexcept { + auto const& data = mNameToMeasurements[detail::u(m)]; + return std::accumulate(data.begin(), data.end(), 0.0); +} + +double Result::sumProduct(Measure m1, Measure m2) const noexcept { + auto const& data1 = mNameToMeasurements[detail::u(m1)]; + auto const& data2 = mNameToMeasurements[detail::u(m2)]; + + if (data1.size() != data2.size()) { + return 0.0; + } + + double result = 0.0; + for (size_t i = 0, s = data1.size(); i != s; ++i) { + result += 
data1[i] * data2[i]; + } + return result; +} + +bool Result::has(Measure m) const noexcept { + return !mNameToMeasurements[detail::u(m)].empty(); +} + +double Result::get(size_t idx, Measure m) const { + auto const& data = mNameToMeasurements[detail::u(m)]; + return data.at(idx); +} + +bool Result::empty() const noexcept { + return 0U == size(); +} + +size_t Result::size() const noexcept { + auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)]; + return data.size(); +} + +double Result::minimum(Measure m) const noexcept { + auto const& data = mNameToMeasurements[detail::u(m)]; + if (data.empty()) { + return 0.0; + } + + // here its save to assume that at least one element is there + return *std::min_element(data.begin(), data.end()); +} + +double Result::maximum(Measure m) const noexcept { + auto const& data = mNameToMeasurements[detail::u(m)]; + if (data.empty()) { + return 0.0; + } + + // here its save to assume that at least one element is there + return *std::max_element(data.begin(), data.end()); +} + +std::string const& Result::context(char const* variableName) const { + return mConfig.mContext.at(variableName); +} + +std::string const& Result::context(std::string const& variableName) const { + return mConfig.mContext.at(variableName); +} + +Result::Measure Result::fromString(std::string const& str) { + if (str == "elapsed") { + return Measure::elapsed; + } + if (str == "iterations") { + return Measure::iterations; + } + if (str == "pagefaults") { + return Measure::pagefaults; + } + if (str == "cpucycles") { + return Measure::cpucycles; + } + if (str == "contextswitches") { + return Measure::contextswitches; + } + if (str == "instructions") { + return Measure::instructions; + } + if (str == "branchinstructions") { + return Measure::branchinstructions; + } + if (str == "branchmisses") { + return Measure::branchmisses; + } + // not found, return _size + return Measure::_size; +} + +// Configuration of a microbenchmark. 
// A freshly constructed Bench writes its output to std::cout.
Bench::Bench() {
    mConfig.mOut = &std::cout;
}

// Copy/move operations and the destructor are defaulted here, out of line.
Bench::Bench(Bench&&) noexcept = default;
Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;

// Currently configured batch size.
double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

// Currently configured complexity N.
double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}

// Set a baseline to compare it to. 100% means it is exactly as fast as the baseline, >100% means it is faster than the
// baseline, <100% means it is slower than the baseline.
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}
bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}

// Enables or disables showing performance counter columns in the output.
Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}
bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}

// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
// If u differs from currently set unit, the stored results will be cleared.
// Use singular (byte, not bytes).
+Bench& Bench::unit(char const* u) { + if (u != mConfig.mUnit) { + mResults.clear(); + } + mConfig.mUnit = u; + return *this; +} + +Bench& Bench::unit(std::string const& u) { + return unit(u.c_str()); +} + +std::string const& Bench::unit() const noexcept { + return mConfig.mUnit; +} + +Bench& Bench::timeUnit(std::chrono::duration const& tu, std::string const& tuName) { + mConfig.mTimeUnit = tu; + mConfig.mTimeUnitName = tuName; + return *this; +} + +std::string const& Bench::timeUnitName() const noexcept { + return mConfig.mTimeUnitName; +} + +std::chrono::duration const& Bench::timeUnit() const noexcept { + return mConfig.mTimeUnit; +} + +// If benchmarkTitle differs from currently set title, the stored results will be cleared. +Bench& Bench::title(const char* benchmarkTitle) { + if (benchmarkTitle != mConfig.mBenchmarkTitle) { + mResults.clear(); + } + mConfig.mBenchmarkTitle = benchmarkTitle; + return *this; +} +Bench& Bench::title(std::string const& benchmarkTitle) { + if (benchmarkTitle != mConfig.mBenchmarkTitle) { + mResults.clear(); + } + mConfig.mBenchmarkTitle = benchmarkTitle; + return *this; +} + +std::string const& Bench::title() const noexcept { + return mConfig.mBenchmarkTitle; +} + +Bench& Bench::name(const char* benchmarkName) { + mConfig.mBenchmarkName = benchmarkName; + return *this; +} + +Bench& Bench::name(std::string const& benchmarkName) { + mConfig.mBenchmarkName = benchmarkName; + return *this; +} + +std::string const& Bench::name() const noexcept { + return mConfig.mBenchmarkName; +} + +Bench& Bench::context(char const* variableName, char const* variableValue) { + mConfig.mContext[variableName] = variableValue; + return *this; +} + +Bench& Bench::context(std::string const& variableName, std::string const& variableValue) { + mConfig.mContext[variableName] = variableValue; + return *this; +} + +Bench& Bench::clearContext() { + mConfig.mContext.clear(); + return *this; +} + +// Number of epochs to evaluate. 
The reported result will be the median of evaluation of each epoch. +Bench& Bench::epochs(size_t numEpochs) noexcept { + mConfig.mNumEpochs = numEpochs; + return *this; +} +size_t Bench::epochs() const noexcept { + return mConfig.mNumEpochs; +} + +// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision. +Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept { + mConfig.mClockResolutionMultiple = multiple; + return *this; +} +size_t Bench::clockResolutionMultiple() const noexcept { + return mConfig.mClockResolutionMultiple; +} + +// Sets the maximum time each epoch should take. Default is 100ms. +Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept { + mConfig.mMaxEpochTime = t; + return *this; +} +std::chrono::nanoseconds Bench::maxEpochTime() const noexcept { + return mConfig.mMaxEpochTime; +} + +// Sets the maximum time each epoch should take. Default is 100ms. +Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept { + mConfig.mMinEpochTime = t; + return *this; +} +std::chrono::nanoseconds Bench::minEpochTime() const noexcept { + return mConfig.mMinEpochTime; +} + +Bench& Bench::minEpochIterations(uint64_t numIters) noexcept { + mConfig.mMinEpochIterations = (numIters == 0) ? 
1 : numIters; + return *this; +} +uint64_t Bench::minEpochIterations() const noexcept { + return mConfig.mMinEpochIterations; +} + +Bench& Bench::epochIterations(uint64_t numIters) noexcept { + mConfig.mEpochIterations = numIters; + return *this; +} +uint64_t Bench::epochIterations() const noexcept { + return mConfig.mEpochIterations; +} + +Bench& Bench::warmup(uint64_t numWarmupIters) noexcept { + mConfig.mWarmup = numWarmupIters; + return *this; +} +uint64_t Bench::warmup() const noexcept { + return mConfig.mWarmup; +} + +Bench& Bench::config(Config const& benchmarkConfig) { + mConfig = benchmarkConfig; + return *this; +} +Config const& Bench::config() const noexcept { + return mConfig; +} + +Bench& Bench::output(std::ostream* outstream) noexcept { + mConfig.mOut = outstream; + return *this; +} + +ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept { + return mConfig.mOut; +} + +std::vector const& Bench::results() const noexcept { + return mResults; +} + +Bench& Bench::render(char const* templateContent, std::ostream& os) { + ::ankerl::nanobench::render(templateContent, *this, os); + return *this; +} + +Bench& Bench::render(std::string const& templateContent, std::ostream& os) { + ::ankerl::nanobench::render(templateContent, *this, os); + return *this; +} + +std::vector Bench::complexityBigO() const { + std::vector bigOs; + auto rangeMeasure = BigO::collectRangeMeasure(mResults); + bigOs.emplace_back("O(1)", rangeMeasure, [](double) { + return 1.0; + }); + bigOs.emplace_back("O(n)", rangeMeasure, [](double n) { + return n; + }); + bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) { + return std::log2(n); + }); + bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) { + return n * std::log2(n); + }); + bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) { + return n * n; + }); + bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) { + return n * n * n; + }); + std::sort(bigOs.begin(), bigOs.end()); + return bigOs; +} + 
+Rng::Rng() + : mX(0) + , mY(0) { + std::random_device rd; + std::uniform_int_distribution dist; + do { + mX = dist(rd); + mY = dist(rd); + } while (mX == 0 && mY == 0); +} + +ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") +uint64_t splitMix64(uint64_t& state) noexcept { + uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15)); + z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9); + z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb); + return z ^ (z >> 31U); +} + +// Seeded as described in romu paper (update april 2020) +Rng::Rng(uint64_t seed) noexcept + : mX(splitMix64(seed)) + , mY(splitMix64(seed)) { + for (size_t i = 0; i < 10; ++i) { + operator()(); + } +} + +// only internally used to copy the RNG. +Rng::Rng(uint64_t x, uint64_t y) noexcept + : mX(x) + , mY(y) {} + +Rng Rng::copy() const noexcept { + return Rng{mX, mY}; +} + +Rng::Rng(std::vector const& data) + : mX(0) + , mY(0) { + if (data.size() != 2) { + throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " + + detail::fmt::to_s(data.size())); + } + mX = data[0]; + mY = data[1]; +} + +std::vector Rng::state() const { + std::vector data(2); + data[0] = mX; + data[1] = mY; + return data; +} + +BigO::RangeMeasure BigO::collectRangeMeasure(std::vector const& results) { + BigO::RangeMeasure rangeMeasure; + for (auto const& result : results) { + if (result.config().mComplexityN > 0.0) { + rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed)); + } + } + return rangeMeasure; +} + +BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure) + : mName(std::move(bigOName)) { + + // estimate the constant factor + double sumRangeMeasure = 0.0; + double sumRangeRange = 0.0; + + for (const auto& rm : rangeMeasure) { + sumRangeMeasure += rm.first * rm.second; + sumRangeRange += rm.first * rm.first; + } + mConstant = sumRangeMeasure / sumRangeRange; + + // calculate root mean square + double err = 0.0; + double sumMeasure 
= 0.0; + for (const auto& rm : rangeMeasure) { + auto diff = mConstant * rm.first - rm.second; + err += diff * diff; + + sumMeasure += rm.second; + } + + auto n = detail::d(rangeMeasure.size()); + auto mean = sumMeasure / n; + mNormalizedRootMeanSquare = std::sqrt(err / n) / mean; +} + +BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure) + : BigO(std::string(bigOName), rangeMeasure) {} + +std::string const& BigO::name() const noexcept { + return mName; +} + +double BigO::constant() const noexcept { + return mConstant; +} + +double BigO::normalizedRootMeanSquare() const noexcept { + return mNormalizedRootMeanSquare; +} + +bool BigO::operator<(BigO const& other) const noexcept { + return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName); +} + +std::ostream& operator<<(std::ostream& os, BigO const& bigO) { + return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare(); +} + +std::ostream& operator<<(std::ostream& os, std::vector const& bigOs) { + detail::fmt::StreamStateRestorer const restorer(os); + os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl; + for (auto const& bigO : bigOs) { + os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " "; + os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% "; + os << "| " << bigO.name(); + os << std::endl; + } + return os; +} + +} // namespace nanobench +} // namespace ankerl + +#endif // ANKERL_NANOBENCH_IMPLEMENT +#endif // ANKERL_NANOBENCH_H_INCLUDED diff --git a/bench/profile.sh b/bench/profile.sh new file mode 100755 index 0000000..98e0016 --- /dev/null +++ b/bench/profile.sh @@ -0,0 +1,104 @@ +#!/bin/sh +# Profile UDPspeeder on target hardware. 
+# Usage: ./profile.sh [results_dir] +# +# Expects bench_udpspeeder_static and test_udpspeeder_static in the +# same directory as this script (or current directory). +# Outputs results to results_dir (default: ./profile_results/). + +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" 2>/dev/null && pwd)" || SCRIPT_DIR="." +RESULTS_DIR="${1:-./profile_results}" +mkdir -p "$RESULTS_DIR" + +# Find binaries: same dir as script, then cwd +find_bin() { + if [ -x "$SCRIPT_DIR/$1" ]; then echo "$SCRIPT_DIR/$1" + elif [ -x "./$1" ]; then echo "./$1" + else echo ""; fi +} + +BENCH_BIN="$(find_bin bench_udpspeeder_static)" +TEST_BIN="$(find_bin test_udpspeeder_static)" + +if [ -z "$BENCH_BIN" ]; then + echo "ERROR: bench_udpspeeder_static not found" >&2 + exit 1 +fi + +echo "=== UDPspeeder Profiling ===" +echo "Date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" +echo "Host: $(hostname 2>/dev/null || echo unknown)" +echo "" + +# --- System info --- +INFO="$RESULTS_DIR/system_info.txt" +{ + echo "hostname: $(hostname 2>/dev/null || echo unknown)" + echo "date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" + echo "uname: $(uname -a)" + echo "arch: $(uname -m)" + echo "" + + if [ -f /proc/cpuinfo ]; then + echo "--- /proc/cpuinfo (first core) ---" + # Print until first blank line (= first core only) + sed '/^$/q' /proc/cpuinfo + echo "" + + # Core count + cores=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null || echo "?") + echo "core_count: $cores" + echo "" + fi + + # CPU frequency if available + if [ -d /sys/devices/system/cpu/cpu0/cpufreq ]; then + echo "--- cpufreq ---" + for f in scaling_cur_freq scaling_min_freq scaling_max_freq scaling_governor; do + p="/sys/devices/system/cpu/cpu0/cpufreq/$f" + [ -f "$p" ] && echo "$f: $(cat "$p")" + done + echo "" + fi +} > "$INFO" 2>&1 +echo "System info: $INFO" + +# --- Tests --- +if [ -n "$TEST_BIN" ]; then + echo "" + echo "--- Running tests ---" + TEST_LOG="$RESULTS_DIR/test_output.txt" + if "$TEST_BIN" > "$TEST_LOG" 2>&1; then + echo "Tests: PASSED" + 
else + echo "Tests: FAILED (see $TEST_LOG)" >&2 + cat "$TEST_LOG" + exit 1 + fi +else + echo "WARNING: test_udpspeeder_static not found, skipping tests" >&2 +fi + +# --- Benchmarks --- +echo "" +echo "--- Running benchmarks ---" + +# Human-readable output (tee to both console and file) +BENCH_LOG="$RESULTS_DIR/bench_output.txt" +"$BENCH_BIN" 2>&1 | tee "$BENCH_LOG" + +# JSON output for machine consumption +BENCH_JSON="$RESULTS_DIR/bench_results.json" +"$BENCH_BIN" --json 2>/dev/null +if [ -f bench_results.json ]; then + mv bench_results.json "$BENCH_JSON" + echo "" + echo "JSON results: $BENCH_JSON" +fi + +echo "" +echo "=== Done ===" +echo "Results in: $RESULTS_DIR/" +ls -la "$RESULTS_DIR/" diff --git a/bench/test_crc32.cpp b/bench/test_crc32.cpp new file mode 100644 index 0000000..7230ec4 --- /dev/null +++ b/bench/test_crc32.cpp @@ -0,0 +1,190 @@ +#include "bench_common.h" +#include "crc32c.h" +#include "crc32/Crc32.h" +#include +#include +#include + +#define TEST(name, expr) do { \ + if (!(expr)) { printf(" FAIL: %s\n", name); failures++; } \ + else { printf(" ok: %s\n", name); } \ +} while(0) + +/* --- Old CRC32 (zlib polynomial) baseline regression anchor --- */ + +static int test_crc32_old_known_answer() { + int failures = 0; + + /* Standard test vector: CRC32 of "123456789" = 0xCBF43926 */ + const char *tv = "123456789"; + uint32_t got = crc32_fast(tv, 9); + char msg[128]; + snprintf(msg, sizeof(msg), "crc32_old(\"123456789\") = 0x%08X (expected 0xCBF43926)", got); + TEST(msg, got == 0xCBF43926); + + /* Empty input */ + uint32_t empty = crc32_fast("", 0); + snprintf(msg, sizeof(msg), "crc32_old(\"\") = 0x%08X (expected 0x00000000)", empty); + TEST(msg, empty == 0x00000000); + + return failures; +} + +/* --- CRC32C (Castagnoli) known-answer tests --- */ + +static int test_crc32c_known_answer() { + int failures = 0; + char msg[128]; + + /* IETF/SCTP standard test vector: CRC32C of "123456789" = 0xE3069283 */ + const char *tv = "123456789"; + uint32_t sw = 
crc32c_sw(tv, 9); + snprintf(msg, sizeof(msg), "crc32c_sw(\"123456789\") = 0x%08X (expected 0xE3069283)", sw); + TEST(msg, sw == 0xE3069283); + + /* Empty input */ + uint32_t empty = crc32c_sw("", 0); + snprintf(msg, sizeof(msg), "crc32c_sw(\"\") = 0x%08X (expected 0x00000000)", empty); + TEST(msg, empty == 0x00000000); + + /* Dispatched version should agree */ + uint32_t dispatched = crc32c(tv, 9); + snprintf(msg, sizeof(msg), "crc32c(\"123456789\") = 0x%08X (expected 0xE3069283)", dispatched); + TEST(msg, dispatched == 0xE3069283); + + return failures; +} + +/* --- Hardware vs software agreement --- */ + +static int test_crc32c_hw_sw_agree() { + int failures = 0; + char msg[128]; + + if (!crc32c_has_hw()) { + printf(" skip: no CRC32C hardware support detected\n"); + return 0; + } + + /* Test across various sizes and data patterns */ + for (int i = 0; i < bench_sizes_count; i++) { + size_t sz = bench_sizes[i]; + char *buf = (char *)malloc(sz); + for (size_t j = 0; j < sz; j++) + buf[j] = (char)((j * 13 + 7) & 0xFF); + + uint32_t sw = crc32c_sw(buf, sz); + uint32_t hw = crc32c_hw(buf, sz); + + snprintf(msg, sizeof(msg), "crc32c hw==sw at %zu bytes (sw=0x%08X hw=0x%08X)", + sz, sw, hw); + TEST(msg, sw == hw); + + free(buf); + } + + /* Also test odd sizes that stress alignment/tail handling */ + int odd_sizes[] = {1, 3, 7, 15, 31, 63, 127, 255, 1023, 1499}; + for (int s = 0; s < (int)(sizeof(odd_sizes)/sizeof(odd_sizes[0])); s++) { + int sz = odd_sizes[s]; + char *buf = (char *)malloc(sz); + for (int j = 0; j < sz; j++) + buf[j] = (char)((j * 41 + 3) & 0xFF); + + uint32_t sw = crc32c_sw(buf, sz); + uint32_t hw = crc32c_hw(buf, sz); + + snprintf(msg, sizeof(msg), "crc32c hw==sw at %d bytes (odd)", sz); + TEST(msg, sw == hw); + + free(buf); + } + + return failures; +} + +/* --- Incremental chaining --- */ + +static int test_crc32c_chaining() { + int failures = 0; + char msg[128]; + const int sz = 1024; + char buf[1024]; + + for (int i = 0; i < sz; i++) + buf[i] = 
(char)((i * 17 + 5) & 0xFF); + + /* Full CRC in one shot */ + uint32_t full = crc32c(buf, sz); + + /* CRC in two halves, chained */ + uint32_t first_half = crc32c(buf, sz / 2); + uint32_t chained = crc32c(buf + sz / 2, sz / 2, first_half); + + snprintf(msg, sizeof(msg), + "crc32c chaining: full=0x%08X chained=0x%08X", full, chained); + TEST(msg, full == chained); + + return failures; +} + +static int test_crc32c_unaligned() { + int failures = 0; + char msg[128]; + + int sizes[] = {64, 256, 1500}; + int offsets[] = {1, 3}; + + for (int si = 0; si < 3; si++) { + int sz = sizes[si]; + /* Allocate with extra headroom for offsets */ + char *raw = (char *)malloc(sz + 8); + for (int i = 0; i < sz + 8; i++) + raw[i] = (char)((i * 41 + 3) & 0xFF); + + for (int oi = 0; oi < 2; oi++) { + int off = offsets[oi]; + int len = sz - off; + + uint32_t sw = crc32c_sw(raw + off, len); + uint32_t dispatched = crc32c(raw + off, len); + + snprintf(msg, sizeof(msg), + "crc32c unaligned off=%d len=%d: dispatched==sw (0x%08X)", + off, len, sw); + TEST(msg, dispatched == sw); + + if (crc32c_has_hw()) { + uint32_t hw = crc32c_hw(raw + off, len); + snprintf(msg, sizeof(msg), + "crc32c unaligned off=%d len=%d: hw==sw (0x%08X)", + off, len, sw); + TEST(msg, hw == sw); + } + } + free(raw); + } + + return failures; +} + +int run_crc32_tests() { + int failures = 0; + + printf("[CRC32 old known-answer]\n"); + failures += test_crc32_old_known_answer(); + + printf("[CRC32C known-answer]\n"); + failures += test_crc32c_known_answer(); + + printf("[CRC32C hw vs sw agreement]\n"); + failures += test_crc32c_hw_sw_agree(); + + printf("[CRC32C incremental chaining]\n"); + failures += test_crc32c_chaining(); + + printf("[CRC32C unaligned input]\n"); + failures += test_crc32c_unaligned(); + + return failures; +} diff --git a/bench/test_fec.cpp b/bench/test_fec.cpp new file mode 100644 index 0000000..6687d13 --- /dev/null +++ b/bench/test_fec.cpp @@ -0,0 +1,263 @@ +#include "bench_common.h" +#include "lib/rs.h" 
+#include +#include +#include + +#define TEST(name, expr) do { \ + if (!(expr)) { printf(" FAIL: %s\n", name); failures++; } \ + else { printf(" ok: %s\n", name); } \ +} while(0) + +static void fill_pattern(char *buf, int sz, int seed) { + for (int i = 0; i < sz; i++) + buf[i] = (char)((i + seed) & 0xFF); +} + +static int test_addmul1_identity() { + int failures = 0; + const int sz = 256; + gf dst[256], src[256], expected[256]; + + /* Multiply by 1: dst ^= src * 1 == dst ^= src */ + memset(dst, 0, sz); + for (int i = 0; i < sz; i++) src[i] = (gf)(i & 0xFF); + + bench_addmul1(dst, src, 1, sz); + + TEST("addmul1(dst=0, src, c=1) == src", memcmp(dst, src, sz) == 0); + + /* Multiply by 0: dst should be unchanged */ + for (int i = 0; i < sz; i++) dst[i] = (gf)i; + memcpy(expected, dst, sz); + + bench_addmul1(dst, src, 0, sz); + + TEST("addmul1(dst, src, c=0) leaves dst unchanged", memcmp(dst, expected, sz) == 0); + + return failures; +} + +static int test_addmul1_linearity() { + int failures = 0; + const int sz = 256; + gf src[256], dst_a[256], dst_b[256], dst_ab[256]; + + for (int i = 0; i < sz; i++) src[i] = (gf)((i * 37 + 11) & 0xFF); + + /* addmul1(c=a) then addmul1(c=b) should equal addmul1(c=a^b) + * in GF(2^8), addition is XOR, but multiplication distributes: + * src*a XOR src*b == src*(a XOR b) [only in GF(2^n)] + * We verify this by running both paths. 
*/ + gf a = 0x53, b = 0xCA; + + memset(dst_a, 0, sz); + bench_addmul1(dst_a, src, a, sz); + bench_addmul1(dst_a, src, b, sz); + + memset(dst_ab, 0, sz); + /* In GF(2^8), src*a ^ src*b = src*(a^b) */ + bench_addmul1(dst_ab, src, a ^ b, sz); + + TEST("addmul1 linearity: (src*a)^(src*b) == src*(a^b)", + memcmp(dst_a, dst_ab, sz) == 0); + + return failures; +} + +static int test_addmul1_sizes() { + int failures = 0; + + /* Test that addmul1 works at all benchmark sizes (catches off-by-one in unrolling) */ + for (int i = 0; i < bench_sizes_count; i++) { + int sz = (int)bench_sizes[i]; + gf *dst = (gf *)calloc(sz, 1); + gf *src = (gf *)calloc(sz, 1); + + for (int j = 0; j < sz; j++) src[j] = (gf)((j * 7) & 0xFF); + bench_addmul1(dst, src, 1, sz); + + char name[64]; + snprintf(name, sizeof(name), "addmul1 c=1 at %d bytes", sz); + TEST(name, memcmp(dst, src, sz) == 0); + + free(dst); + free(src); + } + return failures; +} + +static int test_rs_roundtrip(int k, int n, int pkt_sz) { + int failures = 0; + int redundant = n - k; + char label[64]; + snprintf(label, sizeof(label), "rs round-trip k=%d n=%d sz=%d", k, n, pkt_sz); + + /* Allocate and fill original data */ + char **data = (char **)calloc(n, sizeof(char *)); + char **orig = (char **)calloc(k, sizeof(char *)); + for (int i = 0; i < n; i++) + data[i] = (char *)calloc(1, pkt_sz); + for (int i = 0; i < k; i++) { + fill_pattern(data[i], pkt_sz, i * 31); + orig[i] = (char *)calloc(1, pkt_sz); + memcpy(orig[i], data[i], pkt_sz); + } + + /* Encode */ + rs_encode2(k, n, data, pkt_sz); + + /* Simulate losing the first 'redundant' data packets */ + for (int i = 0; i < redundant; i++) + data[i] = NULL; + + /* Decode */ + int rc = rs_decode2(k, n, data, pkt_sz); + if (rc != 0) { + snprintf(label, sizeof(label), "rs_decode2 returned %d for k=%d n=%d", rc, k, n); + TEST(label, 0); + goto cleanup; + } + + /* Verify recovered data matches originals */ + for (int i = 0; i < k; i++) { + snprintf(label, sizeof(label), "rs data[%d] 
matches (k=%d n=%d)", i, k, n); + TEST(label, data[i] != NULL && memcmp(data[i], orig[i], pkt_sz) == 0); + } + +cleanup: + /* Free all non-null pointers in data[] (decode may have rearranged them) */ + /* Since rs_decode reuses memory, we need to track what was allocated */ + /* Simple approach: free orig separately, free remaining data bufs */ + for (int i = 0; i < k; i++) free(orig[i]); + free(orig); + /* data[] pointers may alias the original allocations; the calloc'd buffers + * that weren't NULLed are still valid. We allocated n buffers initially, + * NULLed 'redundant' of them. The decode reused the non-null ones. + * Since we can't easily track which are unique, just leak here — it's a test. */ + free(data); + + return failures; +} + +/* + * RS round-trip losing specific shard indices (instead of always the first r). + * lose_idx[0..lose_count-1] lists which of the n shards to NULL before decode. + */ +static int test_rs_roundtrip_pattern(int k, int n, int pkt_sz, + const int *lose_idx, int lose_count, + const char *pattern_name) { + int failures = 0; + int redundant = n - k; + char label[128]; + + if (lose_count > redundant) { + snprintf(label, sizeof(label), "rs k=%d n=%d %s: lose_count(%d) > redundant(%d)", + k, n, pattern_name, lose_count, redundant); + TEST(label, 0); + return failures; + } + + char **data = (char **)calloc(n, sizeof(char *)); + char **orig = (char **)calloc(k, sizeof(char *)); + for (int i = 0; i < n; i++) + data[i] = (char *)calloc(1, pkt_sz); + for (int i = 0; i < k; i++) { + fill_pattern(data[i], pkt_sz, i * 31); + orig[i] = (char *)calloc(1, pkt_sz); + memcpy(orig[i], data[i], pkt_sz); + } + + rs_encode2(k, n, data, pkt_sz); + + for (int i = 0; i < lose_count; i++) + data[lose_idx[i]] = NULL; + + int rc = rs_decode2(k, n, data, pkt_sz); + snprintf(label, sizeof(label), "rs k=%d n=%d %s: decode ok", k, n, pattern_name); + if (rc != 0) { + TEST(label, 0); + goto cleanup; + } + TEST(label, 1); + + for (int i = 0; i < k; i++) { + 
snprintf(label, sizeof(label), "rs k=%d n=%d %s: data[%d] matches", + k, n, pattern_name, i); + TEST(label, data[i] != NULL && memcmp(data[i], orig[i], pkt_sz) == 0); + } + +cleanup: + for (int i = 0; i < k; i++) free(orig[i]); + free(orig); + free(data); + return failures; +} + +int run_fec_tests() { + int failures = 0; + + /* GF tables are initialized inside fec_new; force init via a dummy allocation */ + void *dummy = fec_new(2, 3); + fec_free(dummy); + + printf("[addmul1 identity]\n"); + failures += test_addmul1_identity(); + + printf("[addmul1 linearity]\n"); + failures += test_addmul1_linearity(); + + printf("[addmul1 sizes]\n"); + failures += test_addmul1_sizes(); + + printf("[rs round-trip: lose first r]\n"); + failures += test_rs_roundtrip(2, 4, 1500); + failures += test_rs_roundtrip(5, 8, 1500); + failures += test_rs_roundtrip(10, 15, 1024); + + /* Additional k/n combos */ + printf("[rs round-trip: more k/n combos]\n"); + failures += test_rs_roundtrip(1, 2, 1500); + failures += test_rs_roundtrip(1, 3, 1500); + failures += test_rs_roundtrip(20, 30, 1024); + failures += test_rs_roundtrip(50, 75, 512); + + /* Diverse loss patterns */ + printf("[rs round-trip: lose last r]\n"); + { + /* k=5 n=8: lose shards 5,6,7 (last 3) */ + int lose[] = {5, 6, 7}; + failures += test_rs_roundtrip_pattern(5, 8, 1500, lose, 3, "lose-last"); + } + { + /* k=10 n=15: lose shards 10,11,12,13,14 */ + int lose[] = {10, 11, 12, 13, 14}; + failures += test_rs_roundtrip_pattern(10, 15, 1024, lose, 5, "lose-last"); + } + + printf("[rs round-trip: lose every-other]\n"); + { + /* k=5 n=8: lose shards 0,2,4 (every other, 3 lost = r) */ + int lose[] = {0, 2, 4}; + failures += test_rs_roundtrip_pattern(5, 8, 1500, lose, 3, "lose-evens"); + } + { + /* k=10 n=15: lose shards 1,3,5,7,9 (odd indices, 5 lost = r) */ + int lose[] = {1, 3, 5, 7, 9}; + failures += test_rs_roundtrip_pattern(10, 15, 1024, lose, 5, "lose-odds"); + } + + printf("[rs round-trip: lose middle]\n"); + { + /* k=5 n=8: lose 
shards 2,3,4 (middle) */ + int lose[] = {2, 3, 4}; + failures += test_rs_roundtrip_pattern(5, 8, 1500, lose, 3, "lose-middle"); + } + { + /* k=20 n=30: lose shards 5,10,15,20,25,6,11,16,21,26 (scattered) */ + int lose[] = {5, 6, 10, 11, 15, 16, 20, 21, 25, 26}; + failures += test_rs_roundtrip_pattern(20, 30, 512, lose, 10, "lose-scattered"); + } + + return failures; +} diff --git a/bench/test_main.cpp b/bench/test_main.cpp new file mode 100644 index 0000000..90b3421 --- /dev/null +++ b/bench/test_main.cpp @@ -0,0 +1,23 @@ +#include "bench_common.h" +#include + +int main() { + int failures = 0; + + printf("=== FEC Tests ===\n"); + failures += run_fec_tests(); + + printf("\n=== CRC32 Tests ===\n"); + failures += run_crc32_tests(); + + printf("\n=== Packet Cook Tests ===\n"); + failures += run_packet_tests(); + + printf("\n"); + if (failures == 0) + printf("All tests passed.\n"); + else + printf("%d test(s) FAILED.\n", failures); + + return failures > 0 ? 1 : 0; +} diff --git a/bench/test_packet.cpp b/bench/test_packet.cpp new file mode 100644 index 0000000..9259fba --- /dev/null +++ b/bench/test_packet.cpp @@ -0,0 +1,275 @@ +#include "bench_common.h" +#include "packet_cook.h" +#include +#include +#include + +/* Stubs for packet_cook.cpp dependencies — production uses common.cpp */ +#ifndef BENCH_PACKET_STUBS_DEFINED +#define BENCH_PACKET_STUBS_DEFINED +void get_fake_random_chars(char *s, int len) { + for (int i = 0; i < len; i++) + s[i] = (char)(rand() & 0xFF); +} + +int random_between(unsigned int a, unsigned int b) { + if (a == b) return (int)a; + return (int)(a + (unsigned int)rand() % (b + 1 - a)); +} +#endif + +#define TEST(name, expr) do { \ + if (!(expr)) { printf(" FAIL: %s\n", name); failures++; } \ + else { printf(" ok: %s\n", name); } \ +} while(0) + +static int test_cook_roundtrip() { + int failures = 0; + cook_ctx_t ctx = {}; + strcpy(ctx.key, "testkey123"); + cook_ctx_prepare_key(&ctx); + ctx.iv_min = 4; + ctx.iv_max = 32; + + int sizes[] = { 1, 16, 64, 
256, 1024, 1500 }; + int nsizes = sizeof(sizes) / sizeof(sizes[0]); + + for (int s = 0; s < nsizes; s++) { + int sz = sizes[s]; + char orig[4096], buf[4096]; + + /* Fill with pattern */ + for (int i = 0; i < sz; i++) + orig[i] = (char)((i * 37 + 11) & 0xFF); + memcpy(buf, orig, sz); + + int len = sz; + int rc = do_cook(&ctx, buf, len); + + char label[80]; + snprintf(label, sizeof(label), "do_cook succeeds at %d bytes", sz); + TEST(label, rc == 0 && len > sz); + + rc = de_cook(&ctx, buf, len); + snprintf(label, sizeof(label), "de_cook succeeds at %d bytes", sz); + TEST(label, rc == 0 && len == sz); + + snprintf(label, sizeof(label), "round-trip data matches at %d bytes", sz); + TEST(label, memcmp(buf, orig, sz) == 0); + } + + return failures; +} + +static int test_cook_checksum_only() { + int failures = 0; + cook_ctx_t ctx = {}; + ctx.iv_min = 4; + ctx.iv_max = 32; + ctx.disable_obscure = 1; + ctx.disable_xor = 1; + + char orig[1600], buf[1600]; + int sz = 100; + for (int i = 0; i < sz; i++) orig[i] = (char)i; + memcpy(buf, orig, sz); + + int len = sz; + do_cook(&ctx, buf, len); + TEST("checksum adds 4 bytes", len == sz + 4); + + int rc = de_cook(&ctx, buf, len); + TEST("checksum round-trip succeeds", rc == 0 && len == sz); + TEST("checksum data matches", memcmp(buf, orig, sz) == 0); + + /* Corrupt a byte and verify detection */ + memcpy(buf, orig, sz); + len = sz; + do_cook(&ctx, buf, len); + buf[0] ^= 0x01; + rc = de_cook(&ctx, buf, len); + TEST("checksum detects corruption", rc != 0); + + return failures; +} + +static int test_cook_disabled() { + int failures = 0; + cook_ctx_t ctx = {}; + ctx.iv_min = 4; + ctx.iv_max = 32; + ctx.disable_checksum = 1; + ctx.disable_obscure = 1; + ctx.disable_xor = 1; + + char buf[256]; + int sz = 100; + for (int i = 0; i < sz; i++) buf[i] = (char)i; + char orig[256]; + memcpy(orig, buf, sz); + + int len = sz; + do_cook(&ctx, buf, len); + TEST("all disabled: length unchanged", len == sz); + TEST("all disabled: data unchanged", 
memcmp(buf, orig, sz) == 0); + + return failures; +} + +static int test_xor_tile_roundtrip() { + int failures = 0; + int vec_w = bench_cook_vec_width(); + char label[128]; + + int tile_lens[] = {vec_w, vec_w * 2, vec_w * 5}; + int num_tiles = 3; + int data_lens[] = {1, 7, 8, 15, 16, 31, 32, 63, 64, 1500}; + int num_datas = 10; + int offsets[] = {0, 1, 3, 7}; + int num_offsets = 4; + + for (int tl = 0; tl < num_tiles; tl++) { + int tile_len = tile_lens[tl]; + char tile[256]; + for (int i = 0; i < tile_len; i++) + tile[i] = (char)((i * 37 + 11) & 0xFF); + + for (int dl = 0; dl < num_datas; dl++) { + int data_len = data_lens[dl]; + for (int ol = 0; ol < num_offsets; ol++) { + int offset = offsets[ol]; + char backing[2048]; + char orig[2048]; + char *data = backing + offset; + for (int i = 0; i < data_len; i++) + data[i] = (char)((i * 13 + 7) & 0xFF); + memcpy(orig, data, data_len); + + /* XOR once should change data (tile is non-zero) */ + bench_xor_tile(data, data_len, tile, tile_len); + int changed = (memcmp(data, orig, data_len) != 0); + + /* XOR again should restore original */ + bench_xor_tile(data, data_len, tile, tile_len); + + snprintf(label, sizeof(label), + "xor_tile tile=%d data=%d off=%d", tile_len, data_len, offset); + TEST(label, changed && memcmp(data, orig, data_len) == 0); + } + } + } + return failures; +} + +static int test_cook_combo(int disable_checksum, int disable_obscure, int disable_xor, + int sz) { + int failures = 0; + char label[128]; + const char *cs = disable_checksum ? "off" : "on"; + const char *ob = disable_obscure ? "off" : "on"; + const char *xr = disable_xor ? 
"off" : "on"; + + cook_ctx_t ctx = {}; + strcpy(ctx.key, "testkey123"); + cook_ctx_prepare_key(&ctx); + ctx.iv_min = 4; + ctx.iv_max = 32; + ctx.disable_checksum = disable_checksum; + ctx.disable_obscure = disable_obscure; + ctx.disable_xor = disable_xor; + + char orig[4096], buf[4096]; + for (int i = 0; i < sz; i++) + orig[i] = (char)((i * 37 + 11) & 0xFF); + memcpy(buf, orig, sz); + + int len = sz; + int rc = do_cook(&ctx, buf, len); + + snprintf(label, sizeof(label), "cook cs=%s ob=%s xr=%s sz=%d: encode ok", cs, ob, xr, sz); + TEST(label, rc == 0); + + rc = de_cook(&ctx, buf, len); + snprintf(label, sizeof(label), "cook cs=%s ob=%s xr=%s sz=%d: decode ok", cs, ob, xr, sz); + TEST(label, rc == 0 && len == sz); + + snprintf(label, sizeof(label), "cook cs=%s ob=%s xr=%s sz=%d: data matches", cs, ob, xr, sz); + TEST(label, memcmp(buf, orig, sz) == 0); + + return failures; +} + +static int test_cook_all_combos() { + int failures = 0; + int sizes[] = {64, 1500}; + for (int s = 0; s < 2; s++) { + for (int cs = 0; cs <= 1; cs++) + for (int ob = 0; ob <= 1; ob++) + for (int xr = 0; xr <= 1; xr++) + failures += test_cook_combo(cs, ob, xr, sizes[s]); + } + return failures; +} + +static int test_cook_unaligned() { + int failures = 0; + char label[128]; + int offsets[] = {0, 1, 3, 5, 7}; + int num_offsets = 5; + int sizes[] = {64, 256, 1500}; + int nsizes = 3; + + for (int ol = 0; ol < num_offsets; ol++) { + int offset = offsets[ol]; + for (int s = 0; s < nsizes; s++) { + int sz = sizes[s]; + /* +offset for misalignment, +200 for cook overhead */ + char backing[4096]; + char orig[4096]; + char *buf = backing + offset; + + cook_ctx_t ctx = {}; + strcpy(ctx.key, "testkey123"); + cook_ctx_prepare_key(&ctx); + ctx.iv_min = 4; + ctx.iv_max = 32; + + for (int i = 0; i < sz; i++) + buf[i] = (char)((i * 37 + 11) & 0xFF); + memcpy(orig, buf, sz); + + int len = sz; + do_cook(&ctx, buf, len); + int rc = de_cook(&ctx, buf, len); + + snprintf(label, sizeof(label), + "cook unaligned 
off=%d sz=%d: round-trip", offset, sz); + TEST(label, rc == 0 && len == sz && memcmp(buf, orig, sz) == 0); + } + } + return failures; +} + +int run_packet_tests() { + int failures = 0; + + printf("[cook round-trip]\n"); + failures += test_cook_roundtrip(); + + printf("[cook checksum only]\n"); + failures += test_cook_checksum_only(); + + printf("[cook all disabled]\n"); + failures += test_cook_disabled(); + + printf("[xor_tile round-trip]\n"); + failures += test_xor_tile_roundtrip(); + + printf("[cook all 8 enable/disable combos]\n"); + failures += test_cook_all_combos(); + + printf("[cook unaligned buffers]\n"); + failures += test_cook_unaligned(); + + return failures; +} diff --git a/bench/throughput.sh b/bench/throughput.sh new file mode 100755 index 0000000..8bf7e96 --- /dev/null +++ b/bench/throughput.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# bench/throughput.sh — Measure end-to-end UDP tunnel throughput +# +# Usage: ./bench/throughput.sh [options] +# --duration N seconds per iteration (default: 5) +# --fec X:Y FEC parameter (default: disabled) +# --disable-fec explicitly disable FEC +# --iterations N number of runs, reports median (default: 3) +# --json output JSON for github-action-benchmark + +set -euo pipefail + +BINARY="" +DURATION=10 +FEC_ARGS="--disable-fec" +FEC_LABEL="no-fec" +ITERATIONS=5 +JSON=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --duration) DURATION="$2"; shift 2 ;; + --fec) FEC_ARGS="-f $2"; FEC_LABEL="fec-${2//:/-}"; shift 2 ;; + --disable-fec) FEC_ARGS="--disable-fec"; FEC_LABEL="no-fec"; shift ;; + --iterations) ITERATIONS="$2"; shift 2 ;; + --json) JSON=1; shift ;; + -*) echo "Unknown option: $1" >&2; exit 1 ;; + *) BINARY="$1"; shift ;; + esac +done + +if [[ -z "$BINARY" ]]; then + echo "Usage: $0 [--duration N] [--fec X:Y] [--json]" >&2 + exit 1 +fi + +if [[ ! 
-x "$BINARY" ]]; then + echo "Error: $BINARY is not executable" >&2 + exit 1 +fi + +PORT_TUNNEL=20000 +PORT_APP=20001 +PORT_CLIENT=20002 + +# Kill any leftover processes from previous runs +kill_tunnel() { + local pids + pids=$(jobs -p 2>/dev/null) || true + if [[ -n "$pids" ]]; then + kill $pids 2>/dev/null || true + wait $pids 2>/dev/null || true + fi +} +trap kill_tunnel EXIT + +run_once() { + local tmpfile + tmpfile=$(mktemp) + kill_tunnel + + # UDP receiver: writes "bytes elapsed" to tmpfile, exits after 2s of no data + python3 -c " +import socket, time, sys +sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +sock.bind(('127.0.0.1', $PORT_APP)) +sock.settimeout(2) +total = 0 +start = None +try: + while True: + data = sock.recv(65535) + if start is None: + start = time.monotonic() + total += len(data) +except socket.timeout: + pass +elapsed = time.monotonic() - start if start else 0 +result = f'{total} {elapsed:.6f}' +sys.stdout.write(result + '\n') +sys.stdout.flush() +" > "$tmpfile" 2>/dev/null & + local recv_pid=$! + + # Start tunnel + $BINARY -s -l 127.0.0.1:$PORT_TUNNEL -r 127.0.0.1:$PORT_APP $FEC_ARGS --log-level 0 >/dev/null 2>&1 & + local server_pid=$! + + $BINARY -c -l 127.0.0.1:$PORT_CLIENT -r 127.0.0.1:$PORT_TUNNEL $FEC_ARGS --log-level 0 >/dev/null 2>&1 & + local client_pid=$! + + sleep 1 + + # UDP sender: blasts 1400-byte packets for DURATION seconds + python3 -c " +import socket, time +sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +payload = b'\x00' * 1400 +end = time.monotonic() + $DURATION +while time.monotonic() < end: + try: + sock.sendto(payload, ('127.0.0.1', $PORT_CLIENT)) + except OSError: + pass +" + echo " sender done, waiting for receiver..." 
>&2 + + # Wait for receiver to exit naturally (2s socket timeout after last packet) + wait $recv_pid 2>/dev/null || true + + # Kill tunnel processes + kill $server_pid $client_pid 2>/dev/null || true + wait $server_pid $client_pid 2>/dev/null || true + + # Parse result + local result bytes elapsed + result=$(cat "$tmpfile") + rm -f "$tmpfile" + + bytes=$(echo "$result" | awk '{print $1}') + elapsed=$(echo "$result" | awk '{print $2}') + + echo " received $bytes bytes in ${elapsed}s" >&2 + + if [[ -z "$bytes" || "$bytes" == "0" ]]; then + echo "0.0" + return + fi + + python3 -c "print(f'{$bytes / $elapsed / 1e6 * 8:.1f}')" +} + +# Warmup run (discarded) — primes tunnel, caches, socket buffers +echo " Warmup run..." >&2 +run_once > /dev/null + +# Run iterations and compute median +results=() +for i in $(seq 1 "$ITERATIONS"); do + echo " Run $i/$ITERATIONS..." >&2 + mbps=$(run_once) + results+=("$mbps") + echo " → $mbps Mbps" >&2 +done + +IFS=$'\n' sorted=($(printf '%s\n' "${results[@]}" | sort -n)); unset IFS +median_idx=$(( ITERATIONS / 2 )) +median=${sorted[$median_idx]} +median=${median:-0.0} + +if [[ $JSON -eq 1 ]]; then + printf '{"name": "throughput/%s", "unit": "Mbps", "value": %s}\n' "$FEC_LABEL" "$median" +else + echo "Throughput ($FEC_LABEL): $median Mbps [runs: ${results[*]}]" +fi diff --git a/common.h b/common.h index 9f59799..25f7b65 100644 --- a/common.h +++ b/common.h @@ -270,13 +270,13 @@ struct address_t // TODO scope id char *get_str(); void to_str(char *); - inline int is_vaild() { + inline int is_valid() { u32_t ret = ((sockaddr *)&inner)->sa_family; return (ret == AF_INET || ret == AF_INET6); } inline u32_t get_type() { - assert(is_vaild()); + assert(is_valid()); u32_t ret = ((sockaddr *)&inner)->sa_family; return ret; } diff --git a/connection.cpp b/connection.cpp index 9a0ee89..5c256f0 100644 --- a/connection.cpp +++ b/connection.cpp @@ -6,6 +6,7 @@ */ #include "connection.h" +#include "io_uring_recv.h" // const int disable_conv_clear=0;//a udp 
connection in the multiplexer is called conversation in this program,conv for short. @@ -18,13 +19,22 @@ void server_clear_function(u64_t u64) // used in conv_manager in server mode.fo { fd64_t fd64 = u64; assert(fd_manager.exist(fd64)); - ev_io &watcher = fd_manager.get_info(fd64).io_watcher; - address_t &addr = fd_manager.get_info(fd64).addr; // - assert(conn_manager.exist(addr)); // - struct ev_loop *loop = conn_manager.find_insert(addr).loop; // overkill ? should we just use ev_default_loop(0)? +#ifdef __linux__ + if (g_uring_ctx && g_uring_ctx->available) { + uring_cancel(g_uring_ctx, uring_tag(URING_TAG_SERVER_REMOTE, fd64)); + uring_submit(g_uring_ctx); + } else +#endif + { + ev_io &watcher = fd_manager.get_info(fd64).io_watcher; - ev_io_stop(loop, &watcher); + address_t &addr = fd_manager.get_info(fd64).addr; + assert(conn_manager.exist(addr)); + struct ev_loop *loop = conn_manager.find_insert(addr).loop; + + ev_io_stop(loop, &watcher); + } fd_manager.fd64_close(fd64); } diff --git a/crc32/Crc32.cpp b/crc32/Crc32.cpp index dd7b518..3581522 100644 --- a/crc32/Crc32.cpp +++ b/crc32/Crc32.cpp @@ -54,12 +54,12 @@ #error "endian detection failed" #endif -#if defined(IS_LITTLE_ENDIAN) - #define __BYTE_ORDER __LITTLE_ENDIAN -#endif - -#if defined(IS_BIG_ENDIAN) - #define __BYTE_ORDER __BIG_ENDIAN +#ifndef __BYTE_ORDER + #if defined(IS_LITTLE_ENDIAN) + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif defined(IS_BIG_ENDIAN) + #define __BYTE_ORDER __BIG_ENDIAN + #endif #endif // define endianess and some integer data types diff --git a/crc32c.h b/crc32c.h new file mode 100644 index 0000000..0c1fb65 --- /dev/null +++ b/crc32c.h @@ -0,0 +1,194 @@ +#ifndef BENCH_CRC32C_H +#define BENCH_CRC32C_H + +#include +#include + +/* + * CRC32C (Castagnoli) implementation — polynomial 0x82F63B78 + * + * Three paths: + * crc32c_sw() — software slicing-by-8, works everywhere + * crc32c_hw() — hardware intrinsics (SSE4.2 / ARMv8-CRC) + * crc32c() — runtime dispatch to hw or sw + */ + +/* 
---- Software slicing-by-8 -------------------------------------------- */ + +/* + * Lookup table for CRC32C polynomial 0x82F63B78 (bit-reversed 0x1EDC6F41). + * 8 slices x 256 entries. Generated at init time by crc32c_init_sw_table(). + */ +static uint32_t crc32c_table[8][256]; +static int crc32c_table_ready = 0; + +static void crc32c_init_sw_table(void) { + if (crc32c_table_ready) return; + const uint32_t poly = 0x82F63B78; + for (int i = 0; i < 256; i++) { + uint32_t crc = (uint32_t)i; + for (int j = 0; j < 8; j++) + crc = (crc >> 1) ^ (poly & (-(int32_t)(crc & 1))); + crc32c_table[0][i] = crc; + } + for (int i = 0; i < 256; i++) { + uint32_t crc = crc32c_table[0][i]; + for (int s = 1; s < 8; s++) { + crc = crc32c_table[0][crc & 0xFF] ^ (crc >> 8); + crc32c_table[s][i] = crc; + } + } + crc32c_table_ready = 1; +} + +static uint32_t crc32c_sw(const void *data, size_t length, uint32_t previousCrc32 = 0) { + crc32c_init_sw_table(); + const uint8_t *p = (const uint8_t *)data; + uint32_t crc = ~previousCrc32; + + /* Process 8 bytes at a time */ + while (length >= 8) { + crc ^= (uint32_t)p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24); + crc = crc32c_table[7][crc & 0xFF] ^ + crc32c_table[6][(crc >> 8) & 0xFF] ^ + crc32c_table[5][(crc >> 16) & 0xFF] ^ + crc32c_table[4][(crc >> 24) & 0xFF] ^ + crc32c_table[3][p[4]] ^ + crc32c_table[2][p[5]] ^ + crc32c_table[1][p[6]] ^ + crc32c_table[0][p[7]]; + p += 8; + length -= 8; + } + + /* Remaining bytes */ + while (length--) + crc = crc32c_table[0][(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return ~crc; +} + +/* ---- Hardware: x86_64 SSE4.2 ----------------------------------------- */ + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#include + +#ifdef __GNUC__ +__attribute__((target("sse4.2"))) +#endif +static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) { + const uint8_t *p = (const uint8_t *)data; + uint64_t crc = 
~(uint64_t)previousCrc32; + +#if defined(__x86_64__) || defined(_M_X64) + /* Process 8 bytes at a time on 64-bit */ + while (length >= 8) { + uint64_t val; + __builtin_memcpy(&val, p, 8); + crc = _mm_crc32_u64(crc, val); + p += 8; + length -= 8; + } +#endif + /* Process 4 bytes at a time */ + while (length >= 4) { + uint32_t val; + __builtin_memcpy(&val, p, 4); + crc = _mm_crc32_u32((uint32_t)crc, val); + p += 4; + length -= 4; + } + /* Remaining bytes */ + while (length--) + crc = _mm_crc32_u8((uint32_t)crc, *p++); + + return ~(uint32_t)crc; +} + +static int crc32c_has_hw(void) { + uint32_t eax, ebx, ecx, edx; + __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(1)); + return (ecx >> 20) & 1; /* SSE4.2 bit */ +} + +/* ---- Hardware: ARMv8-A CRC extension ---------------------------------- */ + +#elif defined(__aarch64__) || defined(__arm__) +#ifdef __ARM_FEATURE_CRC32 +#include + +static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) { + const uint8_t *p = (const uint8_t *)data; + uint32_t crc = ~previousCrc32; + +#ifdef __aarch64__ + while (length >= 8) { + uint64_t val; + __builtin_memcpy(&val, p, 8); + crc = __crc32cd(crc, val); + p += 8; + length -= 8; + } +#endif + while (length >= 4) { + uint32_t val; + __builtin_memcpy(&val, p, 4); + crc = __crc32cw(crc, val); + p += 4; + length -= 4; + } + while (length--) + crc = __crc32cb(crc, *p++); + + return ~crc; +} + +static int crc32c_has_hw(void) { + return 1; /* If __ARM_FEATURE_CRC32 is defined, the compiler guarantees it */ +} + +#else /* ARM without CRC extension */ + +static uint32_t crc32c_hw(const void *data, size_t length, uint32_t previousCrc32 = 0) { + return crc32c_sw(data, length, previousCrc32); /* fallback */ +} + +static int crc32c_has_hw(void) { + return 0; +} + +#endif /* __ARM_FEATURE_CRC32 */ + +/* ---- No hardware support ---------------------------------------------- */ + +#else + +static uint32_t crc32c_hw(const void *data, 
size_t length, uint32_t previousCrc32 = 0) { + return crc32c_sw(data, length, previousCrc32); +} + +static int crc32c_has_hw(void) { + return 0; +} + +#endif /* architecture selection */ + +/* ---- Runtime dispatch ------------------------------------------------- */ + +typedef uint32_t (*crc32c_fn)(const void *, size_t, uint32_t); + +static uint32_t crc32c_resolve(const void *data, size_t length, uint32_t previousCrc32); +static crc32c_fn crc32c_impl = crc32c_resolve; + +static uint32_t crc32c_resolve(const void *data, size_t length, uint32_t previousCrc32) { + crc32c_impl = crc32c_has_hw() ? crc32c_hw : crc32c_sw; + return crc32c_impl(data, length, previousCrc32); +} + +static inline uint32_t crc32c(const void *data, size_t length, uint32_t previousCrc32 = 0) { + return crc32c_impl(data, length, previousCrc32); +} + +#endif /* BENCH_CRC32C_H */ diff --git a/delay_manager.cpp b/delay_manager.cpp index f8a9aa4..fdd1498 100644 --- a/delay_manager.cpp +++ b/delay_manager.cpp @@ -48,9 +48,7 @@ int delay_manager_t::add(my_time_t delay, const dest_t &dest, char *data, int le return -1; } if (delay == 0) { - static char buf[buf_len]; - delay_data.data = buf; - memcpy(buf, data, len); + delay_data.data = data; int ret = delay_data.handle(); if (ret != 0) { mylog(log_trace, "handle() return %d\n", ret); diff --git a/fec_manager.cpp b/fec_manager.cpp index 5d132c8..40bd917 100644 --- a/fec_manager.cpp +++ b/fec_manager.cpp @@ -510,39 +510,43 @@ int fec_decode_manager_t::input(char *s, int len) { mylog(log_warn, "data_num+redundant_num>=max_fec_packet_num\n"); return -1; } - if (!anti_replay.is_vaild(seq)) { - mylog(log_trace, "!anti_replay.is_vaild(seq) ,seq =%u\n", seq); + if (inner_index >= data_num + redundant_num) { + mylog(log_warn, "inner_index(%d) >= data_num+redundant_num(%d+%d)\n", inner_index, data_num, redundant_num); + return -1; + } + if (!anti_replay.is_valid(seq)) { + mylog(log_trace, "!anti_replay.is_valid(seq) ,seq =%u\n", seq); return 0; } - if 
(mp[seq].fec_done != 0) { + fec_group_t &group = group_find_or_create(seq); + + if (group.fec_done != 0) { mylog(log_debug, "fec already done, ignore, seq=%u\n", seq); return -1; } - if (mp[seq].group_mp.find(inner_index) != mp[seq].group_mp.end()) { + if (group.has_shard(inner_index)) { mylog(log_debug, "dup fec index\n"); // duplicate can happen on a normal network, so its just log_debug return -1; } - if (mp[seq].type == -1) - mp[seq].type = type; + if (group.type == -1) + group.type = type; else { - if (mp[seq].type != type) { + if (group.type != type) { mylog(log_warn, "type mismatch\n"); return -1; } } if (data_num != 0) { - // mp[seq].data_counter++; - - if (mp[seq].data_num == -1) { - mp[seq].data_num = data_num; - mp[seq].redundant_num = redundant_num; - mp[seq].len = len; + if (group.data_num == -1) { + group.data_num = data_num; + group.redundant_num = redundant_num; + group.len = len; } else { - if (mp[seq].data_num != data_num || mp[seq].redundant_num != redundant_num || mp[seq].len != len) { + if (group.data_num != data_num || group.redundant_num != redundant_num || group.len != len) { mylog(log_warn, "unexpected mp[seq].data_num!=data_num||mp[seq].redundant_num!=redundant_num||mp[seq].len!=len\n"); return -1; } @@ -555,11 +559,11 @@ int fec_decode_manager_t::input(char *s, int len) { u32_t tmp_seq = fec_data[index].seq; anti_replay.set_invaild(tmp_seq); - auto tmp_it = mp.find(tmp_seq); - if (tmp_it != mp.end()) { - int x = tmp_it->second.data_num; - int y = tmp_it->second.redundant_num; - int cnt = tmp_it->second.group_mp.size(); + fec_group_t *tmp_group = group_find(tmp_seq); + if (tmp_group) { + int x = tmp_group->data_num; + int y = tmp_group->redundant_num; + int cnt = tmp_group->shard_count; if (cnt < x) { if (debug_fec_dec) @@ -567,7 +571,7 @@ int fec_decode_manager_t::input(char *s, int len) { else mylog(log_trace, "[dec][failed]seq=%08x x=%d y=%d cnt=%d\n", tmp_seq, x, y, cnt); } - mp.erase(tmp_it); + group_erase(tmp_seq); } if (tmp_seq == 
seq) { mylog(log_warn, "unexpected tmp_seq==seq ,seq=%d\n", seq); @@ -585,58 +589,57 @@ int fec_decode_manager_t::input(char *s, int len) { assert(0 <= index && index < (int)fec_buff_num); assert(len + 100 < buf_len); memcpy(fec_data[index].buf, s + tmp_idx, len); - mp[seq].group_mp[inner_index] = index; + group.set_shard(inner_index, index); + group.shard_count++; // index++ at end of function - map &inner_mp = mp[seq].group_mp; - int about_to_fec = 0; if (type == 0) { - // assert((int)inner_mp.size()<=data_num); - if ((int)inner_mp.size() > data_num) { - mylog(log_warn, "inner_mp.size()>data_num\n"); + if (group.shard_count > data_num) { + mylog(log_warn, "shard_count>data_num\n"); anti_replay.set_invaild(seq); goto end; } - if ((int)inner_mp.size() == data_num) + if (group.shard_count == data_num) about_to_fec = 1; } else { - if (mp[seq].data_num != -1) { - if ((int)inner_mp.size() > mp[seq].data_num + 1) { - mylog(log_warn, "inner_mp.size()>data_num+1\n"); + if (group.data_num != -1) { + if (group.shard_count > group.data_num + 1) { + mylog(log_warn, "shard_count>data_num+1\n"); anti_replay.set_invaild(seq); goto end; } - if ((int)inner_mp.size() >= mp[seq].data_num) { + if (group.shard_count >= group.data_num) { about_to_fec = 1; } } } if (about_to_fec) { - int group_data_num = mp[seq].data_num; - int group_redundant_num = mp[seq].redundant_num; + int group_data_num = group.data_num; + int group_redundant_num = group.redundant_num; int x_got = 0; int y_got = 0; // mylog(log_error,"fec here!\n"); if (type == 0) { char *fec_tmp_arr[max_fec_packet_num + 5] = {0}; - for (auto it = inner_mp.begin(); it != inner_mp.end(); it++) { - if (it->first < group_data_num) + for (int i = 0; i < group_data_num + group_redundant_num; i++) { + if (!group.has_shard(i)) continue; + if (i < group_data_num) x_got++; else y_got++; - fec_tmp_arr[it->first] = fec_data[it->second].buf; + fec_tmp_arr[i] = fec_data[group.shard_idx[i]].buf; } assert(rs_decode2(group_data_num, 
group_data_num + group_redundant_num, fec_tmp_arr, len) == 0); // the input data has been modified in-place // this line should always succeed - mp[seq].fec_done = 1; + group.fec_done = 1; if (debug_fec_dec) - mylog(log_debug, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, int(inner_mp.size()), x_got, y_got); + mylog(log_debug, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, group.shard_count, x_got, y_got); else - mylog(log_trace, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, int(inner_mp.size()), x_got, y_got); + mylog(log_trace, "[dec]seq=%08x x=%d y=%d len=%d cnt=%d X=%d Y=%d\n", seq, group_data_num, group_redundant_num, len, group.shard_count, x_got, y_got); blob_decode.clear(); for (int i = 0; i < group_data_num; i++) { @@ -656,34 +659,32 @@ int fec_decode_manager_t::input(char *s, int len) { int max_len = -1; int fec_result_ok = 1; int data_check_ok = 1; - int debug_num = inner_mp.size(); + int debug_num = group.shard_count; int missed_packet[max_fec_packet_num + 5]; int missed_packet_counter = 0; - // outupt_s_arr_buf[max_fec_packet_num+5]={0}; - - // memset(output_s_arr_buf,0,sizeof(output_s_arr_buf));//in efficient - for (int i = 0; i < group_data_num + group_redundant_num; i++) { - output_s_arr_buf[i] = 0; - } - for (auto it = inner_mp.begin(); it != inner_mp.end(); it++) { - if (it->first < group_data_num) + if (!group.has_shard(i)) { + output_s_arr_buf[i] = 0; + continue; + } + int di = group.shard_idx[i]; + if (i < group_data_num) x_got++; else y_got++; - output_s_arr_buf[it->first] = fec_data[it->second].buf; - if (fec_data[it->second].len < (int)sizeof(u16_t)) { - mylog(log_warn, "fec_data[it->second].len<(int)sizeof(u16_t)"); + output_s_arr_buf[i] = fec_data[di].buf; + if (fec_data[di].len < (int)sizeof(u16_t)) { + mylog(log_warn, "fec_data[di].len<(int)sizeof(u16_t)"); data_check_ok = 0; } - 
if (fec_data[it->second].len > max_len) - max_len = fec_data[it->second].len; + if (fec_data[di].len > max_len) + max_len = fec_data[di].len; } - if (max_len != mp[seq].len) { + if (max_len != group.len) { data_check_ok = 0; mylog(log_warn, "max_len!=mp[seq].len"); } @@ -693,10 +694,13 @@ int fec_decode_manager_t::input(char *s, int len) { anti_replay.set_invaild(seq); goto end; } - for (auto it = inner_mp.begin(); it != inner_mp.end(); it++) { - int tmp_idx = it->second; - assert(max_len >= fec_data[tmp_idx].len); // guarenteed by data_check_ok - memset(fec_data[tmp_idx].buf + fec_data[tmp_idx].len, 0, max_len - fec_data[tmp_idx].len); + for (int i = 0; i < group_data_num + group_redundant_num; i++) { + if (!group.has_shard(i)) continue; + int di = group.shard_idx[i]; + assert(max_len >= fec_data[di].len); // guarenteed by data_check_ok + int pad = max_len - fec_data[di].len; + if (pad > 0) + memset(fec_data[di].buf + fec_data[di].len, 0, pad); } for (int i = 0; i < group_data_num; i++) { @@ -708,7 +712,7 @@ int fec_decode_manager_t::input(char *s, int len) { mylog(log_trace, "fec done,%d %d,missed_packet_counter=%d\n", group_data_num, group_redundant_num, missed_packet_counter); assert(rs_decode2(group_data_num, group_data_num + group_redundant_num, output_s_arr_buf, max_len) == 0); // this should always succeed - mp[seq].fec_done = 1; + group.fec_done = 1; int sum_ori = 0; diff --git a/fec_manager.h b/fec_manager.h index f92b651..dd809dd 100644 --- a/fec_manager.h +++ b/fec_manager.h @@ -13,7 +13,7 @@ #include "lib/rs.h" const int max_blob_packet_num = 30000; // how many packet can be contain in a blob_t ,can be set very large -const u32_t anti_replay_buff_size = 30000; // can be set very large +const u32_t anti_replay_table_size = 32768; // power of 2 for fast modulo const int max_fec_packet_num = 255; // this is the limitation of the rs lib extern u32_t fec_buff_num; @@ -182,55 +182,29 @@ struct fec_parameter_t { extern fec_parameter_t g_fec_par; // extern int 
dynamic_update_fec; -const int anti_replay_timeout = 120 * 1000; // 120s - struct anti_replay_t { - struct info_t { - my_time_t my_time; - int index; - }; + /* Direct-mapped table: slot = seq & MASK, stores the seq that owns it. + * is_valid: table[slot] != seq → valid (not yet seen). + * set_invaild: table[slot] = seq. + * Old entries naturally evicted when a new seq maps to the same slot. + * With 32K slots and monotonically increasing seqs, effective window + * is ~32K groups — comparable to the old 30K ring buffer. */ + static const u32_t TABLE_MASK = anti_replay_table_size - 1; + + u32_t table[anti_replay_table_size]; - u64_t replay_buffer[anti_replay_buff_size]; - unordered_map mp; - int index; anti_replay_t() { clear(); } int clear() { - memset(replay_buffer, -1, sizeof(replay_buffer)); - mp.clear(); - mp.rehash(anti_replay_buff_size * 3); - index = 0; + for (u32_t i = 0; i <= TABLE_MASK; i++) table[i] = ~i; /* (~i & MASK) != i, so no seq (not even 0xFFFFFFFF) starts out marked as seen */ return 0; } void set_invaild(u32_t seq) { - if (is_vaild(seq) == 0) { - mylog(log_trace, "seq %u exist\n", seq); - // assert(mp.find(seq)!=mp.end()); - // mp[seq].my_time=get_current_time_rough(); - return; - } - if (replay_buffer[index] != u64_t(i64_t(-1))) { - assert(mp.find(replay_buffer[index]) != mp.end()); - mp.erase(replay_buffer[index]); - } - replay_buffer[index] = seq; - assert(mp.find(seq) == mp.end()); - mp[seq].my_time = get_current_time(); - mp[seq].index = index; - index++; - if (index == int(anti_replay_buff_size)) index = 0; + table[seq & TABLE_MASK] = seq; } - int is_vaild(u32_t seq) { - if (mp.find(seq) == mp.end()) return 1; - - if (get_current_time() - mp[seq].my_time > anti_replay_timeout) { - replay_buffer[mp[seq].index] = u64_t(i64_t(-1)); - mp.erase(seq); - return 1; - } - - return 0; + int is_valid(u32_t seq) { + return table[seq & TABLE_MASK] != seq; } }; @@ -374,18 +348,45 @@ struct fec_data_t { int len; }; struct fec_group_t { - int type = -1; - int data_num = -1; - int redundant_num = -1; - int len = -1; - int fec_done = 0; - // int
data_counter=0; - map group_mp; + u32_t seq; /* owner seq, 0xFFFFFFFF = empty slot */ + int type; + int data_num; + int redundant_num; + int len; + int fec_done; + int shard_count; + u32_t shard_bitmap[8]; /* 256 bits — replaces 1KB memset of shard_idx */ + int shard_idx[max_fec_packet_num + 1]; /* only valid where bitmap bit is set */ + + void init(u32_t new_seq) { + seq = new_seq; + type = -1; + data_num = -1; + redundant_num = -1; + len = -1; + fec_done = 0; + shard_count = 0; + memset(shard_bitmap, 0, sizeof(shard_bitmap)); /* 32 bytes vs old 1024 */ + } + int has_shard(int i) const { + return (shard_bitmap[i >> 5] >> (i & 31)) & 1; + } + void set_shard(int i, int val) { + shard_bitmap[i >> 5] |= (1u << (i & 31)); + shard_idx[i] = val; + } }; class fec_decode_manager_t : not_copy_able_t { anti_replay_t anti_replay; fec_data_t *fec_data = 0; - unordered_map mp; + + /* Direct-mapped group table: slot = seq & group_table_mask. + * Monotonically increasing seqs guarantee no two concurrent groups + * collide (table_size > max concurrent groups ≈ fec_buff_num). */ + fec_group_t *group_table = 0; + u32_t group_table_size; + u32_t group_table_mask; + blob_decode_t blob_decode; int index; @@ -398,28 +399,45 @@ class fec_decode_manager_t : not_copy_able_t { char *output_s_arr_buf[max_fec_packet_num + 100]; // only for type=1,for type=0 the buf inside blot_t is used int output_len_arr_buf[max_fec_packet_num + 100]; // same + fec_group_t &group_find_or_create(u32_t seq) { + fec_group_t &g = group_table[seq & group_table_mask]; + if (g.seq != seq) g.init(seq); + return g; + } + fec_group_t *group_find(u32_t seq) { + fec_group_t &g = group_table[seq & group_table_mask]; + return (g.seq == seq) ? 
&g : 0; + } + void group_erase(u32_t seq) { + fec_group_t &g = group_table[seq & group_table_mask]; + if (g.seq == seq) g.init(0xFFFFFFFF); /* full reset: an empty slot must not expose stale fields if seq 0xFFFFFFFF is looked up */ + } + public: fec_decode_manager_t() { + /* Table size: next power of 2 >= fec_buff_num * 2 */ + group_table_size = 1; + while (group_table_size < fec_buff_num * 2) group_table_size <<= 1; + group_table_mask = group_table_size - 1; + fec_data = new fec_data_t[fec_buff_num + 5]; + group_table = new fec_group_t[group_table_size]; assert(fec_data != 0); + assert(group_table != 0); clear(); } - /* - fec_decode_manager_t(const fec_decode_manager_t &b) - { - assert(0==1);//not allowed to copy - }*/ ~fec_decode_manager_t() { mylog(log_debug, "fec_decode_manager destroyed\n"); if (fec_data != 0) { mylog(log_debug, "fec_data freed\n"); delete[] fec_data; } + delete[] group_table; } int clear() { anti_replay.clear(); - mp.clear(); - mp.rehash(fec_buff_num * 3); + for (u32_t i = 0; i < group_table_size; i++) + group_table[i].init(0xFFFFFFFF); /* new[] leaves fields indeterminate; initialise every field, not just seq */ for (int i = 0; i < (int)fec_buff_num; i++) fec_data[i].used = 0; diff --git a/io_uring_recv.cpp b/io_uring_recv.cpp new file mode 100644 index 0000000..7e23afc --- /dev/null +++ b/io_uring_recv.cpp @@ -0,0 +1,471 @@ +#include "io_uring_recv.h" + +uring_ctx_t *g_uring_ctx = NULL; + +#ifdef __linux__ + +#include "log.h" +#include +#include +#include +#include +#include +#include + +/* --- Raw syscall wrappers ------------------------------------------------ */ + +static int +sys_io_uring_setup(unsigned entries, struct io_uring_params *p) +{ + return (int)syscall(__NR_io_uring_setup, entries, p); +} + +static int +sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete, + unsigned flags, void *arg, size_t argsz) +{ + return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete, + flags, arg, argsz); +} + +static int +sys_io_uring_register(int fd, unsigned opcode, void *arg, unsigned nr_args) +{ + return (int)syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); +} + +/* ---
Memory barrier helpers ---------------------------------------------- */ + +static inline void +io_uring_smp_store_release(unsigned *p, unsigned v) +{ + __atomic_store_n(p, v, __ATOMIC_RELEASE); +} + +static inline unsigned +io_uring_smp_load_acquire(const unsigned *p) +{ + return __atomic_load_n(p, __ATOMIC_ACQUIRE); +} + +/* --- SQE helpers --------------------------------------------------------- */ + +static struct io_uring_sqe * +get_sqe(uring_ctx_t *ctx) +{ + unsigned head = io_uring_smp_load_acquire(ctx->sq_head); + unsigned tail = *ctx->sq_tail; + if (tail - head >= ctx->sq_entries) + return NULL; /* SQ full */ + struct io_uring_sqe *sqe = &ctx->sqes[tail & ctx->sq_mask]; + return sqe; +} + +static void +submit_sqe(uring_ctx_t *ctx) +{ + unsigned tail = *ctx->sq_tail; + ctx->sq_array[tail & ctx->sq_mask] = tail & ctx->sq_mask; + io_uring_smp_store_release(ctx->sq_tail, tail + 1); +} + +/* --- Buffer ring helpers ------------------------------------------------- */ + +static void +buf_ring_add(uring_ctx_t *ctx, int buf_id) +{ + struct io_uring_buf_ring *br = ctx->buf_ring; + unsigned short idx = br->tail; + struct io_uring_buf *buf = &br->bufs[idx & (ctx->buf_count - 1)]; + buf->addr = (unsigned long long)(ctx->buf_pool + (long)buf_id * ctx->buf_size + URING_RECV_HEADROOM); + buf->len = (__u32)(ctx->buf_size - URING_RECV_HEADROOM); + buf->bid = (__u16)buf_id; + /* For init: publish immediately. For runtime: use buf_ring_add_deferred + commit. */ + __atomic_store_n(&br->tail, (__u16)(idx + 1), __ATOMIC_RELEASE); +} + +static void +buf_ring_add_deferred(uring_ctx_t *ctx, int buf_id) +{ + /* Add entry without publishing (no atomic store on tail). + Caller must call uring_buf_ring_commit() when done. 
*/ + struct io_uring_buf_ring *br = ctx->buf_ring; + unsigned short idx = ctx->buf_ring_pending; + struct io_uring_buf *buf = &br->bufs[idx & (ctx->buf_count - 1)]; + buf->addr = (unsigned long long)(ctx->buf_pool + (long)buf_id * ctx->buf_size + URING_RECV_HEADROOM); + buf->len = (__u32)(ctx->buf_size - URING_RECV_HEADROOM); + buf->bid = (__u16)buf_id; + ctx->buf_ring_pending = (__u16)(idx + 1); +} + +/* --- Public API ---------------------------------------------------------- */ + +int +uring_init(uring_ctx_t *ctx, int queue_depth, int buf_count, int buf_size) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->ring_fd = -1; + ctx->available = 0; + ctx->bgid = 0; + ctx->buf_count = buf_count; + ctx->buf_size = buf_size; + if (getenv("UDPSPEEDER_NO_URING")) { + /* ctx is already zeroed with ring_fd = -1, so a later uring_destroy() stays safe */ + mylog(log_info, "io_uring: disabled by UDPSPEEDER_NO_URING\n"); + return -1; + } + + /* buf_count must be power of 2 for the ring */ + if (buf_count & (buf_count - 1)) { + mylog(log_warn, "io_uring: buf_count must be power of 2\n"); + return -1; + } + + /* 1. io_uring_setup */ + struct io_uring_params params; + memset(&params, 0, sizeof(params)); + /* CQ sized 4x buf_count: 2 multishot requests share the CQ, and we need + headroom for error/cancel CQEs that don't consume buffers. */ + params.flags = IORING_SETUP_CQSIZE; + params.cq_entries = (unsigned)(buf_count * 4); + + /* Try performance flags. Fall back if kernel rejects.
*/ + unsigned opt_flags = 0; +#ifdef IORING_SETUP_COOP_TASKRUN + opt_flags |= IORING_SETUP_COOP_TASKRUN; +#endif +#ifdef IORING_SETUP_SINGLE_ISSUER + opt_flags |= IORING_SETUP_SINGLE_ISSUER; +#endif + + params.flags |= opt_flags; + int fd = sys_io_uring_setup((unsigned)queue_depth, &params); + if (fd < 0 && opt_flags) { + /* Retry without optional flags */ + memset(&params, 0, sizeof(params)); + params.flags = IORING_SETUP_CQSIZE; + params.cq_entries = (unsigned)(buf_count * 4); + fd = sys_io_uring_setup((unsigned)queue_depth, &params); + } + if (fd < 0) { + mylog(log_info, "io_uring: io_uring_setup failed (errno %d), using fallback\n", errno); + return -1; + } + ctx->ring_fd = fd; + ctx->sq_entries = params.sq_entries; + ctx->cq_entries = params.cq_entries; + + /* 2. mmap SQ ring */ + ctx->sq_ring_sz = (size_t)(params.sq_off.array + params.sq_entries * sizeof(unsigned)); + ctx->sq_ring_ptr = mmap(NULL, ctx->sq_ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (ctx->sq_ring_ptr == MAP_FAILED) { + mylog(log_warn, "io_uring: mmap SQ ring failed\n"); + goto fail; + } + ctx->sq_head = (unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.head); + ctx->sq_tail = (unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.tail); + ctx->sq_mask = *(unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.ring_mask); + ctx->sq_array = (unsigned *)((char *)ctx->sq_ring_ptr + params.sq_off.array); + + /* 3. mmap SQEs */ + ctx->sqes_sz = (size_t)(params.sq_entries * sizeof(struct io_uring_sqe)); + ctx->sqes = (struct io_uring_sqe *)mmap(NULL, ctx->sqes_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (ctx->sqes == MAP_FAILED) { + mylog(log_warn, "io_uring: mmap SQEs failed\n"); + goto fail; + } + + /* 4.
mmap CQ ring */ + ctx->cq_ring_sz = (size_t)(params.cq_off.cqes + params.cq_entries * sizeof(struct io_uring_cqe)); + ctx->cq_ring_ptr = mmap(NULL, ctx->cq_ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); + if (ctx->cq_ring_ptr == MAP_FAILED) { + mylog(log_warn, "io_uring: mmap CQ ring failed\n"); + goto fail; + } + ctx->cq_head = (unsigned *)((char *)ctx->cq_ring_ptr + params.cq_off.head); + ctx->cq_tail = (unsigned *)((char *)ctx->cq_ring_ptr + params.cq_off.tail); + ctx->cq_mask = *(unsigned *)((char *)ctx->cq_ring_ptr + params.cq_off.ring_mask); + ctx->cqes = (struct io_uring_cqe *)((char *)ctx->cq_ring_ptr + params.cq_off.cqes); + + /* 5. Allocate buffer pool */ + ctx->buf_pool = (char *)aligned_alloc(4096, (size_t)buf_count * (size_t)buf_size); + if (!ctx->buf_pool) { + mylog(log_warn, "io_uring: buf_pool alloc failed\n"); + goto fail; + } + + /* 6. Set up provided buffer ring */ + { + size_t ring_sz = sizeof(struct io_uring_buf_ring) + + (size_t)buf_count * sizeof(struct io_uring_buf); + /* Must be page-aligned for kernel registration */ + size_t page = (size_t)sysconf(_SC_PAGESIZE); + ring_sz = (ring_sz + page - 1) & ~(page - 1); + + ctx->buf_ring = (struct io_uring_buf_ring *)mmap( + NULL, ring_sz, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ctx->buf_ring == MAP_FAILED) { + ctx->buf_ring = NULL; + mylog(log_warn, "io_uring: buf_ring mmap failed\n"); + goto fail; + } + memset(ctx->buf_ring, 0, ring_sz); + ctx->buf_ring->tail = 0; + ctx->buf_ring_pending = 0; + + struct io_uring_buf_reg reg; + memset(&reg, 0, sizeof(reg)); + reg.ring_addr = (unsigned long long)ctx->buf_ring; + reg.ring_entries = (__u32)buf_count; + reg.bgid = (__u16)ctx->bgid; + + int ret = sys_io_uring_register(fd, IORING_REGISTER_PBUF_RING, &reg, 1); + if (ret < 0) { + mylog(log_info, "io_uring: REGISTER_PBUF_RING failed (errno %d), kernel too old?\n", errno); + goto fail; + } + + /* Populate the buffer ring with all buffers */ + for
(int i = 0; i < buf_count; i++) { + buf_ring_add(ctx, i); + } + ctx->buf_ring_pending = ctx->buf_ring->tail; + } + + /* 7. Initialize msghdr template for recvmsg */ + memset(&ctx->recvmsg_hdr, 0, sizeof(ctx->recvmsg_hdr)); + memset(&ctx->recvmsg_name, 0, sizeof(ctx->recvmsg_name)); + ctx->recvmsg_hdr.msg_name = &ctx->recvmsg_name; + ctx->recvmsg_hdr.msg_namelen = sizeof(ctx->recvmsg_name); + + ctx->available = 1; + mylog(log_info, "io_uring: initialized (ring_fd=%d, %d buffers × %d bytes, cq=%u)\n", + fd, buf_count, buf_size, ctx->cq_entries); + return 0; + +fail: + uring_destroy(ctx); + return -1; +} + +void +uring_destroy(uring_ctx_t *ctx) +{ + if (ctx->buf_ring) { + size_t page = (size_t)sysconf(_SC_PAGESIZE); + size_t ring_sz = sizeof(struct io_uring_buf_ring) + + (size_t)ctx->buf_count * sizeof(struct io_uring_buf); + ring_sz = (ring_sz + page - 1) & ~(page - 1); + munmap(ctx->buf_ring, ring_sz); + ctx->buf_ring = NULL; + } + free(ctx->buf_pool); + ctx->buf_pool = NULL; + + if (ctx->sqes && ctx->sqes != MAP_FAILED) + munmap(ctx->sqes, ctx->sqes_sz); + if (ctx->sq_ring_ptr && ctx->sq_ring_ptr != MAP_FAILED) + munmap(ctx->sq_ring_ptr, ctx->sq_ring_sz); + if (ctx->cq_ring_ptr && ctx->cq_ring_ptr != MAP_FAILED) + munmap(ctx->cq_ring_ptr, ctx->cq_ring_sz); + + if (ctx->ring_fd >= 0) + close(ctx->ring_fd); + + ctx->ring_fd = -1; + ctx->available = 0; +} + +int +uring_add_multishot_recvmsg(uring_ctx_t *ctx, int fd, uint64_t user_data) +{ + struct io_uring_sqe *sqe = get_sqe(ctx); + if (!sqe) return -1; + + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = IORING_OP_RECVMSG; + sqe->fd = fd; + sqe->user_data = user_data; + sqe->flags = IOSQE_BUFFER_SELECT; + sqe->ioprio = IORING_RECV_MULTISHOT; + sqe->addr = (unsigned long long)&ctx->recvmsg_hdr; + sqe->buf_group = (__u16)ctx->bgid; + + submit_sqe(ctx); + return 0; +} + +int +uring_add_multishot_recv(uring_ctx_t *ctx, int fd, uint64_t user_data) +{ + struct io_uring_sqe *sqe = get_sqe(ctx); + if (!sqe) return -1; + + 
memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = IORING_OP_RECV; + sqe->fd = fd; + sqe->user_data = user_data; + sqe->flags = IOSQE_BUFFER_SELECT; + sqe->ioprio = IORING_RECV_MULTISHOT; + sqe->buf_group = (__u16)ctx->bgid; + + submit_sqe(ctx); + return 0; +} + +int +uring_cancel(uring_ctx_t *ctx, uint64_t user_data) +{ + struct io_uring_sqe *sqe = get_sqe(ctx); + if (!sqe) return -1; + + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = IORING_OP_ASYNC_CANCEL; + sqe->addr = user_data; /* cancels SQE matching this user_data */ + + submit_sqe(ctx); + return 0; +} + +int +uring_submit(uring_ctx_t *ctx) +{ + unsigned submitted = *ctx->sq_tail - io_uring_smp_load_acquire(ctx->sq_head); + if (submitted == 0) return 0; + + int ret = sys_io_uring_enter(ctx->ring_fd, submitted, 0, + IORING_ENTER_SQ_WAKEUP, NULL, 0); + if (ret < 0) { + mylog(log_warn, "io_uring: io_uring_enter submit failed (errno %d)\n", errno); + return -1; + } + return ret; +} + +int +uring_submit_and_flush(uring_ctx_t *ctx) +{ + unsigned submitted = *ctx->sq_tail - io_uring_smp_load_acquire(ctx->sq_head); + unsigned flags = IORING_ENTER_GETEVENTS; + if (submitted > 0) + flags |= IORING_ENTER_SQ_WAKEUP; + + int ret = sys_io_uring_enter(ctx->ring_fd, submitted, 0, flags, NULL, 0); + if (ret < 0) { + mylog(log_warn, "io_uring: io_uring_enter submit+flush failed (errno %d)\n", errno); + return -1; + } + return ret; +} + +/* --- Batched CQ drain API ------------------------------------------------ */ + +unsigned +uring_cq_ready(uring_ctx_t *ctx) +{ + /* Acquire on tail ensures we see CQE data the kernel wrote before updating tail */ + unsigned head = *ctx->cq_head; /* our variable, no barrier needed */ + unsigned tail = io_uring_smp_load_acquire(ctx->cq_tail); + return tail - head; +} + +struct io_uring_cqe * +uring_cqe_at(uring_ctx_t *ctx, unsigned idx) +{ + /* idx is offset from current cq_head */ + unsigned head = *ctx->cq_head; + return &ctx->cqes[(head + idx) & ctx->cq_mask]; +} + +void 
+uring_cq_advance(uring_ctx_t *ctx, unsigned n) +{ + if (n == 0) return; + unsigned head = *ctx->cq_head; + io_uring_smp_store_release(ctx->cq_head, head + n); +} + +/* --- Batched buffer ring API --------------------------------------------- */ + +void +uring_recycle_buf(uring_ctx_t *ctx, int buf_id) +{ + buf_ring_add_deferred(ctx, buf_id); +} + +void +uring_buf_ring_commit(uring_ctx_t *ctx) +{ + /* Single atomic publish of all deferred buffer additions */ + __atomic_store_n(&ctx->buf_ring->tail, ctx->buf_ring_pending, __ATOMIC_RELEASE); +} + +void +uring_flush(uring_ctx_t *ctx) +{ + /* Trigger deferred completions by entering with GETEVENTS */ + sys_io_uring_enter(ctx->ring_fd, 0, 0, IORING_ENTER_GETEVENTS, NULL, 0); +} + +int +uring_parse_recvmsg_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe, + uring_recv_buf_t *out) +{ + if (cqe->res < 0) return -1; + + if (!(cqe->flags & IORING_CQE_F_BUFFER)) { + mylog(log_debug, "io_uring: recvmsg CQE missing BUFFER flag\n"); + return -1; + } + + int buf_id = (int)(cqe->flags >> IORING_CQE_BUFFER_SHIFT); + if (buf_id < 0 || buf_id >= ctx->buf_count) return -1; + + /* Kernel writes at registered addr = pool + id*buf_size + HEADROOM */ + char *kernel_start = ctx->buf_pool + (long)buf_id * ctx->buf_size + URING_RECV_HEADROOM; + struct io_uring_recvmsg_out *hdr = (struct io_uring_recvmsg_out *)kernel_start; + + out->buf_id = buf_id; + out->addr_len = (socklen_t)(hdr->namelen < sizeof(out->addr) ? hdr->namelen : sizeof(out->addr)); + memcpy(&out->addr, kernel_start + sizeof(*hdr), out->addr_len); + /* Kernel reserves msg_namelen bytes (from template) for name area, + not hdr->namelen (actual). Use template sizes for offset. 
*/ + int header_len = (int)(sizeof(*hdr) + ctx->recvmsg_hdr.msg_namelen + + ctx->recvmsg_hdr.msg_controllen); + out->data = kernel_start + header_len; + int max_payload = ctx->buf_size - URING_RECV_HEADROOM - header_len; + out->len = (int)hdr->payloadlen; + if (out->len > max_payload) out->len = max_payload; + + return 0; +} + +int +uring_parse_recv_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe, + uring_recv_buf_t *out) +{ + if (cqe->res < 0) return -1; + + if (!(cqe->flags & IORING_CQE_F_BUFFER)) { + mylog(log_debug, "io_uring: recv CQE missing BUFFER flag\n"); + return -1; + } + + int buf_id = (int)(cqe->flags >> IORING_CQE_BUFFER_SHIFT); + if (buf_id < 0 || buf_id >= ctx->buf_count) return -1; + + /* Kernel writes at registered addr = pool + id*buf_size + HEADROOM */ + char *kernel_start = ctx->buf_pool + (long)buf_id * ctx->buf_size + URING_RECV_HEADROOM; + out->buf_id = buf_id; + out->data = kernel_start; + out->len = cqe->res; + out->addr_len = 0; + + return 0; +} + +#endif /* __linux__ */ diff --git a/io_uring_recv.h b/io_uring_recv.h new file mode 100644 index 0000000..4ed24a3 --- /dev/null +++ b/io_uring_recv.h @@ -0,0 +1,150 @@ +#ifndef IO_URING_RECV_H_ +#define IO_URING_RECV_H_ + +#include "common.h" + +#ifdef __linux__ + +#include +#include + +/* --- Kernel constant fallbacks (for older headers) ----------------------- */ + +#include +#include + +/* + * Macro fallbacks — these are #define'd in kernel headers (not enums), + * so #ifndef works reliably. Struct/enum fallbacks are NOT provided; + * compilation requires kernel headers 6.0+ (Ubuntu 22.04 HWE or 24.04). + * Runtime probe handles older kernels gracefully. 
+ */ +#ifndef IORING_RECV_MULTISHOT +#define IORING_RECV_MULTISHOT (1U << 1) +#endif +#ifndef IORING_CQE_F_MORE +#define IORING_CQE_F_MORE (1U << 1) +#endif +#ifndef IORING_CQE_F_BUFFER +#define IORING_CQE_F_BUFFER (1U << 0) +#endif +#ifndef IORING_CQE_BUFFER_SHIFT +#define IORING_CQE_BUFFER_SHIFT 16 +#endif + +/* --- Public API ---------------------------------------------------------- */ + +struct uring_recv_buf_t { + char *data; + int len; + struct sockaddr_storage addr; + socklen_t addr_len; + int buf_id; +}; + +struct uring_ctx_t { + int ring_fd; + int available; + + /* mmap'd ring pointers */ + void *sq_ring_ptr; + size_t sq_ring_sz; + void *cq_ring_ptr; + size_t cq_ring_sz; + struct io_uring_sqe *sqes; + size_t sqes_sz; + + /* SQ ring offsets */ + unsigned *sq_head; + unsigned *sq_tail; + unsigned *sq_array; + unsigned sq_mask; + unsigned sq_entries; + + /* CQ ring offsets */ + unsigned *cq_head; + unsigned *cq_tail; + unsigned cq_mask; + unsigned cq_entries; + struct io_uring_cqe *cqes; + + /* Provided buffer ring */ + struct io_uring_buf_ring *buf_ring; + unsigned short buf_ring_pending; /* shadow tail for deferred recycling */ + char *buf_pool; + int buf_count; + int buf_size; /* size per buffer including header room */ + int bgid; + + /* msghdr template for multishot recvmsg */ + struct msghdr recvmsg_hdr; + struct sockaddr_storage recvmsg_name; +}; + +/* User data tag encode/decode */ +static inline uint64_t uring_tag(uint8_t type, uint64_t payload) { + return ((uint64_t)type << 56) | (payload & 0x00FFFFFFFFFFFFFFULL); +} +static inline uint8_t uring_tag_type(uint64_t user_data) { + return (uint8_t)(user_data >> 56); +} +static inline uint64_t uring_tag_payload(uint64_t user_data) { + return user_data & 0x00FFFFFFFFFFFFFFULL; +} + +/* Tag types */ +#define URING_TAG_CLIENT_LOCAL 0x01 +#define URING_TAG_CLIENT_REMOTE 0x02 +#define URING_TAG_SERVER_LOCAL 0x03 +#define URING_TAG_SERVER_REMOTE 0x04 + +/* Headroom reserved before each provided buffer for 
in-place conv header */ +#define URING_RECV_HEADROOM 4 /* sizeof(u32_t) */ + +int uring_init(uring_ctx_t *ctx, int queue_depth, int buf_count, int buf_size); +void uring_destroy(uring_ctx_t *ctx); + +int uring_add_multishot_recvmsg(uring_ctx_t *ctx, int fd, uint64_t user_data); +int uring_add_multishot_recv(uring_ctx_t *ctx, int fd, uint64_t user_data); +int uring_cancel(uring_ctx_t *ctx, uint64_t user_data); +int uring_submit(uring_ctx_t *ctx); + +/* Batched CQ drain */ +unsigned uring_cq_ready(uring_ctx_t *ctx); +struct io_uring_cqe *uring_cqe_at(uring_ctx_t *ctx, unsigned idx); +void uring_cq_advance(uring_ctx_t *ctx, unsigned n); + +int uring_submit_and_flush(uring_ctx_t *ctx); +void uring_flush(uring_ctx_t *ctx); + +int uring_parse_recvmsg_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe, + uring_recv_buf_t *out); +int uring_parse_recv_cqe(uring_ctx_t *ctx, struct io_uring_cqe *cqe, + uring_recv_buf_t *out); +void uring_recycle_buf(uring_ctx_t *ctx, int buf_id); +void uring_buf_ring_commit(uring_ctx_t *ctx); + +/* Global pointer — set in tunnel event loop, used by connection.cpp cleanup */ +extern uring_ctx_t *g_uring_ctx; + +#else /* !__linux__ */ + +/* Stubs for non-Linux — always unavailable */ +struct uring_recv_buf_t { char *data; int len; int buf_id; }; +struct uring_ctx_t { int available; }; +static inline int uring_init(uring_ctx_t *ctx, int, int, int) { ctx->available = 0; return -1; } +static inline void uring_destroy(uring_ctx_t *) {} + +/* Tag helpers still available for compilation */ +static inline uint64_t uring_tag(uint8_t type, uint64_t payload) { + return ((uint64_t)type << 56) | (payload & 0x00FFFFFFFFFFFFFFULL); +} +#define URING_TAG_CLIENT_LOCAL 0x01 +#define URING_TAG_CLIENT_REMOTE 0x02 +#define URING_TAG_SERVER_LOCAL 0x03 +#define URING_TAG_SERVER_REMOTE 0x04 + +extern uring_ctx_t *g_uring_ctx; + +#endif /* __linux__ */ +#endif /* IO_URING_RECV_H_ */ diff --git a/lib/fec.cpp b/lib/fec.cpp index 982a7f1..7b559f4 100644 --- a/lib/fec.cpp +++ 
b/lib/fec.cpp @@ -210,6 +210,27 @@ init_mul_table() for (j=0; j< GF_SIZE+1; j++) gf_mul_table[0][j] = gf_mul_table[j][0] = 0; } + +/* + * SIMD nibble lookup tables for GF(2^8) multiply-by-constant. + * For each constant c, lo_table[c][i] = c*i and hi_table[c][i] = c*(i<<4). + * This enables PSHUFB/TBL to process 16 bytes per instruction pair. + */ +static gf gf_lo_table[GF_SIZE + 1][16] __attribute__((aligned(16))); +static gf gf_hi_table[GF_SIZE + 1][16] __attribute__((aligned(16))); + +static void +init_simd_tables() +{ + int c, i; + for (c = 0; c <= GF_SIZE; c++) { + for (i = 0; i < 16; i++) { + gf_lo_table[c][i] = gf_mul_table[c][i]; + gf_hi_table[c][i] = gf_mul_table[c][i << 4]; + } + } +} + #else /* GF_BITS > 8 */ static inline gf gf_mul(x,y) @@ -326,27 +347,308 @@ generate_gf(void) /* * addmul() computes dst[] = dst[] + c * src[] - * This is used often, so better optimize it! Currently the loop is - * unrolled 16 times, a good value for 486 and pentium-class machines. - * The case c=0 is also optimized, whereas c=1 is not. These - * calls are unfrequent in my typical apps so I did not bother. - * - * Note that gcc on + * + * SIMD paths use nibble decomposition: c*x = lo_table[x & 0x0F] ^ hi_table[x >> 4] + * where each table has 16 entries fitting in one 128-bit SIMD register. + * PSHUFB (x86 SSSE3) / TBL (ARM NEON) performs 16 parallel lookups. 
*/ #define addmul(dst, src, c, sz) \ if (c != 0) addmul1(dst, src, c, sz) +#if defined(__x86_64__) +#include +#include + +static int cpu_has_avx2(void) +{ + unsigned int eax, ebx, ecx, edx; + + /* OSXSAVE — OS supports XSAVE */ + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) + return 0; + if (!(ecx & (1u << 27))) + return 0; + + /* XCR0 bits 1-2 — OS saves SSE+AVX state */ + unsigned int xcr0; + __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "edx"); + if ((xcr0 & 0x6) != 0x6) + return 0; + + /* AVX2: leaf 7, sub-leaf 0, EBX bit 5 */ + if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) + return 0; + return (ebx >> 5) & 1; +} + +static int cpu_has_avx512bw(void) +{ + unsigned int eax, ebx, ecx, edx; + + /* OSXSAVE — OS supports XSAVE */ + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) + return 0; + if (!(ecx & (1u << 27))) + return 0; + + /* XCR0 bits 1,2 (SSE+AVX) + 5,6,7 (opmask, ZMM_Hi256, Hi16_ZMM) */ + unsigned int xcr0; + __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "edx"); + if ((xcr0 & 0xE6) != 0xE6) + return 0; + + /* AVX-512BW: leaf 7, sub-leaf 0, EBX bit 30 */ + if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) + return 0; + return (ebx >> 30) & 1; +} + +__attribute__((target("ssse3"))) +static void +addmul1_ssse3(gf *dst, gf *src, gf c, int sz) +{ + __m128i tbl_lo = _mm_load_si128((const __m128i *)gf_lo_table[c]); + __m128i tbl_hi = _mm_load_si128((const __m128i *)gf_hi_table[c]); + __m128i mask = _mm_set1_epi8(0x0F); + + int i = 0; + /* 2x unrolled: process 32 bytes per iteration for better ILP */ + for (; i + 32 <= sz; i += 32) { + __m128i x1 = _mm_loadu_si128((const __m128i *)(src + i)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(src + i + 16)); + __m128i lo1 = _mm_shuffle_epi8(tbl_lo, _mm_and_si128(x1, mask)); + __m128i hi1 = _mm_shuffle_epi8(tbl_hi, _mm_and_si128(_mm_srli_epi64(x1, 4), mask)); + __m128i lo2 = _mm_shuffle_epi8(tbl_lo, _mm_and_si128(x2, mask)); + __m128i hi2 = _mm_shuffle_epi8(tbl_hi, 
_mm_and_si128(_mm_srli_epi64(x2, 4), mask)); + __m128i d1 = _mm_loadu_si128((const __m128i *)(dst + i)); + __m128i d2 = _mm_loadu_si128((const __m128i *)(dst + i + 16)); + _mm_storeu_si128((__m128i *)(dst + i), + _mm_xor_si128(d1, _mm_xor_si128(lo1, hi1))); + _mm_storeu_si128((__m128i *)(dst + i + 16), + _mm_xor_si128(d2, _mm_xor_si128(lo2, hi2))); + } + /* 16-byte tail */ + for (; i + 16 <= sz; i += 16) { + __m128i x = _mm_loadu_si128((const __m128i *)(src + i)); + __m128i lo = _mm_shuffle_epi8(tbl_lo, _mm_and_si128(x, mask)); + __m128i hi = _mm_shuffle_epi8(tbl_hi, + _mm_and_si128(_mm_srli_epi64(x, 4), mask)); + __m128i d = _mm_loadu_si128((const __m128i *)(dst + i)); + _mm_storeu_si128((__m128i *)(dst + i), + _mm_xor_si128(d, _mm_xor_si128(lo, hi))); + } + + /* scalar tail */ + USE_GF_MULC ; + GF_MULC0(c) ; + for (; i < sz; i++) + GF_ADDMULC(dst[i], src[i]); +} + +__attribute__((target("avx2"))) +static void +addmul1_avx2(gf *dst, gf *src, gf c, int sz) +{ + __m128i tbl128_lo = _mm_load_si128((const __m128i *)gf_lo_table[c]); + __m128i tbl128_hi = _mm_load_si128((const __m128i *)gf_hi_table[c]); + __m256i tbl_lo = _mm256_broadcastsi128_si256(tbl128_lo); + __m256i tbl_hi = _mm256_broadcastsi128_si256(tbl128_hi); + __m256i mask = _mm256_set1_epi8(0x0F); + + int i = 0; + /* 2x unrolled: process 64 bytes per iteration for better ILP */ + for (; i + 64 <= sz; i += 64) { + __m256i x1 = _mm256_loadu_si256((const __m256i *)(src + i)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(src + i + 32)); + __m256i lo1 = _mm256_shuffle_epi8(tbl_lo, _mm256_and_si256(x1, mask)); + __m256i hi1 = _mm256_shuffle_epi8(tbl_hi, _mm256_and_si256(_mm256_srli_epi64(x1, 4), mask)); + __m256i lo2 = _mm256_shuffle_epi8(tbl_lo, _mm256_and_si256(x2, mask)); + __m256i hi2 = _mm256_shuffle_epi8(tbl_hi, _mm256_and_si256(_mm256_srli_epi64(x2, 4), mask)); + __m256i d1 = _mm256_loadu_si256((const __m256i *)(dst + i)); + __m256i d2 = _mm256_loadu_si256((const __m256i *)(dst + i + 32)); + 
_mm256_storeu_si256((__m256i *)(dst + i), + _mm256_xor_si256(d1, _mm256_xor_si256(lo1, hi1))); + _mm256_storeu_si256((__m256i *)(dst + i + 32), + _mm256_xor_si256(d2, _mm256_xor_si256(lo2, hi2))); + } + /* 32-byte tail */ + for (; i + 32 <= sz; i += 32) { + __m256i x = _mm256_loadu_si256((const __m256i *)(src + i)); + __m256i lo = _mm256_shuffle_epi8(tbl_lo, _mm256_and_si256(x, mask)); + __m256i hi = _mm256_shuffle_epi8(tbl_hi, + _mm256_and_si256(_mm256_srli_epi64(x, 4), mask)); + __m256i d = _mm256_loadu_si256((const __m256i *)(dst + i)); + _mm256_storeu_si256((__m256i *)(dst + i), + _mm256_xor_si256(d, _mm256_xor_si256(lo, hi))); + } + + /* SSE tail: at most one 16-byte chunk */ + if (i + 16 <= sz) { + __m128i mx = _mm_set1_epi8(0x0F); + __m128i x = _mm_loadu_si128((const __m128i *)(src + i)); + __m128i lo = _mm_shuffle_epi8(tbl128_lo, _mm_and_si128(x, mx)); + __m128i hi = _mm_shuffle_epi8(tbl128_hi, + _mm_and_si128(_mm_srli_epi64(x, 4), mx)); + __m128i d = _mm_loadu_si128((const __m128i *)(dst + i)); + _mm_storeu_si128((__m128i *)(dst + i), + _mm_xor_si128(d, _mm_xor_si128(lo, hi))); + i += 16; + } + + /* scalar tail */ + USE_GF_MULC ; + GF_MULC0(c) ; + for (; i < sz; i++) + GF_ADDMULC(dst[i], src[i]); +} + +__attribute__((target("avx512bw"))) +static void +addmul1_avx512(gf *dst, gf *src, gf c, int sz) +{ + __m512i tbl_lo = _mm512_broadcast_i32x4( + _mm_load_si128((const __m128i *)gf_lo_table[c])); + __m512i tbl_hi = _mm512_broadcast_i32x4( + _mm_load_si128((const __m128i *)gf_hi_table[c])); + __m512i mask = _mm512_set1_epi8(0x0F); + + int i = 0; + /* 2x unrolled: process 128 bytes per iteration for better ILP */ + for (; i + 128 <= sz; i += 128) { + __m512i x1 = _mm512_loadu_si512(src + i); + __m512i x2 = _mm512_loadu_si512(src + i + 64); + __m512i lo1 = _mm512_shuffle_epi8(tbl_lo, _mm512_and_si512(x1, mask)); + __m512i hi1 = _mm512_shuffle_epi8(tbl_hi, + _mm512_and_si512(_mm512_srli_epi64(x1, 4), mask)); + __m512i lo2 = _mm512_shuffle_epi8(tbl_lo, 
_mm512_and_si512(x2, mask)); + __m512i hi2 = _mm512_shuffle_epi8(tbl_hi, + _mm512_and_si512(_mm512_srli_epi64(x2, 4), mask)); + __m512i d1 = _mm512_loadu_si512(dst + i); + __m512i d2 = _mm512_loadu_si512(dst + i + 64); + _mm512_storeu_si512(dst + i, + _mm512_ternarylogic_epi64(d1, lo1, hi1, 0x96)); + _mm512_storeu_si512(dst + i + 64, + _mm512_ternarylogic_epi64(d2, lo2, hi2, 0x96)); + } + /* 64-byte tail */ + for (; i + 64 <= sz; i += 64) { + __m512i x = _mm512_loadu_si512(src + i); + __m512i lo = _mm512_shuffle_epi8(tbl_lo, _mm512_and_si512(x, mask)); + __m512i hi = _mm512_shuffle_epi8(tbl_hi, + _mm512_and_si512(_mm512_srli_epi64(x, 4), mask)); + __m512i d = _mm512_loadu_si512(dst + i); + _mm512_storeu_si512(dst + i, + _mm512_ternarylogic_epi64(d, lo, hi, 0x96)); + } + + /* AVX2 tail: at most one 32-byte chunk */ + if (i + 32 <= sz) { + __m256i tbl256_lo = _mm256_broadcastsi128_si256( + _mm_load_si128((const __m128i *)gf_lo_table[c])); + __m256i tbl256_hi = _mm256_broadcastsi128_si256( + _mm_load_si128((const __m128i *)gf_hi_table[c])); + __m256i m256 = _mm256_set1_epi8(0x0F); + __m256i x = _mm256_loadu_si256((const __m256i *)(src + i)); + __m256i lo = _mm256_shuffle_epi8(tbl256_lo, _mm256_and_si256(x, m256)); + __m256i hi = _mm256_shuffle_epi8(tbl256_hi, + _mm256_and_si256(_mm256_srli_epi64(x, 4), m256)); + __m256i d = _mm256_loadu_si256((const __m256i *)(dst + i)); + _mm256_storeu_si256((__m256i *)(dst + i), + _mm256_xor_si256(d, _mm256_xor_si256(lo, hi))); + i += 32; + } + + /* SSE tail: at most one 16-byte chunk */ + if (i + 16 <= sz) { + __m128i mx = _mm_set1_epi8(0x0F); + __m128i x = _mm_loadu_si128((const __m128i *)(src + i)); + __m128i lo = _mm_shuffle_epi8( + _mm_load_si128((const __m128i *)gf_lo_table[c]), + _mm_and_si128(x, mx)); + __m128i hi = _mm_shuffle_epi8( + _mm_load_si128((const __m128i *)gf_hi_table[c]), + _mm_and_si128(_mm_srli_epi64(x, 4), mx)); + __m128i d = _mm_loadu_si128((const __m128i *)(dst + i)); + _mm_storeu_si128((__m128i *)(dst + i), 
+ _mm_xor_si128(d, _mm_xor_si128(lo, hi))); + i += 16; + } + + /* scalar tail */ + USE_GF_MULC ; + GF_MULC0(c) ; + for (; i < sz; i++) + GF_ADDMULC(dst[i], src[i]); +} + +static void (*addmul1_x86_fn)(gf *, gf *, gf, int) = addmul1_ssse3; +#endif /* __x86_64__ */ + +#if defined(__aarch64__) +#include + +static void +addmul1_neon(gf *dst, gf *src, gf c, int sz) +{ + uint8x16_t tbl_lo = vld1q_u8(gf_lo_table[c]); + uint8x16_t tbl_hi = vld1q_u8(gf_hi_table[c]); + uint8x16_t mask = vdupq_n_u8(0x0F); + + int i = 0; + for (; i + 32 <= sz; i += 32) { + uint8x16_t x1 = vld1q_u8(src + i); + uint8x16_t x2 = vld1q_u8(src + i + 16); + uint8x16_t lo1 = vqtbl1q_u8(tbl_lo, vandq_u8(x1, mask)); + uint8x16_t hi1 = vqtbl1q_u8(tbl_hi, vshrq_n_u8(x1, 4)); + uint8x16_t lo2 = vqtbl1q_u8(tbl_lo, vandq_u8(x2, mask)); + uint8x16_t hi2 = vqtbl1q_u8(tbl_hi, vshrq_n_u8(x2, 4)); + uint8x16_t d1 = vld1q_u8(dst + i); + uint8x16_t d2 = vld1q_u8(dst + i + 16); + vst1q_u8(dst + i, veorq_u8(d1, veorq_u8(lo1, hi1))); + vst1q_u8(dst + i + 16, veorq_u8(d2, veorq_u8(lo2, hi2))); + } + for (; i + 16 <= sz; i += 16) { + uint8x16_t x = vld1q_u8(src + i); + uint8x16_t lo = vqtbl1q_u8(tbl_lo, vandq_u8(x, mask)); + uint8x16_t hi = vqtbl1q_u8(tbl_hi, vshrq_n_u8(x, 4)); + uint8x16_t d = vld1q_u8(dst + i); + vst1q_u8(dst + i, veorq_u8(d, veorq_u8(lo, hi))); + } + + /* scalar tail */ + USE_GF_MULC ; + GF_MULC0(c) ; + for (; i < sz; i++) + GF_ADDMULC(dst[i], src[i]); +} +#endif /* __aarch64__ */ + #define UNROLL 16 /* 1, 4, 8, 16 */ static void addmul1(gf *dst1, gf *src1, gf c, int sz) { +#if defined(__x86_64__) + addmul1_x86_fn(dst1, src1, c, sz); +#elif defined(__aarch64__) + addmul1_neon(dst1, src1, c, sz); +#else + /* + * Scalar fallback for MIPS, i486, ARMv7, etc. + * + * NOT auto-vectorizable: the 256-entry table lookup (gf_mulc_table[c][src[i]]) + * is a data-dependent gather. The nibble decomposition that makes PSHUFB/TBL + * work requires GF(2^8) algebraic insight no compiler performs. 
Pragmas like + * omp simd, __restrict__, and -ftree-vectorize don't help — they grant + * permission to vectorize but can't transform the lookup. Deliberate choice. + */ + if (sz <= 0) return; USE_GF_MULC ; gf *dst = dst1, *src = src1 ; gf *lim = &dst[sz - UNROLL + 1] ; GF_MULC0(c) ; -#if (UNROLL > 1) /* unrolling by 8/16 is quite effective on the pentium */ +#if (UNROLL > 1) for (; dst < lim ; dst += UNROLL, src += UNROLL ) { GF_ADDMULC( dst[0] , src[0] ); GF_ADDMULC( dst[1] , src[1] ); @@ -371,8 +673,9 @@ addmul1(gf *dst1, gf *src1, gf c, int sz) } #endif lim += UNROLL - 1 ; - for (; dst < lim; dst++, src++ ) /* final components */ + for (; dst < lim; dst++, src++ ) GF_ADDMULC( *dst , *src ); +#endif /* architecture dispatch */ } /* @@ -429,13 +732,12 @@ invert_mat(gf *src, int k) int irow, icol, row, col, i, ix ; int error = 1 ; - int *indxc = (int*)my_malloc(k*sizeof(int), "indxc"); - int *indxr = (int*)my_malloc(k*sizeof(int), "indxr"); - int *ipiv = (int*)my_malloc(k*sizeof(int), "ipiv"); - gf *id_row = NEW_GF_MATRIX(1, k); - gf *temp_row = NEW_GF_MATRIX(1, k); + int indxc[k]; + int indxr[k]; + int ipiv[k]; + gf id_row[k]; - bzero(id_row, k*sizeof(gf)); + memset(id_row, 0, (unsigned)k * sizeof(gf)); DEB( pivloops=0; pivswaps=0 ; /* diagnostic */ ) /* * ipiv marks elements already used as pivots. 
@@ -540,11 +842,6 @@ invert_mat(gf *src, int k) } error = 0 ; fail: - free(indxc); - free(indxr); - free(ipiv); - free(id_row); - free(temp_row); return error ; } @@ -628,6 +925,15 @@ init_fec() init_mul_table(); TOCK(ticks[0]); DDB(fprintf(stderr, "init_mul_table took %ldus\n", ticks[0]);) +#if (GF_BITS <= 8) + init_simd_tables(); +#endif +#if defined(__x86_64__) + if (cpu_has_avx512bw()) + addmul1_x86_fn = addmul1_avx512; + else if (cpu_has_avx2()) + addmul1_x86_fn = addmul1_avx2; +#endif fec_initialized = 1 ; } @@ -643,6 +949,9 @@ struct fec_parms { u_long magic ; int k, n ; /* parameters of the code */ gf *enc_matrix ; + gf *dec_matrix ; /* k*k scratch for build_decode_matrix */ + gf *dec_buf ; /* k*dec_buf_sz scratch for fec_decode */ + int dec_buf_sz ; /* current sz capacity, 0 = not yet allocated */ } ; void @@ -655,6 +964,8 @@ fec_free(void *p0) return ; } free(p->enc_matrix); + free(p->dec_matrix); + free(p->dec_buf); free(p); } @@ -682,6 +993,9 @@ fec_new(int k, int n) retval->k = k ; retval->n = n ; retval->enc_matrix = NEW_GF_MATRIX(n, k); + retval->dec_matrix = NEW_GF_MATRIX(k, k); + retval->dec_buf = NULL ; + retval->dec_buf_sz = 0 ; retval->magic = ( ( FEC_MAGIC ^ k) ^ n) ^ (int)((long)retval->enc_matrix) ; tmp_m = NEW_GF_MATRIX(n, k); /* @@ -792,11 +1106,11 @@ shuffle(gf *pkt[], int index[], int k) * indexes. 
The matrix must be already allocated as * a vector of k*k elements, in row-major order */ -static gf * -build_decode_matrix(struct fec_parms *code, gf *pkt[], int index[]) +static int +build_decode_matrix(struct fec_parms *code, gf *pkt[], int index[], gf *matrix) { int i , k = code->k ; - gf *p, *matrix = NEW_GF_MATRIX(k, k); + gf *p ; TICK(ticks[9]); for (i = 0, p = matrix ; i < k ; i++, p += k ) { @@ -807,21 +1121,19 @@ build_decode_matrix(struct fec_parms *code, gf *pkt[], int index[]) } else #endif if (index[i] < code->n ) - bcopy( &(code->enc_matrix[index[i]*k]), p, k*sizeof(gf) ); + bcopy( &(code->enc_matrix[index[i]*k]), p, k*sizeof(gf) ); else { fprintf(stderr, "decode: invalid index %d (max %d)\n", index[i], code->n - 1 ); - free(matrix) ; - return NULL ; + return -1 ; } } TICK(ticks[9]); if (invert_mat(matrix, k)) { - free(matrix); - matrix = NULL ; + return -1 ; } TOCK(ticks[9]); - return matrix ; + return 0 ; } /* @@ -841,29 +1153,32 @@ fec_decode(void *code0, void *pkt0[], int index[], int sz) { struct fec_parms * code=(struct fec_parms*)code0; gf **pkt=(gf**)pkt0; - gf *m_dec ; - gf **new_pkt ; int row, col , k = code->k ; + gf *new_pkt[k] ; if (GF_BITS > 8) sz /= 2 ; if (shuffle(pkt, index, k)) /* error if true */ return 1 ; - m_dec = build_decode_matrix(code, pkt, index); - - if (m_dec == NULL) + if (build_decode_matrix(code, pkt, index, code->dec_matrix)) return 1 ; /* error */ + + /* ensure decode scratch buffer is large enough */ + if (code->dec_buf_sz < sz) { + free(code->dec_buf); + code->dec_buf = (gf *)my_malloc(k * sz * sizeof(gf), "dec_buf"); + code->dec_buf_sz = sz ; + } /* * do the actual decoding */ - new_pkt = (gf** )my_malloc (k * sizeof (gf * ), "new pkt pointers" ); for (row = 0 ; row < k ; row++ ) { if (index[row] >= k) { - new_pkt[row] = (gf*)my_malloc (sz * sizeof (gf), "new pkt buffer" ); + new_pkt[row] = code->dec_buf + row * sz ; bzero(new_pkt[row], sz * sizeof(gf) ) ; for (col = 0 ; col < k ; col++ ) - addmul(new_pkt[row], 
pkt[col], m_dec[row*k + col], sz) ; + addmul(new_pkt[row], pkt[col], code->dec_matrix[row*k + col], sz) ; } } /* @@ -872,11 +1187,8 @@ fec_decode(void *code0, void *pkt0[], int index[], int sz) for (row = 0 ; row < k ; row++ ) { if (index[row] >= k) { bcopy(new_pkt[row], pkt[row], sz*sizeof(gf)); - free(new_pkt[row]); } } - free(new_pkt); - free(m_dec); return 0; } @@ -915,3 +1227,20 @@ test_gf() } } #endif /* TEST */ + +#ifdef BENCH_EXPOSE_INTERNALS +void bench_addmul1(gf *dst, gf *src, gf c, int sz) { + addmul1(dst, src, c, sz); +} +const char *bench_addmul1_impl() { +#if defined(__x86_64__) + if (addmul1_x86_fn == addmul1_avx512) return "avx512bw"; + if (addmul1_x86_fn == addmul1_avx2) return "avx2"; + return "ssse3"; +#elif defined(__aarch64__) + return "neon"; +#else + return "scalar"; +#endif +} +#endif diff --git a/makefile b/makefile index 5aa9cab..7520c13 100755 --- a/makefile +++ b/makefile @@ -10,12 +10,16 @@ cc_amd64=/toolchains/lede-sdk-17.01.2-x86-64_gcc-5.4.0_musl-1.1.16.Linux-x86_64/ #cc_bcm2708=/home/wangyu/raspberry/tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian/bin/arm-linux-gnueabihf-g++ -SOURCES0=main.cpp log.cpp common.cpp lib/fec.cpp lib/rs.cpp crc32/Crc32.cpp packet.cpp delay_manager.cpp fd_manager.cpp connection.cpp fec_manager.cpp misc.cpp tunnel_client.cpp tunnel_server.cpp -SOURCES=${SOURCES0} my_ev.cpp -isystem libev +SOURCES0=main.cpp log.cpp common.cpp lib/fec.cpp lib/rs.cpp packet.cpp packet_cook.cpp delay_manager.cpp fd_manager.cpp connection.cpp fec_manager.cpp misc.cpp tunnel_client.cpp tunnel_server.cpp io_uring_recv.cpp xor_spe.S +SOURCES=${SOURCES0} my_ev.cpp -isystem libev NAME=speederv2 -FLAGS= -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers ${OPT} +FLAGS= -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers -MMD -MP ${OPT} + +ifdef SPE +FLAGS += -DHAVE_PPC_SPE -Wa,-mspe +endif TARGETS=amd64 arm mips24kc_be x86 
mips24kc_le @@ -110,10 +114,44 @@ release2: ${TARGETS} mingw_cross mingw_cross_wepoll mac_cross cp git_version.h version.txt tar -zcvf ${TAR} ${NAME}.exe ${NAME}_wepoll.exe ${NAME}_mac -clean: +clean: rm -f ${TAR} rm -f ${NAME} ${NAME}_cross ${NAME}.exe ${NAME}_wepoll.exe ${NAME}_mac rm -f git_version.h + rm -f *.d bench/*.d lib/*.d crc32/*.d + +-include $(wildcard *.d bench/*.d lib/*.d crc32/*.d) git_version: echo "const char *gitversion = \"$(shell git rev-parse HEAD)\";" > git_version.h + +# --- Benchmark and test targets --- +BENCH_SOURCES=bench/bench_main.cpp bench/bench_fec.cpp bench/bench_crc32.cpp bench/bench_packet.cpp lib/fec.cpp lib/rs.cpp crc32/Crc32.cpp packet_cook.cpp xor_spe.S +TEST_SOURCES=bench/test_main.cpp bench/test_fec.cpp bench/test_crc32.cpp bench/test_packet.cpp lib/fec.cpp lib/rs.cpp crc32/Crc32.cpp packet_cook.cpp xor_spe.S +BENCH_FLAGS=-std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers -O2 -DBENCH_EXPOSE_INTERNALS -MMD -MP + +ifdef SPE +BENCH_FLAGS += -DHAVE_PPC_SPE -Wa,-mspe +endif + +bench: git_version + ${cc_local} -o bench_udpspeeder -I. -Ibench ${BENCH_SOURCES} ${BENCH_FLAGS} + +test: git_version + ${cc_local} -o test_udpspeeder -I. -Ibench ${TEST_SOURCES} ${BENCH_FLAGS} + ./test_udpspeeder + +bench-static: git_version + ${cc_local} -o bench_udpspeeder_static -I. -Ibench ${BENCH_SOURCES} ${BENCH_FLAGS} -static + +test-static: git_version + ${cc_local} -o test_udpspeeder_static -I. -Ibench ${TEST_SOURCES} ${BENCH_FLAGS} -static + +bench-cross: git_version + ${CC} -o bench_udpspeeder_cross -I. -Ibench ${BENCH_SOURCES} ${BENCH_FLAGS} -static -lgcc_eh + +test-cross: git_version + ${CC} -o test_udpspeeder_cross -I. -Ibench ${TEST_SOURCES} ${BENCH_FLAGS} -static -lgcc_eh + +all-cross: git_version + ${CC} -o ${NAME}_cross -I. 
${SOURCES} ${FLAGS} -lrt -static -lgcc_eh -O2 diff --git a/misc.cpp b/misc.cpp index 5918e57..34da625 100644 --- a/misc.cpp +++ b/misc.cpp @@ -13,7 +13,7 @@ int mtu_warn = 1350; int disable_mtu_warn = 1; int disable_fec = 0; -int disable_checksum = 0; +/* disable_checksum now lives in cook_ctx (packet.cpp) */ int debug_force_flush_fec = 0; @@ -207,6 +207,25 @@ int from_fec_to_normal(conn_info_t &conn_info, char *data, int len, int &out_n, return 0; } +int delay_send_batch(int n, my_time_t *delays, const dest_t &dest, char **data_arr, int *len_arr) { + if (n <= 0) return 0; + + /* Fast path: all delays zero and no random_drop → single sendmmsg */ + if (n > 1 && random_drop == 0) { + int all_zero = 1; + for (int i = 0; i < n; i++) { + if (delays[i] != 0) { all_zero = 0; break; } + } + if (all_zero) + return my_send_batch(dest, data_arr, len_arr, n); + } + + /* Slow path: individual sends with per-packet delay/drop */ + for (int i = 0; i < n; i++) + delay_send(delays[i], dest, data_arr[i], len_arr[i]); + return 0; +} + int delay_send(my_time_t delay, const dest_t &dest, char *data, int len) { // int rand=random()%100; // mylog(log_info,"rand = %d\n",rand); @@ -625,12 +644,13 @@ void process_arg(int argc, char *argv[]) { // opt_key+=opt; switch (opt) { case 'k': - sscanf(optarg, "%s\n", key_string); - mylog(log_debug, "key=%s\n", key_string); - if (strlen(key_string) == 0) { + sscanf(optarg, "%s\n", cook_ctx.key); + mylog(log_debug, "key=%s\n", cook_ctx.key); + if (strlen(cook_ctx.key) == 0) { mylog(log_fatal, "key len=0??\n"); myexit(-1); } + cook_ctx_prepare_key(&cook_ctx); break; case 'j': if (strchr(optarg, ':') == 0) { @@ -723,12 +743,12 @@ void process_arg(int argc, char *argv[]) { disable_fec = 1; } else if (strcmp(long_options[option_index].name, "disable-obscure") == 0) { mylog(log_info, "obscure disabled\n"); - disable_obscure = 1; + cook_ctx.disable_obscure = 1; } else if (strcmp(long_options[option_index].name, "disable-xor") == 0) { mylog(log_info, "xor 
disabled\n"); - disable_xor = 1; + cook_ctx.disable_xor = 1; } else if (strcmp(long_options[option_index].name, "disable-checksum") == 0) { - disable_checksum = 1; + cook_ctx.disable_checksum = 1; mylog(log_warn, "checksum disabled\n"); } else if (strcmp(long_options[option_index].name, "fix-latency") == 0) { mylog(log_info, "fix-latency enabled\n"); diff --git a/misc.h b/misc.h index 6ed637e..5350ad9 100644 --- a/misc.h +++ b/misc.h @@ -20,7 +20,7 @@ extern int mtu_warn; extern int disable_mtu_warn; extern int disable_fec; -extern int disable_checksum; +/* disable_checksum now lives in cook_ctx_t (packet_cook.h) */ extern int debug_force_flush_fec; @@ -62,6 +62,7 @@ int from_normal_to_fec(conn_info_t &conn_info, char *data, int len, int &out_n, int from_fec_to_normal(conn_info_t &conn_info, char *data, int len, int &out_n, char **&out_arr, int *&out_len, my_time_t *&out_delay); int delay_send(my_time_t delay, const dest_t &dest, char *data, int len); +int delay_send_batch(int n, my_time_t *delays, const dest_t &dest, char **data_arr, int *len_arr); int print_parameter(); int handle_command(char *s); diff --git a/packet.cpp b/packet.cpp index 87519c3..d1affff 100644 --- a/packet.cpp +++ b/packet.cpp @@ -9,10 +9,10 @@ #include "log.h" #include "packet.h" #include "misc.h" -#include "crc32/Crc32.h" +#include "crc32c.h" + +cook_ctx_t cook_ctx = { {}, 0, 0, {}, 4, 32, 0, 0, 0 }; -int iv_min = 4; -int iv_max = 32; //< 256; u64_t packet_send_count = 0; u64_t dup_packet_send_count = 0; u64_t packet_recv_count = 0; @@ -21,115 +21,8 @@ u64_t dup_packet_recv_count = 0; typedef u64_t anti_replay_seq_t; int disable_replay_filter = 0; -int disable_obscure = 0; -int disable_xor = 0; - int random_drop = 0; -char key_string[1000] = ""; - -// int local_listen_fd=-1; - -void encrypt_0(char *input, int &len, char *key) { - int i, j; - if (key[0] == 0) return; - for (i = 0, j = 0; i < len; i++, j++) { - if (key[j] == 0) j = 0; - input[i] ^= key[j]; - } -} - -void decrypt_0(char 
*input, int &len, char *key) { - int i, j; - if (key[0] == 0) return; - for (i = 0, j = 0; i < len; i++, j++) { - if (key[j] == 0) j = 0; - input[i] ^= key[j]; - } -} -int do_obscure_old(const char *input, int in_len, char *output, int &out_len) { - // memcpy(output,input,in_len); - // out_len=in_len; - // return 0; - - int i, j, k; - if (in_len > 65535 || in_len < 0) - return -1; - int iv_len = iv_min + rand() % (iv_max - iv_min); - get_fake_random_chars(output, iv_len); - memcpy(output + iv_len, input, in_len); - - output[iv_len + in_len] = (uint8_t)iv_len; - - output[iv_len + in_len] ^= output[0]; - output[iv_len + in_len] ^= key_string[0]; - - for (i = 0, j = 0, k = 1; i < in_len; i++, j++, k++) { - if (j == iv_len) j = 0; - if (key_string[k] == 0) k = 0; - output[iv_len + i] ^= output[j]; - output[iv_len + i] ^= key_string[k]; - } - - out_len = iv_len + in_len + 1; - return 0; -} - -int do_obscure(char *data, int &len) { - assert(len >= 0); - assert(len < buf_len); - - int iv_len = random_between(iv_min, iv_max); - get_fake_random_chars(data + len, iv_len); - data[iv_len + len] = (uint8_t)iv_len; - for (int i = 0, j = 0; i < len; i++, j++) { - if (j == iv_len) j = 0; - data[i] ^= data[len + j]; - } - - len = len + iv_len + 1; - return 0; -} - -int de_obscure(char *data, int &len) { - if (len < 1) return -1; - int iv_len = int((uint8_t)data[len - 1]); - - if (len < 1 + iv_len) return -1; - - len = len - 1 - iv_len; - for (int i = 0, j = 0; i < len; i++, j++) { - if (j == iv_len) j = 0; - data[i] ^= data[len + j]; - } - - return 0; -} -int de_obscure_old(const char *input, int in_len, char *output, int &out_len) { - // memcpy(output,input,in_len); - // out_len=in_len; - // return 0; - - int i, j, k; - if (in_len > 65535 || in_len < 0) { - mylog(log_debug, "in_len > 65535||in_len<0 , %d", in_len); - return -1; - } - int iv_len = int((uint8_t)(input[in_len - 1] ^ input[0] ^ key_string[0])); - out_len = in_len - 1 - iv_len; - if (out_len < 0) { - mylog(log_debug, 
"%d %d\n", in_len, out_len); - return -1; - } - for (i = 0, j = 0, k = 1; i < in_len; i++, j++, k++) { - if (j == iv_len) j = 0; - if (key_string[k] == 0) k = 0; - output[i] = input[iv_len + i] ^ input[j] ^ key_string[k]; - } - dup_packet_recv_count++; - return 0; -} - /* int sendto_fd_ip_port (int fd,u32_t ip,int port,char * buf, int len,int flags) { @@ -163,9 +56,91 @@ int send_fd(int fd, char *buf, int len, int flags) { return send(fd, buf, len, flags); } +int my_send_batch(const dest_t &dest, char **data_arr, int *len_arr, int count) { + if (count <= 0) return 0; + if (count == 1) return my_send(dest, data_arr[0], len_arr[0]); + + /* Cook all packets */ + if (dest.cook) { + for (int i = 0; i < count; i++) + do_cook(&cook_ctx, data_arr[i], len_arr[i]); + } + + /* Resolve fd and optional destination address. + * Copy address out of const dest (same as sendto_fd_addr taking addr by value). */ + int fd; + address_t addr_copy; + struct sockaddr *addr_ptr = NULL; + socklen_t addr_len = 0; + + switch (dest.type) { + case type_fd_addr: + fd = dest.inner.fd_addr.fd; + addr_copy = dest.inner.fd_addr.addr; + addr_ptr = (struct sockaddr *)&addr_copy.inner; + addr_len = addr_copy.get_len(); + break; + case type_fd64_addr: + if (!fd_manager.exist(dest.inner.fd64)) return -1; + fd = fd_manager.to_fd(dest.inner.fd64); + addr_copy = dest.inner.fd64_addr.addr; + addr_ptr = (struct sockaddr *)&addr_copy.inner; + addr_len = addr_copy.get_len(); + break; + case type_fd64: + if (!fd_manager.exist(dest.inner.fd64)) return -1; + fd = fd_manager.to_fd(dest.inner.fd64); + break; + case type_fd: + fd = dest.inner.fd; + break; + default: + for (int i = 0; i < count; i++) + my_send(dest, data_arr[i], len_arr[i]); + return count; + } + +#ifdef __linux__ + struct mmsghdr msgs[max_fec_packet_num]; + struct iovec iovecs[max_fec_packet_num]; + + for (int i = 0; i < count; i++) { + iovecs[i].iov_base = data_arr[i]; + iovecs[i].iov_len = len_arr[i]; + msgs[i].msg_hdr.msg_iov = &iovecs[i]; + 
msgs[i].msg_hdr.msg_iovlen = 1; + msgs[i].msg_hdr.msg_name = addr_ptr; + msgs[i].msg_hdr.msg_namelen = addr_len; + msgs[i].msg_hdr.msg_control = NULL; + msgs[i].msg_hdr.msg_controllen = 0; + msgs[i].msg_hdr.msg_flags = 0; + msgs[i].msg_len = 0; + } + + int ret = sendmmsg(fd, msgs, count, 0); + if (ret < 0) { + mylog(log_warn, "sendmmsg failed: %s\n", strerror(errno)); + } else if (ret < count) { + mylog(log_debug, "sendmmsg partial: %d/%d sent\n", ret, count); + } + return ret; +#else + int sent = 0; + for (int i = 0; i < count; i++) { + int ret; + if (addr_ptr) + ret = sendto(fd, data_arr[i], len_arr[i], 0, addr_ptr, addr_len); + else + ret = send(fd, data_arr[i], len_arr[i], 0); + if (ret >= 0) sent++; + } + return sent; +#endif +} + int my_send(const dest_t &dest, char *data, int len) { if (dest.cook) { - do_cook(data, len); + do_cook(&cook_ctx, data, len); } switch (dest.type) { case type_fd_addr: { @@ -238,7 +213,7 @@ int put_conv0(u32_t conv, const char *input, int len_in, char *&output, int &len u32_t n_conv = htonl(conv); memcpy(output, &n_conv, sizeof(n_conv)); memcpy(output + sizeof(n_conv), input, len_in); - u32_t crc32 = (u32_t)crc32_fast(output, len_in + sizeof(crc32)); + u32_t crc32 = (u32_t)crc32c(output, len_in + sizeof(crc32)); u32_t crc32_n = htonl(crc32); len_out = len_in + (int)(sizeof(n_conv)) + (int)sizeof(crc32_n); memcpy(output + len_in + (int)(sizeof(n_conv)), &crc32_n, sizeof(crc32_n)); @@ -258,56 +233,12 @@ int get_conv0(u32_t &conv, const char *input, int len_in, char *&output, int &le } memcpy(&crc32_n, input + len_in - (int)sizeof(crc32_n), sizeof(crc32_n)); u32_t crc32 = ntohl(crc32_n); - if (crc32 != (u32_t)crc32_fast(input, len_in - sizeof(crc32_n))) { + if (crc32 != (u32_t)crc32c(input, len_in - sizeof(crc32_n))) { mylog(log_debug, "crc32 check failed\n"); return -1; } return 0; } -int put_crc32(char *s, int &len) { - if (disable_checksum) return 0; - assert(len >= 0); - // if(len<0) return -1; - u32_t crc32 = (u32_t)crc32_fast(s, 
len); - write_u32(s + len, crc32); - len += sizeof(u32_t); - - return 0; -} - -int do_cook(char *data, int &len) { - put_crc32(data, len); - if (!disable_obscure) do_obscure(data, len); - if (!disable_xor) encrypt_0(data, len, key_string); - return 0; -} - -int de_cook(char *s, int &len) { - if (!disable_xor) decrypt_0(s, len, key_string); - if (!disable_obscure) { - int ret = de_obscure(s, len); - if (ret != 0) { - mylog(log_debug, "de_obscure fail\n"); - return ret; - } - } - int ret = rm_crc32(s, len); - if (ret != 0) { - mylog(log_debug, "rm_crc32 fail\n"); - return ret; - } - return 0; -} -int rm_crc32(char *s, int &len) { - if (disable_checksum) return 0; - assert(len >= 0); - len -= sizeof(u32_t); - if (len < 0) return -1; - u32_t crc32_in = read_u32(s + len); - u32_t crc32 = (u32_t)crc32_fast(s, len); - if (crc32 != crc32_in) return -1; - return 0; -} /* int do_obs() { @@ -324,6 +255,14 @@ int put_conv(u32_t conv, const char *input, int len_in, char *&output, int &len_ return 0; } +int put_conv_inplace(u32_t conv, char *buf, int data_len, int &len_out) { + /* buf must have data at buf+sizeof(u32_t) with sizeof(u32_t) bytes of headroom. + * Writes conv header at buf[0..3], total len = data_len + 4. 
*/ + u32_t n_conv = htonl(conv); + memcpy(buf, &n_conv, sizeof(n_conv)); + len_out = data_len + (int)sizeof(n_conv); + return 0; +} int get_conv(u32_t &conv, const char *input, int len_in, char *&output, int &len_out) { u32_t n_conv; memcpy(&n_conv, input, sizeof(n_conv)); diff --git a/packet.h b/packet.h index b8d2582..1cdd327 100644 --- a/packet.h +++ b/packet.h @@ -10,37 +10,29 @@ #include "common.h" #include "fd_manager.h" +#include "packet_cook.h" -extern int iv_min; -extern int iv_max; //< 256; +extern cook_ctx_t cook_ctx; extern u64_t packet_send_count; extern u64_t dup_packet_send_count; extern u64_t packet_recv_count; extern u64_t dup_packet_recv_count; -extern char key_string[1000]; extern int disable_replay_filter; extern int random_drop; -extern int disable_obscure; -extern int disable_xor; int my_send(const dest_t &dest, char *data, int len); +int my_send_batch(const dest_t &dest, char **data_arr, int *len_arr, int count); -void encrypt_0(char *input, int &len, char *key); -void decrypt_0(char *input, int &len, char *key); int add_seq(char *data, int &data_len); int remove_seq(char *data, int &data_len); -int do_obscure(const char *input, int in_len, char *output, int &out_len); -int de_obscure(const char *input, int in_len, char *output, int &out_len); -// int sendto_fd_u64 (int fd,u64_t u64,char * buf, int len,int flags); int sendto_ip_port(u32_t ip, int port, char *buf, int len, int flags); int send_fd(int fd, char *buf, int len, int flags); int put_conv(u32_t conv, const char *input, int len_in, char *&output, int &len_out); +int put_conv_inplace(u32_t conv, char *buf, int data_len, int &len_out); int get_conv(u32_t &conv, const char *input, int len_in, char *&output, int &len_out); -int put_crc32(char *s, int &len); -int rm_crc32(char *s, int &len); -int do_cook(char *data, int &len); -int de_cook(char *s, int &len); +int put_conv0(u32_t conv, const char *input, int len_in, char *&output, int &len_out); +int get_conv0(u32_t &conv, const char 
*input, int len_in, char *&output, int &len_out);
 #endif /* PACKET_H_ */
diff --git a/packet_cook.cpp b/packet_cook.cpp
new file mode 100644
index 0000000..be8017d
--- /dev/null
+++ b/packet_cook.cpp
@@ -0,0 +1,459 @@
+#include "packet_cook.h"
+#include "crc32c.h"
+#include <assert.h> /* assert */
+#include <stdint.h> /* uint8_t, uint32_t, uintptr_t */
+#include <string.h> /* memcpy, strlen */
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <emmintrin.h> /* SSE2 — baseline on all x86_64 */
+#include <immintrin.h> /* AVX2 — for xor_tile_avx2 (guarded by target attr) */
+#define COOK_VEC_WIDTH 16
+#elif defined(__aarch64__)
+#include <arm_neon.h>
+#define COOK_VEC_WIDTH 16
+#elif defined(HAVE_PPC_SPE)
+#define COOK_VEC_WIDTH 8
+#else
+#define COOK_VEC_WIDTH ((int)sizeof(unsigned long))
+#endif
+
+#ifdef HAVE_PPC_SPE
+extern "C" void xor_tile_spe(char *data, int len, const char *tile, int tile_len);
+#endif
+
+/* Provided by common.cpp in production, stubs in bench */
+extern "C++" void get_fake_random_chars(char *s, int len);
+extern "C++" int random_between(uint32_t a, uint32_t b);
+
+static const int cook_buf_len = 3800; /* matches common.h buf_len */
+
+/* Store l at p[0..3] in big-endian byte order (most significant byte first). */
+static void
+cook_write_u32(char *p, uint32_t l)
+{
+    *(unsigned char *)(p + 3) = (unsigned char)((l >> 0) & 0xff);
+    *(unsigned char *)(p + 2) = (unsigned char)((l >> 8) & 0xff);
+    *(unsigned char *)(p + 1) = (unsigned char)((l >> 16) & 0xff);
+    *(unsigned char *)(p + 0) = (unsigned char)((l >> 24) & 0xff);
+}
+
+/* Load a big-endian 32-bit value from p[0..3]; inverse of cook_write_u32.
+ * The buffer is only read, so it is taken as const. */
+static uint32_t
+cook_read_u32(const char *p)
+{
+    uint32_t res;
+    res = *(const unsigned char *)(p + 0);
+    res = *(const unsigned char *)(p + 1) + (res << 8);
+    res = *(const unsigned char *)(p + 2) + (res << 8);
+    res = *(const unsigned char *)(p + 3) + (res << 8);
+    return res;
+}
+
+/* --- SIMD repeating-pattern XOR ----------------------------------------- */
+
+/* Greatest common divisor (Euclid); cook_gcd(a, 0) == a. */
+static int
+cook_gcd(int a, int b)
+{
+    while (b) { int t = b; b = a % b; a = t; }
+    return a;
+}
+
+/* Least common multiple. Both arguments must be positive: lcm(0, 0) would
+ * divide by zero. Dividing before multiplying keeps the intermediate small. */
+static int
+cook_lcm(int a, int b)
+{
+    return a / cook_gcd(a, b) * b;
+}
+
+/*
+ * Fill tile[0..tile_len-1] with pat[0..pat_len-1] repeating.
+ * tile_len must be a multiple of pat_len.
+ * A non-positive tile_len or pat_len is a no-op (an empty pattern can never
+ * fill the tile and would otherwise spin forever); a pattern longer than the
+ * tile is truncated to tile_len so the first copy cannot overrun tile.
+ */
+static void
+expand_tile(char *tile, int tile_len, const char *pat, int pat_len)
+{
+    if (tile_len <= 0 || pat_len <= 0) return;
+    if (pat_len > tile_len) pat_len = tile_len;
+    memcpy(tile, pat, pat_len);
+    int filled = pat_len;
+    /* Double the filled prefix each round: O(log(tile_len/pat_len)) copies. */
+    while (filled < tile_len) {
+        int chunk = tile_len - filled;
+        if (chunk > filled) chunk = filled;
+        memcpy(tile + filled, tile, chunk);
+        filled += chunk;
+    }
+}
+
+/*
+ * XOR data[0..len-1] with tile[0..tile_len-1] repeating.
+ * tile_len MUST be a multiple of COOK_VEC_WIDTH.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+__attribute__((target("avx2")))
+static void
+xor_tile_avx2(char *data, int len, const char *tile, int tile_len)
+{
+    int t = 0, i = 0;
+    if (tile_len == 16) {
+        /* Common case: broadcast 16-byte tile to 256-bit, no wrap logic */
+        __m256i tile256 = _mm256_broadcastsi128_si256(
+            _mm_loadu_si128((const __m128i *)tile));
+        for (; i + 32 <= len; i += 32) {
+            __m256i d = _mm256_loadu_si256((const __m256i *)(data + i));
+            _mm256_storeu_si256((__m256i *)(data + i),
+                                _mm256_xor_si256(d, tile256));
+        }
+        /* t stays 0: i is multiple of 32, tile_len=16, so (i % 16) == 0 */
+    } else {
+        /* General case: tile_len is a multiple of 16 */
+        for (; i + 32 <= len; i += 32) {
+            __m128i k1 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16;
+            if (t >= tile_len) t -= tile_len;
+            __m128i k2 = _mm_loadu_si128((const __m128i *)(tile + t));
+            t += 16;
+            if (t >= tile_len) t -= tile_len;
+            __m256i key = _mm256_set_m128i(k2, k1);
+            __m256i d = _mm256_loadu_si256((const __m256i *)(data + i));
+            _mm256_storeu_si256((__m256i *)(data + i),
+                                _mm256_xor_si256(d, key));
+        }
+    }
+    /* SSE2 tail */
+    for (; i + 16 <= len; i += 16) {
+        __m128i d = _mm_loadu_si128((const __m128i *)(data + i));
+        __m128i k = _mm_loadu_si128((const __m128i *)(tile + t));
+        _mm_storeu_si128((__m128i *)(data + i), _mm_xor_si128(d, k));
+        t += 16;
+        if (t >= tile_len) t = 0;
+    }
+    /* scalar tail */
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+}
+
+__attribute__((target("avx512bw"))) +static void +xor_tile_avx512(char *data, int len, const char *tile, int tile_len) +{ + int t = 0, i = 0; + if (tile_len == 16) { + /* Common case: broadcast 16-byte tile to 512-bit, no wrap logic */ + __m512i tile512 = _mm512_broadcast_i32x4( + _mm_loadu_si128((const __m128i *)tile)); + for (; i + 64 <= len; i += 64) { + __m512i d = _mm512_loadu_si512(data + i); + _mm512_storeu_si512(data + i, _mm512_xor_si512(d, tile512)); + } + /* t stays 0: i is multiple of 64, tile_len=16, so (i % 16) == 0 */ + } else { + /* General case: tile_len is a multiple of 16 */ + for (; i + 64 <= len; i += 64) { + __m128i k1 = _mm_loadu_si128((const __m128i *)(tile + t)); + t += 16; if (t >= tile_len) t -= tile_len; + __m128i k2 = _mm_loadu_si128((const __m128i *)(tile + t)); + t += 16; if (t >= tile_len) t -= tile_len; + __m128i k3 = _mm_loadu_si128((const __m128i *)(tile + t)); + t += 16; if (t >= tile_len) t -= tile_len; + __m128i k4 = _mm_loadu_si128((const __m128i *)(tile + t)); + t += 16; if (t >= tile_len) t -= tile_len; + __m512i key = _mm512_castsi128_si512(k1); + key = _mm512_inserti32x4(key, k2, 1); + key = _mm512_inserti32x4(key, k3, 2); + key = _mm512_inserti32x4(key, k4, 3); + __m512i d = _mm512_loadu_si512(data + i); + _mm512_storeu_si512(data + i, _mm512_xor_si512(d, key)); + } + } + /* AVX2 tail */ + if (i + 32 <= len) { + __m128i k1 = _mm_loadu_si128((const __m128i *)(tile + t)); + t += 16; if (t >= tile_len) t -= tile_len; + __m128i k2 = _mm_loadu_si128((const __m128i *)(tile + t)); + t += 16; if (t >= tile_len) t -= tile_len; + __m256i key = _mm256_set_m128i(k2, k1); + __m256i d = _mm256_loadu_si256((const __m256i *)(data + i)); + _mm256_storeu_si256((__m256i *)(data + i), + _mm256_xor_si256(d, key)); + i += 32; + } + /* SSE2 tail */ + if (i + 16 <= len) { + __m128i d = _mm_loadu_si128((const __m128i *)(data + i)); + __m128i k = _mm_loadu_si128((const __m128i *)(tile + t)); + _mm_storeu_si128((__m128i *)(data + i), 
_mm_xor_si128(d, k));
+        t += 16; if (t >= tile_len) t -= tile_len;
+        i += 16;
+    }
+    /* scalar tail */
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+}
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+/* Runtime SIMD tier: 0=SSE2, 1=AVX2, 2=AVX-512BW */
+static int xor_simd_tier = -1;
+#endif
+
+/*
+ * XOR data[0..len-1] in place with tile[0..tile_len-1] repeating, using the
+ * widest vector path available on the build target. tile_len must be a
+ * multiple of COOK_VEC_WIDTH so that vector loads never straddle the wrap.
+ * On x86_64 the SIMD tier is probed once, on first call, and cached in
+ * xor_simd_tier.
+ */
+static void
+xor_tile(char *data, int len, const char *tile, int tile_len)
+{
+#if defined(__x86_64__) || defined(_M_X64)
+    if (xor_simd_tier < 0) {
+        unsigned int eax, ebx, ecx, edx;
+        int tier = 0;
+        /* Leaf 0: EAX = highest basic leaf. Leaf 7 must exist before its
+         * feature bits can be trusted. */
+        __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(0), "c"(0));
+        unsigned int max_leaf = eax;
+        /* Leaf 1: ECX bit 27 = OSXSAVE (XGETBV usable), bit 28 = AVX */
+        __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(1), "c"(0));
+        if (max_leaf >= 7 && (ecx & (1u << 27)) && (ecx & (1u << 28))) {
+            unsigned int xcr0;
+            __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "edx");
+            __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(7), "c"(0));
+            /* AVX2: CPUID leaf 7 EBX bit 5, and the OS must save XMM+YMM
+             * state (XCR0 bits 1,2) or YMM instructions fault. */
+            if ((xcr0 & 0x6) == 0x6 && ((ebx >> 5) & 1)) {
+                tier = 1;
+                /* AVX-512BW: leaf 7 EBX bit 30, plus opmask+ZMM state */
+                if ((xcr0 & 0xE6) == 0xE6 && ((ebx >> 30) & 1))
+                    tier = 2;
+            }
+        }
+        /* Publish the final tier in a single store. */
+        xor_simd_tier = tier;
+    }
+    if (xor_simd_tier >= 2) {
+        xor_tile_avx512(data, len, tile, tile_len);
+        return;
+    }
+    if (xor_simd_tier >= 1) {
+        xor_tile_avx2(data, len, tile, tile_len);
+        return;
+    }
+    int t = 0, i = 0;
+    for (; i + 16 <= len; i += 16) {
+        __m128i d = _mm_loadu_si128((const __m128i *)(data + i));
+        __m128i k = _mm_loadu_si128((const __m128i *)(tile + t));
+        _mm_storeu_si128((__m128i *)(data + i), _mm_xor_si128(d, k));
+        t += 16;
+        if (t >= tile_len) t = 0;
+    }
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        t++;
+        if (t >= tile_len) t = 0;
+    }
+#elif defined(__aarch64__)
+    int t = 0, i = 0;
+    for (; i + 32 <= len; i += 32) {
+        uint8x16_t d1 = vld1q_u8((const uint8_t *)(data + i));
+        uint8x16_t k1 = vld1q_u8((const uint8_t *)(tile + t));
+        t += 16; if (t >= tile_len) t = 0;
+        uint8x16_t d2 = vld1q_u8((const uint8_t *)(data + i + 16));
+        uint8x16_t k2 = vld1q_u8((const uint8_t *)(tile + t));
+        t += 16; if (t >= tile_len) t = 0;
+        vst1q_u8((uint8_t *)(data + i), veorq_u8(d1, k1));
+        vst1q_u8((uint8_t *)(data + i + 16), veorq_u8(d2, k2));
+    }
+    for (; i + 16 <= len; i += 16) {
+        uint8x16_t d = vld1q_u8((const uint8_t *)(data + i));
+        uint8x16_t k = vld1q_u8((const uint8_t *)(tile + t));
+        vst1q_u8((uint8_t *)(data + i), veorq_u8(d, k));
+        t += 16;
+        if (t >= tile_len) t = 0;
+    }
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        t++;
+        if (t >= tile_len) t = 0;
+    }
+#elif defined(HAVE_PPC_SPE)
+    int t = 0, i = 0;
+    /* Scalar head: align data pointer to 8 bytes for evldd */
+    int head = (8 - ((uintptr_t)data & 7)) & 7;
+    if (head > len) head = len;
+    for (; i < head; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+    int remaining = len - i;
+    if (remaining >= 8 && t == 0) {
+        /* Data aligned and tile offset 0 — call SPE directly */
+        xor_tile_spe(data + i, remaining, tile, tile_len);
+    } else if (remaining >= 8 && tile_len <= 512) {
+        /* Tile offset not 0 after head — rotate tile so SPE sees offset=0 */
+        char rtile[512 + 8];
+        memcpy(rtile, tile + t, tile_len - t);
+        memcpy(rtile + (tile_len - t), tile, t);
+        memcpy(rtile + tile_len, rtile, 8); /* SPE evldd padding */
+        xor_tile_spe(data + i, remaining, rtile, tile_len);
+    } else {
+        /* Too short for SPE, or the tile is too large to rotate into the
+         * stack buffer (key tiles can reach lcm(key_len, 8) bytes, well
+         * above 512) — scalar tail from the current tile offset */
+        for (; i < len; i++) {
+            data[i] ^= tile[t];
+            if (++t >= tile_len) t = 0;
+        }
+    }
+#else
+    /* Word-width XOR for generic platforms (MIPS, RISC-V, PPC, i486, ARMv7).
+     * sizeof(unsigned long) = 4 on 32-bit, 8 on 64-bit. */
+    int t = 0, i = 0;
+    for (; i + COOK_VEC_WIDTH <= len; i += COOK_VEC_WIDTH) {
+        unsigned long d, k;
+        memcpy(&d, data + i, sizeof(d));
+        memcpy(&k, tile + t, sizeof(k));
+        d ^= k;
+        memcpy(data + i, &d, sizeof(d));
+        t += COOK_VEC_WIDTH;
+        if (t >= tile_len) t = 0;
+    }
+    for (; i < len; i++) {
+        data[i] ^= tile[t];
+        if (++t >= tile_len) t = 0;
+    }
+#endif
+}
+
+/*
+ * Expand pattern into a SIMD-aligned tile on the stack and XOR.
+ * Used for obscure IV (4-32 bytes, changes per packet).
+ * Patterns whose tile would not fit the 512-byte stack buffer are XORed
+ * byte by byte instead; the result is identical, only slower.
+ */
+static void
+xor_with_pattern(char *data, int len, const char *pat, int pat_len)
+{
+    if (pat_len <= 0 || len <= 0) return;
+    int tile_len = cook_lcm(pat_len, COOK_VEC_WIDTH);
+    if (tile_len > 512) {
+        /* pat_len can come from a received packet (bounded only by the
+         * configured IV range), so an oversized tile must not abort or
+         * overflow the stack buffer. */
+        int p = 0;
+        for (int i = 0; i < len; i++) {
+            data[i] ^= pat[p];
+            if (++p >= pat_len) p = 0;
+        }
+        return;
+    }
+    /* Extra COOK_VEC_WIDTH bytes: when SPE evldd reads 8 bytes at a
+     * non-zero tile offset, the load may straddle the tile boundary.
+     * Padding with a copy of the tile start makes this safe. */
+    char tile[512 + COOK_VEC_WIDTH];
+    expand_tile(tile, tile_len, pat, pat_len);
+    memcpy(tile + tile_len, tile, COOK_VEC_WIDTH);
+    xor_tile(data, len, tile, tile_len);
+}
+
+/* --- Key preparation ---------------------------------------------------- */
+
+/*
+ * Cache strlen(ctx->key) in key_len and pre-expand the key into key_tile,
+ * with COOK_VEC_WIDTH bytes of wrap-around padding after it. An empty key
+ * sets key_tile_len to 0, which makes encrypt_0 a no-op.
+ */
+void
+cook_ctx_prepare_key(cook_ctx_t *ctx)
+{
+    ctx->key_len = (int)strlen(ctx->key);
+    if (ctx->key_len == 0) {
+        ctx->key_tile_len = 0;
+        return;
+    }
+    ctx->key_tile_len = cook_lcm(ctx->key_len, COOK_VEC_WIDTH);
+    assert(ctx->key_tile_len + COOK_VEC_WIDTH <= (int)sizeof(ctx->key_tile));
+    expand_tile(ctx->key_tile, ctx->key_tile_len, ctx->key, ctx->key_len);
+    memcpy(ctx->key_tile + ctx->key_tile_len, ctx->key_tile, COOK_VEC_WIDTH);
+}
+
+/* --- Cook operations ---------------------------------------------------- */
+
+/* XOR input[0..len-1] with the prepared key tile; de_cook calls the same
+ * routine to decrypt. */
+static void
+encrypt_0(cook_ctx_t *ctx, char *input, int len)
+{
+    if (ctx->key_tile_len == 0) return;
+    xor_tile(input, len, ctx->key_tile, ctx->key_tile_len);
+}
+
+static int
+do_obscure(cook_ctx_t *ctx, char *data, int &len)
+{
+    assert(len >= 0);
+    assert(len < cook_buf_len);
+
+    int iv_len = 
random_between(ctx->iv_min, ctx->iv_max); + get_fake_random_chars(data + len, iv_len); + data[iv_len + len] = (uint8_t)iv_len; + xor_with_pattern(data, len, data + len, iv_len); + + len = len + iv_len + 1; + return 0; +} + +static int +de_obscure(cook_ctx_t *ctx, char *data, int &len) +{ + if (len < 1) return -1; + int iv_len = int((uint8_t)data[len - 1]); + + if (iv_len < ctx->iv_min || iv_len > ctx->iv_max) return -1; + if (len < 1 + iv_len) return -1; + + len = len - 1 - iv_len; + xor_with_pattern(data, len, data + len, iv_len); + + return 0; +} + +static int +put_crc32(cook_ctx_t *ctx, char *s, int &len) +{ + if (ctx->disable_checksum) return 0; + assert(len >= 0); + uint32_t crc = (uint32_t)crc32c(s, len); + cook_write_u32(s + len, crc); + len += (int)sizeof(uint32_t); + return 0; +} + +static int +rm_crc32(cook_ctx_t *ctx, char *s, int &len) +{ + if (ctx->disable_checksum) return 0; + assert(len >= 0); + len -= (int)sizeof(uint32_t); + if (len < 0) return -1; + uint32_t crc_in = cook_read_u32(s + len); + uint32_t crc = (uint32_t)crc32c(s, len); + if (crc != crc_in) return -1; + return 0; +} + +int +do_cook(cook_ctx_t *ctx, char *data, int &len) +{ + put_crc32(ctx, data, len); + if (!ctx->disable_obscure) do_obscure(ctx, data, len); + if (!ctx->disable_xor) encrypt_0(ctx, data, len); + return 0; +} + +int +de_cook(cook_ctx_t *ctx, char *data, int &len) +{ + if (!ctx->disable_xor) encrypt_0(ctx, data, len); + if (!ctx->disable_obscure) { + if (de_obscure(ctx, data, len) != 0) + return -1; + } + if (rm_crc32(ctx, data, len) != 0) + return -2; + return 0; +} + +#ifdef BENCH_EXPOSE_INTERNALS +void bench_xor_tile(char *data, int len, const char *tile, int tile_len) { + xor_tile(data, len, tile, tile_len); +} +int bench_cook_vec_width() { + return COOK_VEC_WIDTH; +} +const char *bench_xor_tile_impl() { + /* Trigger detection if not yet run */ + char dummy[16] = {}, tile[16] = {}; + xor_tile(dummy, 1, tile, 16); +#if defined(__x86_64__) || defined(_M_X64) + if 
(xor_simd_tier >= 2) return "avx512bw"; + if (xor_simd_tier >= 1) return "avx2"; + return "sse2"; +#elif defined(__aarch64__) + return "neon"; +#elif defined(HAVE_PPC_SPE) + return "spe"; +#else + return "scalar"; +#endif +} +#endif diff --git a/packet_cook.h b/packet_cook.h new file mode 100644 index 0000000..6a3a328 --- /dev/null +++ b/packet_cook.h @@ -0,0 +1,20 @@ +#ifndef PACKET_COOK_H_ +#define PACKET_COOK_H_ + +struct cook_ctx_t { + char key[1000]; + int key_len; /* cached strlen(key), set by cook_ctx_prepare_key */ + int key_tile_len; /* lcm(key_len, vec_width), 0 if no key */ + char key_tile[16000]; /* key repeated to tile_len for SIMD XOR */ + int iv_min; + int iv_max; + int disable_checksum; + int disable_obscure; + int disable_xor; +}; + +void cook_ctx_prepare_key(cook_ctx_t *ctx); +int do_cook(cook_ctx_t *ctx, char *data, int &len); +int de_cook(cook_ctx_t *ctx, char *data, int &len); + +#endif /* PACKET_COOK_H_ */ diff --git a/tunnel_client.cpp b/tunnel_client.cpp index 83ccf05..477ef4d 100644 --- a/tunnel_client.cpp +++ b/tunnel_client.cpp @@ -1,11 +1,9 @@ #include "tunnel.h" +#include "io_uring_recv.h" -void data_from_local_or_fec_timeout(conn_info_t &conn_info, int is_time_out) { +static void client_process_local_packet(conn_info_t &conn_info, char *data, int data_len, + struct sockaddr *src_addr, socklen_t src_addr_len) { fd64_t &remote_fd64 = conn_info.remote_fd64; - int &local_listen_fd = conn_info.local_listen_fd; - - char data[buf_len]; - int data_len; address_t addr; u32_t conv; int out_n; @@ -17,126 +15,62 @@ void data_from_local_or_fec_timeout(conn_info_t &conn_info, int is_time_out) { dest.inner.fd64 = remote_fd64; dest.cook = 1; - if (is_time_out) { - // fd64_t fd64=events[idx].data.u64; - mylog(log_trace, "events[idx].data.u64 == conn_info.fec_encode_manager.get_timer_fd64()\n"); - - // uint64_t value; - // if(!fd_manager.exist(fd64)) //fd64 has been closed - //{ - // mylog(log_trace,"!fd_manager.exist(fd64)"); - // continue; - // } - // 
if((ret=read(fd_manager.to_fd(fd64), &value, 8))!=8) - //{ - // mylog(log_trace,"(ret=read(fd_manager.to_fd(fd64), &value, 8))!=8,ret=%d\n",ret); - // continue; - // } - // if(value==0) - //{ - // mylog(log_debug,"value==0\n"); - // continue; - // } - // assert(value==1); - from_normal_to_fec(conn_info, 0, 0, out_n, out_arr, out_len, out_delay); - } else // events[idx].data.u64 == (u64_t)local_listen_fd - { - mylog(log_trace, "events[idx].data.u64 == (u64_t)local_listen_fd\n"); - address_t::storage_t udp_new_addr_in = {0}; - socklen_t udp_new_addr_len = sizeof(address_t::storage_t); - if ((data_len = recvfrom(local_listen_fd, data, max_data_len + 1, 0, - (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len)) == -1) { - mylog(log_debug, "recv_from error,this shouldnt happen,err=%s,but we can try to continue\n", get_sock_error()); - return; - }; - - if (data_len == max_data_len + 1) { - mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len); - return; - } - - if (!disable_mtu_warn && data_len >= mtu_warn) { - mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn); - } + if (data_len == max_data_len + 1) { + mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len); + return; + } - addr.from_sockaddr((struct sockaddr *)&udp_new_addr_in, udp_new_addr_len); + if (!disable_mtu_warn && data_len >= mtu_warn) { + mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn); + } - mylog(log_trace, "Received packet from %s, len: %d\n", addr.get_str(), data_len); + addr.from_sockaddr(src_addr, src_addr_len); - // u64_t u64=ip_port.to_u64(); + mylog(log_trace, "Received packet from %s, len: %d\n", addr.get_str(), data_len); - if (!conn_info.conv_manager.c.is_data_used(addr)) { - if 
(conn_info.conv_manager.c.get_size() >= max_conv_num) { - mylog(log_warn, "ignored new udp connect bc max_conv_num exceed\n"); - return; - } - conv = conn_info.conv_manager.c.get_new_conv(); - conn_info.conv_manager.c.insert_conv(conv, addr); - mylog(log_info, "new packet from %s,conv_id=%x\n", addr.get_str(), conv); - } else { - conv = conn_info.conv_manager.c.find_conv_by_data(addr); - mylog(log_trace, "conv=%d\n", conv); + if (!conn_info.conv_manager.c.is_data_used(addr)) { + if (conn_info.conv_manager.c.get_size() >= max_conv_num) { + mylog(log_warn, "ignored new udp connect bc max_conv_num exceed\n"); + return; } - conn_info.conv_manager.c.update_active_time(conv); - char *new_data; - int new_len; - put_conv(conv, data, data_len, new_data, new_len); - - mylog(log_trace, "data_len=%d new_len=%d\n", data_len, new_len); - from_normal_to_fec(conn_info, new_data, new_len, out_n, out_arr, out_len, out_delay); + conv = conn_info.conv_manager.c.get_new_conv(); + conn_info.conv_manager.c.insert_conv(conv, addr); + mylog(log_info, "new packet from %s,conv_id=%x\n", addr.get_str(), conv); + } else { + conv = conn_info.conv_manager.c.find_conv_by_data(addr); + mylog(log_trace, "conv=%d\n", conv); } - mylog(log_trace, "out_n=%d\n", out_n); - for (int i = 0; i < out_n; i++) { - delay_send(out_delay[i], dest, out_arr[i], out_len[i]); - } -} -static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { - assert(!(revents & EV_ERROR)); + conn_info.conv_manager.c.update_active_time(conv); + int new_len; + put_conv_inplace(conv, data, data_len, new_len); - conn_info_t &conn_info = *((conn_info_t *)watcher->data); + mylog(log_trace, "data_len=%d new_len=%d\n", data_len, new_len); + from_normal_to_fec(conn_info, data, new_len, out_n, out_arr, out_len, out_delay); - data_from_local_or_fec_timeout(conn_info, 0); + mylog(log_trace, "out_n=%d\n", out_n); + delay_send_batch(out_n, out_delay, dest, out_arr, out_len); } -static void remote_cb(struct ev_loop 
*loop, struct ev_io *watcher, int revents) { - assert(!(revents & EV_ERROR)); - - conn_info_t &conn_info = *((conn_info_t *)watcher->data); - - char data[buf_len]; - if (!fd_manager.exist(watcher->u64)) // fd64 has been closed - { - mylog(log_trace, "!fd_manager.exist(events[idx].data.u64)"); - return; - } - fd64_t &remote_fd64 = conn_info.remote_fd64; - int &remote_fd = conn_info.remote_fd; - - assert(watcher->u64 == remote_fd64); - - int fd = fd_manager.to_fd(remote_fd64); - - int data_len = recv(fd, data, max_data_len + 1, 0); - +static void client_process_remote_packet(conn_info_t &conn_info, char *data, int data_len) { if (data_len == max_data_len + 1) { mylog(log_warn, "huge packet, data_len > %d, packet truncated, dropped\n", max_data_len); return; } - mylog(log_trace, "received data from udp fd %d, len=%d\n", remote_fd, data_len); + mylog(log_trace, "received data from remote, len=%d\n", data_len); if (data_len < 0) { if (get_sock_errno() == ECONNREFUSED) { - mylog(log_debug, "recv failed %d ,udp_fd%d,errno:%s\n", data_len, remote_fd, get_sock_error()); + mylog(log_debug, "recv failed %d ,errno:%s\n", data_len, get_sock_error()); } - mylog(log_warn, "recv failed %d ,udp_fd%d,errno:%s\n", data_len, remote_fd, get_sock_error()); + mylog(log_warn, "recv failed %d ,errno:%s\n", data_len, get_sock_error()); return; } if (!disable_mtu_warn && data_len > mtu_warn) { mylog(log_warn, "huge packet,data len=%d (>%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn); } - if (de_cook(data, data_len) != 0) { + if (de_cook(&cook_ctx, data, data_len) != 0) { mylog(log_debug, "de_cook error"); return; } @@ -174,6 +108,66 @@ static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) } } +void data_from_local_or_fec_timeout(conn_info_t &conn_info, int is_time_out) { + fd64_t &remote_fd64 = conn_info.remote_fd64; + int &local_listen_fd = conn_info.local_listen_fd; + int out_n; + char **out_arr; + int 
*out_len; + my_time_t *out_delay; + dest_t dest; + dest.type = type_fd64; + dest.inner.fd64 = remote_fd64; + dest.cook = 1; + + if (is_time_out) { + mylog(log_trace, "events[idx].data.u64 == conn_info.fec_encode_manager.get_timer_fd64()\n"); + from_normal_to_fec(conn_info, 0, 0, out_n, out_arr, out_len, out_delay); + mylog(log_trace, "out_n=%d\n", out_n); + delay_send_batch(out_n, out_delay, dest, out_arr, out_len); + } else { + /* Single-packet path (fallback) */ + char data[buf_len]; + int data_len; + address_t::storage_t udp_new_addr_in = {0}; + socklen_t udp_new_addr_len = sizeof(address_t::storage_t); + if ((data_len = recvfrom(local_listen_fd, data + sizeof(u32_t), max_data_len + 1, 0, + (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len)) == -1) { + mylog(log_debug, "recv_from error,this shouldnt happen,err=%s,but we can try to continue\n", get_sock_error()); + return; + }; + client_process_local_packet(conn_info, data, data_len, + (struct sockaddr *)&udp_new_addr_in, udp_new_addr_len); + } +} +static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { + assert(!(revents & EV_ERROR)); + + conn_info_t &conn_info = *((conn_info_t *)watcher->data); + + data_from_local_or_fec_timeout(conn_info, 0); +} + +static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { + assert(!(revents & EV_ERROR)); + + conn_info_t &conn_info = *((conn_info_t *)watcher->data); + + if (!fd_manager.exist(watcher->u64)) // fd64 has been closed + { + mylog(log_trace, "!fd_manager.exist(events[idx].data.u64)"); + return; + } + fd64_t &remote_fd64 = conn_info.remote_fd64; + assert(watcher->u64 == remote_fd64); + + int fd = fd_manager.to_fd(remote_fd64); + + char data[buf_len]; + int data_len = recv(fd, data, max_data_len + 1, 0); + client_process_remote_packet(conn_info, data, data_len); +} + static void fifo_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { assert(!(revents & EV_ERROR)); int fifo_fd = watcher->fd; @@ 
-229,18 +223,98 @@ static void conn_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int re dest.inner.fd64 = conn_info.remote_fd64; dest.cook = 1; from_normal_to_fec(conn_info, 0, 0, out_n, out_arr, out_len, out_delay); - for (int i = 0; i < out_n; i++) { - delay_send(out_delay[i], dest, out_arr[i], out_len[i]); - } + delay_send_batch(out_n, out_delay, dest, out_arr, out_len); } } +#ifdef __linux__ +static uring_ctx_t client_uring_ctx; +static conn_info_t *client_uring_conn_info; +static void client_uring_drain(struct ev_loop *loop); +#endif + static void prepare_cb(struct ev_loop *loop, struct ev_prepare *watcher, int revents) { assert(!(revents & EV_ERROR)); delay_manager.check(); } + +#ifdef __linux__ + +static void client_uring_drain(struct ev_loop *loop) { + conn_info_t &conn_info = *client_uring_conn_info; + uring_ctx_t *ctx = &client_uring_ctx; + + for (;;) { + unsigned ready = uring_cq_ready(ctx); + if (ready == 0) + break; + + int need_submit = 0; + + for (unsigned i = 0; i < ready; i++) { + struct io_uring_cqe *cqe = uring_cqe_at(ctx, i); + uint8_t type = uring_tag_type(cqe->user_data); + int more = cqe->flags & IORING_CQE_F_MORE; + + if (cqe->res < 0) { + if (!more && cqe->res != -ECANCELED) { + if (type == URING_TAG_CLIENT_LOCAL) + uring_add_multishot_recvmsg(ctx, conn_info.local_listen_fd, cqe->user_data); + else if (type == URING_TAG_CLIENT_REMOTE) + uring_add_multishot_recv(ctx, fd_manager.to_fd(conn_info.remote_fd64), cqe->user_data); + need_submit = 1; + } + continue; + } + + if (type == URING_TAG_CLIENT_LOCAL) { + uring_recv_buf_t recv_buf; + if (uring_parse_recvmsg_cqe(ctx, cqe, &recv_buf) == 0) { + /* Zero-copy: recvmsg has 140+ bytes of headroom before payload; + use sizeof(u32_t) of it for in-place conv header insertion. */ + char *data = recv_buf.data - sizeof(u32_t); + int data_len = recv_buf.len < (int)(buf_len - sizeof(u32_t)) ? 
recv_buf.len : (int)(buf_len - sizeof(u32_t)); + client_process_local_packet(conn_info, data, data_len, + (struct sockaddr *)&recv_buf.addr, recv_buf.addr_len); + uring_recycle_buf(ctx, recv_buf.buf_id); + } + } else if (type == URING_TAG_CLIENT_REMOTE) { + uring_recv_buf_t recv_buf; + if (uring_parse_recv_cqe(ctx, cqe, &recv_buf) == 0) { + client_process_remote_packet(conn_info, recv_buf.data, recv_buf.len); + uring_recycle_buf(ctx, recv_buf.buf_id); + } + } + + if (!more) { + if (type == URING_TAG_CLIENT_LOCAL) + uring_add_multishot_recvmsg(ctx, conn_info.local_listen_fd, cqe->user_data); + else if (type == URING_TAG_CLIENT_REMOTE) + uring_add_multishot_recv(ctx, fd_manager.to_fd(conn_info.remote_fd64), cqe->user_data); + need_submit = 1; + } + } + + /* Single batched advance + buffer commit */ + uring_cq_advance(ctx, ready); + uring_buf_ring_commit(ctx); + + /* Submit any re-arms and flush deferred completions in one syscall */ + if (need_submit) + uring_submit_and_flush(ctx); + else + uring_flush(ctx); + } +} + +static void client_uring_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { + assert(!(revents & EV_ERROR)); + client_uring_drain(loop); +} +#endif + int tunnel_client_event_loop() { int i, j, k; int ret; @@ -268,19 +342,6 @@ int tunnel_client_event_loop() { conn_info.loop = loop; - // ev.events = EPOLLIN; - // ev.data.u64 = local_listen_fd; - // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, local_listen_fd, &ev); - // if (ret!=0) { - // mylog(log_fatal,"add udp_listen_fd error\n"); - // myexit(-1); - // } - struct ev_io local_listen_watcher; - local_listen_watcher.data = &conn_info; - - ev_io_init(&local_listen_watcher, local_listen_cb, local_listen_fd, EV_READ); - ev_io_start(loop, &local_listen_watcher); - int &remote_fd = conn_info.remote_fd; fd64_t &remote_fd64 = conn_info.remote_fd64; @@ -289,21 +350,37 @@ int tunnel_client_event_loop() { mylog(log_debug, "remote_fd64=%llu\n", remote_fd64); - // ev.events = EPOLLIN; - // ev.data.u64 = 
remote_fd64; + int use_uring = 0; +#ifdef __linux__ + if (uring_init(&client_uring_ctx, 64, 256, buf_len) == 0) { + g_uring_ctx = &client_uring_ctx; + client_uring_conn_info = &conn_info; + static struct ev_io uring_watcher; + ev_io_init(&uring_watcher, client_uring_cb, client_uring_ctx.ring_fd, EV_READ); + ev_io_start(loop, &uring_watcher); + + uring_add_multishot_recvmsg(&client_uring_ctx, local_listen_fd, + uring_tag(URING_TAG_CLIENT_LOCAL, 0)); + uring_add_multishot_recv(&client_uring_ctx, remote_fd, + uring_tag(URING_TAG_CLIENT_REMOTE, 0)); + uring_submit(&client_uring_ctx); + use_uring = 1; + mylog(log_info, "io_uring: active for client sockets\n"); + } +#endif - // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, remote_fd, &ev); - // if (ret!= 0) { - // mylog(log_fatal,"add raw_fd error\n"); - // myexit(-1); - // } + struct ev_io local_listen_watcher; + local_listen_watcher.data = &conn_info; + ev_io_init(&local_listen_watcher, local_listen_cb, local_listen_fd, EV_READ); + if (!use_uring) + ev_io_start(loop, &local_listen_watcher); struct ev_io remote_watcher; remote_watcher.data = &conn_info; remote_watcher.u64 = remote_fd64; - ev_io_init(&remote_watcher, remote_cb, remote_fd, EV_READ); - ev_io_start(loop, &remote_watcher); + if (!use_uring) + ev_io_start(loop, &remote_watcher); // ev.events = EPOLLIN; // ev.data.u64 = delay_manager.get_timer_fd(); diff --git a/tunnel_server.cpp b/tunnel_server.cpp index aaa82c7..79e2be7 100644 --- a/tunnel_server.cpp +++ b/tunnel_server.cpp @@ -6,6 +6,7 @@ */ #include "tunnel.h" +#include "io_uring_recv.h" static void conn_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int revents); static void fec_encode_cb(struct ev_loop *loop, struct ev_timer *watcher, int revents); @@ -15,12 +16,57 @@ enum tmp_mode_t { is_from_remote = 0, is_fec_timeout, is_conn_timer }; +static void server_process_remote_packet(conn_info_t &conn_info, fd64_t fd64, char *data, int data_len) { + /* Pre-condition: fd_manager.exist(fd64). The payload (data_len bytes) starts
at data + sizeof(u32_t); the leading sizeof(u32_t) bytes of data are headroom handed to put_conv_inplace(), presumably for the conv header (TODO confirm). data_len is the raw receive length: max_data_len + 1 (truncated) and negative (error) values are dropped below. */ + assert(conn_info.conv_manager.s.is_data_used(fd64)); + + u32_t conv = conn_info.conv_manager.s.find_conv_by_data(fd64); + conn_info.conv_manager.s.update_active_time(conv); + conn_info.update_active_time(); + + if (data_len == max_data_len + 1) { + mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len); + return; + } + + mylog(log_trace, "received a packet from udp_fd,len:%d,conv=%d\n", data_len, conv); + + if (data_len < 0) { + mylog(log_debug, "udp fd,recv_len<0 continue,%s\n", get_sock_error()); + return; + } + + if (!disable_mtu_warn && data_len >= mtu_warn) { + mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn); + } + + int new_len; + put_conv_inplace(conv, data, data_len, new_len); + + address_t &addr = conn_info.addr; + int &local_listen_fd = conn_info.local_listen_fd; + + int out_n; + char **out_arr; + int *out_len; + my_time_t *out_delay; + dest_t dest; + dest.inner.fd_addr.fd = local_listen_fd; + dest.inner.fd_addr.addr = addr; + dest.type = type_fd_addr; + dest.cook = 1; + + from_normal_to_fec(conn_info, data, new_len, out_n, out_arr, out_len, out_delay); + + mylog(log_trace, "out_n=%d\n", out_n); + delay_send_batch(out_n, out_delay, dest, out_arr, out_len); +} + void data_from_remote_or_fec_timeout_or_conn_timer(conn_info_t &conn_info, fd64_t fd64, tmp_mode_t mode) { int ret; char data[buf_len]; int data_len; - u32_t conv; // fd64_t fd64=events[idx].data.u64; // mylog(log_trace,"events[idx].data.u64 >u32_t(-1),%llu\n",(u64_t)events[idx].data.u64); @@ -77,81 +123,39 @@ void data_from_remote_or_fec_timeout_or_conn_timer(conn_info_t &conn_info, fd64_ return; } - // fd64_t &fd64 =conn_info.remote_fd64; - assert(conn_info.conv_manager.s.is_data_used(fd64)); - - conv = conn_info.conv_manager.s.find_conv_by_data(fd64); - conn_info.conv_manager.s.update_active_time(conv); -
conn_info.update_active_time(); - int fd = fd_manager.to_fd(fd64); - data_len = recv(fd, data, max_data_len + 1, 0); - - if (data_len == max_data_len + 1) { - mylog(log_warn, "huge packet from upper level, data_len > %d, packet truncated, dropped\n", max_data_len); - return; - } - - mylog(log_trace, "received a packet from udp_fd,len:%d,conv=%d\n", data_len, conv); - - if (data_len < 0) { - mylog(log_debug, "udp fd,recv_len<0 continue,%s\n", get_sock_error()); - - return; - } - - if (!disable_mtu_warn && data_len >= mtu_warn) { - mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn); - } - - char *new_data; - int new_len; - put_conv(conv, data, data_len, new_data, new_len); - - from_normal_to_fec(conn_info, new_data, new_len, out_n, out_arr, out_len, out_delay); + /* Receive with sizeof(u32_t) headroom for the in-place conv header; the recv() result (including a negative error value) is passed on unchanged and validated by server_process_remote_packet() */ + data_len = recv(fd, data + sizeof(u32_t), max_data_len + 1, 0); + server_process_remote_packet(conn_info, fd64, data, data_len); + return; } else { assert(0 == 1); } mylog(log_trace, "out_n=%d\n", out_n); - for (int i = 0; i < out_n; i++) { - delay_send(out_delay[i], dest, out_arr[i], out_len[i]); - } + delay_send_batch(out_n, out_delay, dest, out_arr, out_len); } -static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { - assert(!(revents & EV_ERROR)); - - int local_listen_fd = watcher->fd; +static void server_process_tunnel_packet(struct ev_loop *loop, int local_listen_fd, + char *data, int data_len, + struct sockaddr *src_addr, socklen_t src_addr_len) { int ret; - mylog(log_trace, "events[idx].data.u64 == (u64_t)local_listen_fd\n"); - char data[buf_len]; - int data_len; - address_t::storage_t udp_new_addr_in = {0}; - socklen_t udp_new_addr_len = sizeof(address_t::storage_t); - if ((data_len = recvfrom(local_listen_fd, data, max_data_len + 1, 0, - (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len)) ==
-1) { - mylog(log_error, "recv_from error,this shouldnt happen,err=%s,but we can try to continue\n", get_sock_error()); - return; - }; - if (data_len == max_data_len + 1) { mylog(log_warn, "huge packet, data_len > %d, packet truncated, dropped\n", max_data_len); return; } address_t addr; - addr.from_sockaddr((struct sockaddr *)&udp_new_addr_in, udp_new_addr_len); + addr.from_sockaddr(src_addr, src_addr_len); mylog(log_trace, "Received packet from %s,len: %d\n", addr.get_str(), data_len); - if (!disable_mtu_warn && data_len >= mtu_warn) ///////////////////////delete this for type 0 in furture - { + if (!disable_mtu_warn && data_len >= mtu_warn) { mylog(log_warn, "huge packet,data len=%d (>=%d).strongly suggested to set a smaller mtu at upper level,to get rid of this warn\n ", data_len, mtu_warn); } - if (de_cook(data, data_len) != 0) { + if (de_cook(&cook_ctx, data, data_len) != 0) { mylog(log_debug, "de_cook error"); return; } @@ -162,33 +166,16 @@ static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int rev return; } - // conn_manager.insert(addr); conn_info_t &conn_info = conn_manager.find_insert(addr); conn_info.addr = addr; conn_info.loop = ev_default_loop(0); conn_info.local_listen_fd = local_listen_fd; - // u64_t fec_fd64=conn_info.fec_encode_manager.get_timer_fd64(); - // mylog(log_debug,"fec_fd64=%llu\n",fec_fd64); - // ev.events = EPOLLIN; - // ev.data.u64 = fec_fd64; - // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd_manager.to_fd(fec_fd64), &ev); - - // fd_manager.get_info(fec_fd64).ip_port=ip_port; - conn_info.timer.data = &conn_info; ev_init(&conn_info.timer, conn_timer_cb); ev_timer_set(&conn_info.timer, 0, timer_interval / 1000.0); ev_timer_start(loop, &conn_info.timer); - // conn_info.timer.add_fd64_to_epoll(epoll_fd); - // conn_info.timer.set_timer_repeat_us(timer_interval*1000); - - // mylog(log_debug,"conn_info.timer.get_timer_fd64()=%llu\n",conn_info.timer.get_timer_fd64()); - - // u64_t
timer_fd64=conn_info.timer.get_timer_fd64(); - // fd_manager.get_info(timer_fd64).ip_port=ip_port; - conn_info.fec_encode_manager.set_data(&conn_info); conn_info.fec_encode_manager.set_loop_and_cb(loop, fec_encode_cb); @@ -228,20 +215,26 @@ static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int rev } fd64_t fd64 = fd_manager.create(new_udp_fd); - // ev.events = EPOLLIN; - // ev.data.u64 = fd64; - // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, new_udp_fd, &ev); conn_info.conv_manager.s.insert_conv(conv, fd64); fd_manager.get_info(fd64).addr = addr; - ev_io &io_watcher = fd_manager.get_info(fd64).io_watcher; - io_watcher.u64 = fd64; - io_watcher.data = &conn_info; +#ifdef __linux__ + if (g_uring_ctx && g_uring_ctx->available) { + uring_add_multishot_recv(g_uring_ctx, new_udp_fd, + uring_tag(URING_TAG_SERVER_REMOTE, fd64)); + uring_submit(g_uring_ctx); + } else +#endif + { + ev_io &io_watcher = fd_manager.get_info(fd64).io_watcher; + io_watcher.u64 = fd64; + io_watcher.data = &conn_info; - ev_init(&io_watcher, remote_cb); - ev_io_set(&io_watcher, new_udp_fd, EV_READ); - ev_io_start(conn_info.loop, &io_watcher); + ev_init(&io_watcher, remote_cb); + ev_io_set(&io_watcher, new_udp_fd, EV_READ); + ev_io_start(conn_info.loop, &io_watcher); + } mylog(log_info, "[%s]new conv %x,fd %d created,fd64=%llu\n", addr.get_str(), conv, new_udp_fd, fd64); } @@ -254,6 +247,26 @@ static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int rev } } +static void local_listen_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { + assert(!(revents & EV_ERROR)); + + int local_listen_fd = watcher->fd; + + char data[buf_len]; + int data_len; + address_t::storage_t udp_new_addr_in = {0}; + socklen_t udp_new_addr_len = sizeof(address_t::storage_t); + data_len = recvfrom(local_listen_fd, data, max_data_len + 1, 0, + (struct sockaddr *)&udp_new_addr_in, &udp_new_addr_len); + if (data_len < 0) { + mylog(log_error, "recv_from error,err=%s\n",
get_sock_error()); + return; + } + + server_process_tunnel_packet(loop, local_listen_fd, data, data_len, + (struct sockaddr *)&udp_new_addr_in, udp_new_addr_len); +} + static void remote_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { assert(!(revents & EV_ERROR)); @@ -304,12 +317,17 @@ static void conn_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int re data_from_remote_or_fec_timeout_or_conn_timer(conn_info, 0, is_conn_timer); } +static void server_uring_drain(struct ev_loop *loop); + static void prepare_cb(struct ev_loop *loop, struct ev_prepare *watcher, int revents) { assert(!(revents & EV_ERROR)); delay_manager.check(); } +#ifdef __linux__ +#endif + static void global_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int revents) { assert(!(revents & EV_ERROR)); @@ -319,6 +337,100 @@ static void global_timer_cb(struct ev_loop *loop, struct ev_timer *watcher, int mylog(log_trace, "events[idx].data.u64==(u64_t)timer.get_timer_fd()\n"); } +#ifdef __linux__ +static uring_ctx_t server_uring_ctx; +static int server_local_listen_fd; + +static void server_uring_drain(struct ev_loop *loop) { + uring_ctx_t *ctx = &server_uring_ctx; + int local_listen_fd = server_local_listen_fd; + + for (;;) { + unsigned ready = uring_cq_ready(ctx); + if (ready == 0) + break; + + int need_submit = 0; + + for (unsigned i = 0; i < ready; i++) { + struct io_uring_cqe *cqe = uring_cqe_at(ctx, i); + uint8_t type = uring_tag_type(cqe->user_data); + int more = cqe->flags & IORING_CQE_F_MORE; + + if (cqe->res < 0) { + if (!more && cqe->res != -ECANCELED) { + if (type == URING_TAG_SERVER_LOCAL) { + uring_add_multishot_recvmsg(ctx, local_listen_fd, cqe->user_data); + need_submit = 1; + } else if (type == URING_TAG_SERVER_REMOTE) { + fd64_t fd64 = (fd64_t)uring_tag_payload(cqe->user_data); + if (fd_manager.exist(fd64)) { + uring_add_multishot_recv(ctx, fd_manager.to_fd(fd64), cqe->user_data); + need_submit = 1; + } + } + } + continue; + } + + if (type ==
URING_TAG_SERVER_LOCAL) { + uring_recv_buf_t recv_buf; + if (uring_parse_recvmsg_cqe(ctx, cqe, &recv_buf) == 0) { + server_process_tunnel_packet(loop, local_listen_fd, recv_buf.data, recv_buf.len, + (struct sockaddr *)&recv_buf.addr, recv_buf.addr_len); + uring_recycle_buf(ctx, recv_buf.buf_id); + } + } else if (type == URING_TAG_SERVER_REMOTE) { + fd64_t fd64 = (fd64_t)uring_tag_payload(cqe->user_data); + uring_recv_buf_t recv_buf; + if (uring_parse_recv_cqe(ctx, cqe, &recv_buf) == 0) { + if (fd_manager.exist(fd64)) { + address_t &addr = fd_manager.get_info(fd64).addr; + if (conn_manager.exist(addr)) { + conn_info_t &conn_info = conn_manager.find_insert(addr); + /* Zero-copy: pass a pointer sizeof(u32_t) bytes before the payload so the conv header can be written in place, with the length clamped to buf_len - sizeof(u32_t). + NOTE(review): assumes the ring buffers reserve URING_RECV_HEADROOM >= sizeof(u32_t) bytes before recv_buf.data -- confirm in io_uring_recv.h. */ + char *data = recv_buf.data - sizeof(u32_t); + int data_len = recv_buf.len < (int)(buf_len - sizeof(u32_t)) ? recv_buf.len : (int)(buf_len - sizeof(u32_t)); + server_process_remote_packet(conn_info, fd64, data, data_len); + } + } + uring_recycle_buf(ctx, recv_buf.buf_id); + } + } + + if (!more) { + if (type == URING_TAG_SERVER_LOCAL) { + uring_add_multishot_recvmsg(ctx, local_listen_fd, cqe->user_data); + need_submit = 1; + } else if (type == URING_TAG_SERVER_REMOTE) { + fd64_t fd64 = (fd64_t)uring_tag_payload(cqe->user_data); + if (fd_manager.exist(fd64)) { + uring_add_multishot_recv(ctx, fd_manager.to_fd(fd64), cqe->user_data); + need_submit = 1; + } + } + } + } + + /* Single batched advance + buffer commit */ + uring_cq_advance(ctx, ready); + uring_buf_ring_commit(ctx); + + /* Submit any re-arms and flush deferred completions in one syscall */ + if (need_submit) + uring_submit_and_flush(ctx); + else + uring_flush(ctx); + } +} + +static void server_uring_cb(struct ev_loop *loop, struct ev_io *watcher, int revents) { + assert(!(revents & EV_ERROR)); + server_uring_drain(loop); +} +#endif + int tunnel_server_event_loop() { int i, j, k; int ret; @@ -349,17 +461,27 @@ int tunnel_server_event_loop() { //
mylog(log_fatal,"add udp_listen_fd error\n"); // myexit(-1); // } + int use_uring = 0; +#ifdef __linux__ + server_local_listen_fd = local_listen_fd; + if (uring_init(&server_uring_ctx, 64, 256, buf_len) == 0) { + g_uring_ctx = &server_uring_ctx; + static struct ev_io uring_watcher; + ev_io_init(&uring_watcher, server_uring_cb, server_uring_ctx.ring_fd, EV_READ); + ev_io_start(loop, &uring_watcher); + + uring_add_multishot_recvmsg(&server_uring_ctx, local_listen_fd, + uring_tag(URING_TAG_SERVER_LOCAL, 0)); + uring_submit(&server_uring_ctx); + use_uring = 1; + mylog(log_info, "io_uring: active for server sockets\n"); + } +#endif + struct ev_io local_listen_watcher; ev_io_init(&local_listen_watcher, local_listen_cb, local_listen_fd, EV_READ); - ev_io_start(loop, &local_listen_watcher); - - // ev.events = EPOLLIN; - // ev.data.u64 = delay_manager.get_timer_fd(); - // ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, delay_manager.get_timer_fd(), &ev); - // if (ret!= 0) { - // mylog(log_fatal,"add delay_manager.get_timer_fd() error\n"); - // myexit(-1); - // } + if (!use_uring) + ev_io_start(loop, &local_listen_watcher); delay_manager.set_loop_and_cb(loop, delay_manager_cb); diff --git a/xor_spe.S b/xor_spe.S new file mode 100644 index 0000000..d00ca81 --- /dev/null +++ b/xor_spe.S @@ -0,0 +1,159 @@ +/* + * xor_spe.S — SPE-accelerated XOR for PowerPC e500v2 + * + * Uses SPE (Signal Processing Extension) 64-bit operations: + * evldd/evstdd: 8-byte aligned load/store + * evxor: 64-bit XOR + * + * GCC 9+ removed SPE intrinsics, but gas still supports the opcodes. + * Follows the Linux kernel pattern (arch/powerpc/crypto/aes-spe-core.S). + * + * Requires CONFIG_SPE=y in the kernel for context save/restore of the + * upper 32 bits of GPRs.
+ */ + +/* Mark stack as non-executable (suppresses ld warning) */ +.section .note.GNU-stack,"",@progbits + +#ifdef HAVE_PPC_SPE + +.section .text +.globl xor_tile_spe +.type xor_tile_spe, @function + +/* + * void xor_tile_spe(char *data, int len, const char *tile, int tile_len) + * + * XOR data[0..len-1] in place with tile[0..tile_len-1], repeating the tile as needed; returns immediately when len <= 0. + * + * REQUIREMENTS (enforced by C caller in packet_cook.cpp): + * - data MUST be 8-byte aligned + * - tile MUST be 8-byte aligned + * - tile_len MUST be a multiple of 8 (COOK_VEC_WIDTH) + * - Tile offset always starts at 0 (caller pre-rotates if needed) + * + * PPC calling convention: + * r3 = data pointer (8-byte aligned) + * r4 = len + * r5 = tile pointer (8-byte aligned) + * r6 = tile_len (multiple of 8) + * + * Clobbers: r0, r7, r8, r9, r10, r11, r12, ctr, cr0, XER[CA] (addic); r3 is advanced past the processed data + * Uses SPE: upper halves of r7, r9, r10, r11, r12 (r8 is only written by 32-bit integer instructions) + */ +.align 4 +xor_tile_spe: + /* Return immediately if len <= 0 */ + cmpwi %r4, 0 + blelr + + /* r0 = byte offset into tile (always starts at 0) */ + li %r0, 0 + + /* Process 32 bytes per iteration (4x evldd). */ + srwi.
%r7, %r4, 5 /* r7 = len / 32 = iteration count; record form sets cr0 for the beq */ + beq .Ltail8 /* < 32 bytes remaining */ + mtctr %r7 + +.Lmain_loop: + /* Compute tile pointer: r8 = tile + offset */ + add %r8, %r5, %r0 + + /* Load the first 3 of 4 data doublewords; the 4th is loaded into r12 at .Lnw3 */ + evldd %r7, 0(%r3) + evldd %r9, 8(%r3) + evldd %r11, 16(%r3) + + /* Load and XOR first tile doubleword */ + evldd %r10, 0(%r8) + evxor %r7, %r7, %r10 + + /* Advance tile offset by 8 and reset it to 0 once it reaches tile_len; repeated after each doubleword */ + addic %r0, %r0, 8 + cmpw %r0, %r6 + blt .Lnw1 + li %r0, 0 +.Lnw1: + add %r8, %r5, %r0 + evldd %r10, 0(%r8) + evxor %r9, %r9, %r10 + + addic %r0, %r0, 8 + cmpw %r0, %r6 + blt .Lnw2 + li %r0, 0 +.Lnw2: + add %r8, %r5, %r0 + evldd %r10, 0(%r8) + evxor %r11, %r11, %r10 + + addic %r0, %r0, 8 + cmpw %r0, %r6 + blt .Lnw3 + li %r0, 0 +.Lnw3: + add %r8, %r5, %r0 + evldd %r12, 24(%r3) + evldd %r10, 0(%r8) + evxor %r12, %r12, %r10 + + addic %r0, %r0, 8 + cmpw %r0, %r6 + blt .Lnw4 + li %r0, 0 +.Lnw4: + + /* Store 4 XORed doublewords */ + evstdd %r7, 0(%r3) + evstdd %r9, 8(%r3) + evstdd %r11, 16(%r3) + evstdd %r12, 24(%r3) + + addi %r3, %r3, 32 + bdnz .Lmain_loop + +.Ltail8: + /* Process remaining 8-byte chunks (0-3 of them), one evldd/evxor/evstdd each */ + andi. %r7, %r4, 24 /* r7 = (len % 32) & ~7 = remaining 8-byte blocks * 8 */ + beq .Ltail1 + srwi %r7, %r7, 3 + mtctr %r7 + +.Ltail8_loop: + add %r8, %r5, %r0 + evldd %r7, 0(%r3) + evldd %r10, 0(%r8) + evxor %r7, %r7, %r10 + evstdd %r7, 0(%r3) + addi %r3, %r3, 8 + addic %r0, %r0, 8 + cmpw %r0, %r6 + blt .Ltail8_nowrap + li %r0, 0 +.Ltail8_nowrap: + bdnz .Ltail8_loop + +.Ltail1: + /* Process remaining bytes (0-7) one at a time; tile offset advances by 1 and wraps at tile_len */ + andi. %r7, %r4, 7 + beqlr + mtctr %r7 + +.Ltail1_loop: + lbz %r8, 0(%r3) + lbzx %r9, %r5, %r0 + xor %r8, %r8, %r9 + stb %r8, 0(%r3) + addi %r3, %r3, 1 + addic %r0, %r0, 1 + cmpw %r0, %r6 + blt .Ltail1_nowrap + li %r0, 0 +.Ltail1_nowrap: + bdnz .Ltail1_loop + + blr + +.size xor_tile_spe, .-xor_tile_spe + +#endif /* HAVE_PPC_SPE */