diff --git a/.github/workflows/perf-deep.yml b/.github/workflows/perf-deep.yml index 4c9b9f9..5da8e7d 100644 --- a/.github/workflows/perf-deep.yml +++ b/.github/workflows/perf-deep.yml @@ -55,7 +55,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake \ -DVCPKG_TARGET_TRIPLET=x64-linux - cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines + cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines bench_diffgeo - name: Run head deep benchmarks shell: bash @@ -78,6 +78,13 @@ jobs: --benchmark_out=artifacts/perf/head/bench_pipelines_head.json \ --benchmark_out_format=json > artifacts/perf/head/bench_pipelines_head.txt + ./build-release/bench_diffgeo \ + --benchmark_min_time=0.2s \ + --benchmark_repetitions=10 \ + --benchmark_report_aggregates_only=true \ + --benchmark_out=artifacts/perf/head/bench_diffgeo_head.json \ + --benchmark_out_format=json > artifacts/perf/head/bench_diffgeo_head.txt + - name: Build and run baseline deep benchmarks shell: bash run: | @@ -114,6 +121,17 @@ jobs: echo "bench_pipelines target not available on baseline $BASELINE_SHA" > artifacts/perf/base/bench_pipelines_base.txt fi + if cmake --build baseline/build-release --parallel --target bench_diffgeo; then + ./baseline/build-release/bench_diffgeo \ + --benchmark_min_time=0.2s \ + --benchmark_repetitions=10 \ + --benchmark_report_aggregates_only=true \ + --benchmark_out=artifacts/perf/base/bench_diffgeo_base.json \ + --benchmark_out_format=json > artifacts/perf/base/bench_diffgeo_base.txt + else + echo "bench_diffgeo target not available on baseline $BASELINE_SHA" > artifacts/perf/base/bench_diffgeo_base.txt + fi + - name: Compare benchmark deltas shell: bash run: | @@ -141,6 +159,24 @@ jobs: --output-markdown artifacts/perf/reports/bench_pipelines_deep.md \ --output-json artifacts/perf/reports/bench_pipelines_deep.json + if [[ -s 
artifacts/perf/base/bench_diffgeo_base.json ]]; then + python3 scripts/perf/compare_against_main.py \ + --baseline artifacts/perf/base/bench_diffgeo_base.json \ + --current artifacts/perf/head/bench_diffgeo_head.json \ + --baseline-commit "$BASELINE_SHA" \ + --label "Deep run: bench_diffgeo (baseline $BASELINE_SHA)" \ + --output-markdown artifacts/perf/reports/bench_diffgeo_deep.md \ + --output-json artifacts/perf/reports/bench_diffgeo_deep.json + else + python3 scripts/perf/compare_against_main.py \ + --baseline artifacts/perf/base/bench_diffgeo_base.txt \ + --current artifacts/perf/head/bench_diffgeo_head.json \ + --baseline-commit "$BASELINE_SHA" \ + --label "Deep run: bench_diffgeo (baseline $BASELINE_SHA)" \ + --output-markdown artifacts/perf/reports/bench_diffgeo_deep.md \ + --output-json artifacts/perf/reports/bench_diffgeo_deep.json + fi + { echo "# Perf Deep Report" echo @@ -151,6 +187,8 @@ jobs: cat artifacts/perf/reports/bench_dod_deep.md echo cat artifacts/perf/reports/bench_pipelines_deep.md + echo + cat artifacts/perf/reports/bench_diffgeo_deep.md } > artifacts/perf/reports/deep-summary.md - name: Build compact CSV summary @@ -162,7 +200,7 @@ jobs: out = Path("artifacts/perf/reports/deep-summary.csv") rows = ["suite,benchmark,baseline_ns,current_ns,delta_pct,status"] - for suite in ("bench_geometry_deep", "bench_dod_deep", "bench_pipelines_deep"): + for suite in ("bench_geometry_deep", "bench_dod_deep", "bench_pipelines_deep", "bench_diffgeo_deep"): path = Path(f"artifacts/perf/reports/{suite}.json") if not path.exists(): continue diff --git a/.github/workflows/perf-smoke.yml b/.github/workflows/perf-smoke.yml index cbe3135..d1d62a6 100644 --- a/.github/workflows/perf-smoke.yml +++ b/.github/workflows/perf-smoke.yml @@ -92,7 +92,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake \ -DVCPKG_TARGET_TRIPLET=x64-linux - cmake --build build-release --parallel --target bench_geometry 
bench_dod bench_pipelines + cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines bench_diffgeo - name: Run head smoke benchmarks shell: bash @@ -117,6 +117,14 @@ jobs: --benchmark_out=artifacts/perf/head/bench_pipelines_head.json \ --benchmark_out_format=json > artifacts/perf/head/bench_pipelines_head.txt + ./build-release/bench_diffgeo \ + --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/1000/50/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/1000/50/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' \ + --benchmark_min_time=0.05s \ + --benchmark_repetitions=5 \ + --benchmark_report_aggregates_only=true \ + --benchmark_out=artifacts/perf/head/bench_diffgeo_head.json \ + --benchmark_out_format=json > artifacts/perf/head/bench_diffgeo_head.txt + - name: Build and run baseline smoke benchmarks shell: bash run: | @@ -155,6 +163,18 @@ jobs: echo "bench_pipelines target not available on baseline ${{ steps.baseline.outputs.base_sha }}" > artifacts/perf/base/bench_pipelines_base.txt fi + if cmake --build baseline/build-release --parallel --target bench_diffgeo; then + ./baseline/build-release/bench_diffgeo \ + --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/1000/50/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/1000/50/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' \ + --benchmark_min_time=0.05s \ + --benchmark_repetitions=5 \ + --benchmark_report_aggregates_only=true 
\ + --benchmark_out=artifacts/perf/base/bench_diffgeo_base.json \ + --benchmark_out_format=json > artifacts/perf/base/bench_diffgeo_base.txt + else + echo "bench_diffgeo target not available on baseline ${{ steps.baseline.outputs.base_sha }}" > artifacts/perf/base/bench_diffgeo_base.txt + fi + - name: Compare benchmark deltas shell: bash run: | @@ -216,6 +236,24 @@ jobs: > artifacts/perf/reports/bench_pipelines_smoke.json fi + if [[ -s artifacts/perf/base/bench_diffgeo_base.json ]]; then + python3 scripts/perf/compare_against_main.py \ + --baseline artifacts/perf/base/bench_diffgeo_base.json \ + --current artifacts/perf/head/bench_diffgeo_head.json \ + --baseline-commit "${{ steps.baseline.outputs.base_sha }}" \ + --label "PR smoke: bench_diffgeo (baseline ${{ steps.baseline.outputs.base_sha }})" \ + --output-markdown artifacts/perf/reports/bench_diffgeo_smoke.md \ + --output-json artifacts/perf/reports/bench_diffgeo_smoke.json + else + python3 scripts/perf/compare_against_main.py \ + --baseline artifacts/perf/base/bench_diffgeo_base.txt \ + --current artifacts/perf/head/bench_diffgeo_head.json \ + --baseline-commit "${{ steps.baseline.outputs.base_sha }}" \ + --label "PR smoke: bench_diffgeo (baseline ${{ steps.baseline.outputs.base_sha }})" \ + --output-markdown artifacts/perf/reports/bench_diffgeo_smoke.md \ + --output-json artifacts/perf/reports/bench_diffgeo_smoke.json + fi + { echo "# Perf Smoke Report" echo @@ -229,6 +267,8 @@ jobs: cat artifacts/perf/reports/bench_dod_smoke.md echo cat artifacts/perf/reports/bench_pipelines_smoke.md + echo + cat artifacts/perf/reports/bench_diffgeo_smoke.md } > artifacts/perf/reports/smoke-summary.md - name: Publish job summary diff --git a/include/igneous/ops/dec/curvature.hpp b/include/igneous/ops/dec/curvature.hpp index 4b8e653..3c5fe54 100644 --- a/include/igneous/ops/dec/curvature.hpp +++ b/include/igneous/ops/dec/curvature.hpp @@ -44,6 +44,130 @@ void compute_curvature_measures(const data::Space& space, std::vecto 
workspace.face_normals.resize(num_faces); + if constexpr (std::is_same_v) { + const auto& x = space.x; + const auto& y = space.y; + const auto& z = space.z; + const auto& face_v0 = structure.face_v0; + const auto& face_v1 = structure.face_v1; + const auto& face_v2 = structure.face_v2; + const auto& vertex_face_offsets = structure.vertex_face_offsets; + const auto& vertex_face_data = structure.vertex_face_data; + + core::parallel_for_index( + 0, static_cast(num_faces), + [&](int face_idx) { + const size_t f = static_cast(face_idx); + const uint32_t i0 = face_v0[f]; + const uint32_t i1 = face_v1[f]; + const uint32_t i2 = face_v2[f]; + + const float e10x = x[i1] - x[i0]; + const float e10y = y[i1] - y[i0]; + const float e10z = z[i1] - z[i0]; + const float e20x = x[i2] - x[i0]; + const float e20y = y[i2] - y[i0]; + const float e20z = z[i2] - z[i0]; + + workspace.face_normals[f] = { + e10x * e20y - e10y * e20x, + e10y * e20z - e10z * e20y, + e10z * e20x - e10x * e20z, + }; + }, + 256); + + core::parallel_for_index( + 0, static_cast(num_verts), + [&](int vertex_idx) { + const size_t i = static_cast(vertex_idx); + const uint32_t begin = vertex_face_offsets[i]; + const uint32_t end = vertex_face_offsets[i + 1]; + if (begin == end) { + return; + } + + float angle_sum = 0.0f; + float area_sum = 0.0f; + + float n_xy = 0.0f; + float n_yz = 0.0f; + float n_zx = 0.0f; + + float sum_x = 0.0f; + float sum_y = 0.0f; + float sum_z = 0.0f; + + const float px = x[i]; + const float py = y[i]; + const float pz = z[i]; + + for (uint32_t idx = begin; idx < end; ++idx) { + const uint32_t f_idx = vertex_face_data[idx]; + const core::Bivec3 fn = workspace.face_normals[f_idx]; + n_xy += fn.xy; + n_yz += fn.yz; + n_zx += fn.zx; + + const uint32_t i0 = face_v0[f_idx]; + const uint32_t i1 = face_v1[f_idx]; + const uint32_t i2 = face_v2[f_idx]; + + uint32_t a = i0; + uint32_t b = i1; + if (i0 == i) { + a = i1; + b = i2; + } else if (i1 == i) { + a = i2; + b = i0; + } + + const float ux = x[a] - 
px; + const float uy = y[a] - py; + const float uz = z[a] - pz; + const float vx = x[b] - px; + const float vy = y[b] - py; + const float vz = z[b] - pz; + + const float dot = ux * vx + uy * vy + uz * vz; + const float wedge_xy = ux * vy - uy * vx; + const float wedge_yz = uy * vz - uz * vy; + const float wedge_zx = uz * vx - ux * vz; + const float wedge_mag = + std::sqrt(wedge_xy * wedge_xy + wedge_yz * wedge_yz + wedge_zx * wedge_zx); + + angle_sum += std::atan2(wedge_mag, dot); + area_sum += 0.5f * wedge_mag; + + sum_x += x[a] + x[b]; + sum_y += y[a] + y[b]; + sum_z += z[a] + z[b]; + } + + if (area_sum > 1e-12f) { + K[i] = static_cast((2.0 * std::numbers::pi_v - angle_sum) / + (static_cast(area_sum) / 3.0)); + } + + const float n_mag_sq = n_xy * n_xy + n_yz * n_yz + n_zx * n_zx; + const float n_inv = (n_mag_sq > 1e-12f) ? 1.0f / std::sqrt(n_mag_sq) : 0.0f; + + const float normal_x = n_yz * n_inv; + const float normal_y = n_zx * n_inv; + const float normal_z = n_xy * n_inv; + + const float inv_c = 0.5f / static_cast(end - begin); + const float laplacian_x = sum_x * inv_c - px; + const float laplacian_y = sum_y * inv_c - py; + const float laplacian_z = sum_z * inv_c - pz; + + H[i] = laplacian_x * normal_x + laplacian_y * normal_y + laplacian_z * normal_z; + }, + 128); + return; + } + const auto get_face_vertex = [&](size_t face_idx, int corner) -> uint32_t { if constexpr (std::is_same_v) { if (corner == 0) diff --git a/notes/perf/20260306-wave3/journal.md b/notes/perf/20260306-wave3/journal.md index a69cfe3..8cff937 100644 --- a/notes/perf/20260306-wave3/journal.md +++ b/notes/perf/20260306-wave3/journal.md @@ -1281,3 +1281,222 @@ Use one entry per optimization hypothesis. - The end-to-end diffgeo pipeline regressed and the operator-phase movement was too noisy to justify keeping a larger cache/matrixization rewrite. 
- Notes: - The accepted Hodge mixed-gamma matrixization did not carry over cleanly to the generic form workspace; the remaining diffgeo bottleneck is not simply the mixed-gamma cache fill. + +## Hypothesis: add diffgeo coverage to CI smoke and deep perf reports +- Timestamp: 2026-03-07T15:30:00-07:00 +- Commit: `cf653c4` +- Primary target: + - PR smoke/deep visibility for `bench_diffgeo` +- Hypothesis: + - Add `bench_diffgeo` to the GitHub perf workflows and emit a fallback `not comparable` section when the baseline branch predates the target, so CI keeps showing the diffgeo torus timings instead of dropping them. +- Files touched: + - `.github/workflows/perf-smoke.yml` + - `.github/workflows/perf-deep.yml` +- Validation: + - Parsed both workflow YAML files locally. + - Simulated the baseline-missing-target path through `scripts/perf/compare_against_main.py` and verified that the generated markdown still emits current `bench_diffgeo` values. +- Decision: `kept` +- Notes: + - This is instrumentation only; it does not change runtime behavior. + +## Hypothesis: weighted coordinate seed for large-basis symmetric eigensolve +- Timestamp: 2026-03-07T15:31:00-07:00 +- Commit: n/a +- Primary target: + - `bench_diffgeo_phase_eigenbasis/4000/64/32/32` + - `bench_hodge_phase_eigenbasis` +- Hypothesis: + - Keep the accepted symmetric Lanczos path but seed the large-basis solve with a row-sum-weighted coordinate vector to reduce restart time. 
+- Files touched: + - `include/igneous/ops/diffusion/spectral.hpp` +- Smoke results: + - `bench_eigenbasis/2000/16`: `8.422 ms` -> `8.455 ms` + - `bench_pipeline_spectral_main`: `17.435 ms` -> `17.815 ms` + - `bench_pipeline_hodge_main`: `148.242 ms` -> `164.046 ms` + - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `132.063 ms` + - `bench_diffgeo_pipeline/4000/64/32/32`: `148.745 ms` -> `149.748 ms` + - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `111.772 ms` +- Decision: `rejected` +- Rejected because: + - The actual 64-mode hodge/diffgeo guards regressed materially; the seed quality trade did not pay for the extra movement in restart/orthogonalization. + +## Hypothesis: avoid extra copies in CPU Markov multi-step path +- Timestamp: 2026-03-07T15:32:00-07:00 +- Commit: n/a +- Primary target: + - `bench_markov_multi_step/20000/20` + - `bench_pipeline_diffusion_main/100` +- Hypothesis: + - Remove unnecessary whole-vector copies from repeated CPU Markov steps when the input/output buffers are already distinct. +- Files touched: + - `include/igneous/ops/diffusion/geometry.hpp` +- Smoke results: + - `bench_diffusion_build/2000`: `3.449 ms` -> `3.273 ms` + - `bench_markov_step/2000`: `16.169 us` -> `18.376 us` + - `bench_markov_multi_step/2000/20`: `329.497 us` -> `349.644 us` + - `bench_markov_multi_step/20000/20`: `4.356 ms` -> `4.142 ms` + - `bench_pipeline_diffusion_main/20`: `4.330 ms` -> `4.199 ms` + - `bench_pipeline_diffusion_main/100`: `5.344 ms` -> `5.793 ms` +- Test results: + - `test_structure_diffusion_geometry`: pass +- Decision: `rejected` +- Rejected because: + - The copy-avoidance path only helped one large microbenchmark and regressed the higher-priority `/100` diffusion pipeline guard. 
+ +## Hypothesis: short-circuit serial `parallel_for_index` before env/backend lookup +- Timestamp: 2026-03-07T15:33:00-07:00 +- Commit: n/a +- Primary target: + - `bench_diffgeo_phase_eigenbasis/4000/64/32/32` + - `bench_hodge_phase_eigenbasis` +- Hypothesis: + - Skip the backend/env/thread discovery in `parallel_for_index` whenever the loop is already below the serial threshold, because sample traces showed that overhead inside the sparse eigensolve matvec/restart path. +- Files touched: + - `include/igneous/core/parallel.hpp` +- Profiling notes: + - `sample` on `bench_diffgeo_phase_eigenbasis/4000/64/32/32` showed most time in Spectra restart/lanczos code and Eigen dense matvecs, with smaller but visible `parallel_for_index` env/thread lookup overhead. +- Smoke results: + - `bench_eigenbasis/2000/16`: `8.422 ms` -> `8.574 ms` + - `bench_pipeline_spectral_main`: `17.435 ms` -> `17.004 ms` + - `bench_pipeline_hodge_main`: `148.242 ms` -> `149.110 ms` + - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `114.631 ms` + - `bench_diffgeo_pipeline/4000/64/32/32`: `148.745 ms` -> `155.672 ms` + - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `108.934 ms` +- Test results: + - `test_ops_spectral_geometry`: pass + - `test_ops_diffusion_forms`: pass + - `test_ops_hodge`: pass +- Decision: `rejected` +- Rejected because: + - The serial fast-path saved too little in the actual spectral hot loop and regressed the diffgeo eigensolve guard. 
+ +## Hypothesis: DEC-specialized curvature kernel over direct SoA/CSR storage +- Timestamp: 2026-03-07T15:34:00-07:00 +- Commit: `59336a7` +- Primary target: + - `bench_curvature_kernel/400` + - `bench_geometry` `Grid 1000x1000` +- Hypothesis: + - Specialize `compute_curvature_measures()` for `DiscreteExteriorCalculus` so it traverses `face_v*`, `vertex_face_*`, and `space.x/y/z` directly, removing repeated `get_vec3()` calls, generic face-corner dispatch, and temporary `Vec3` construction while preserving the same curvature formulas. +- Files touched: + - `include/igneous/ops/dec/curvature.hpp` +- Benchmark commands: + - `ctest --test-dir build --output-on-failure -R test_ops_curvature_flow` + - `IGNEOUS_BENCH_MODE=1 ./build/bench_geometry | tee notes/perf/20260306-wave3/results/bench_geometry_dec_curvature_20260307-152340.txt` + - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_curvature_kernel/400|bench_flow_kernel/400|bench_mesh_structure_build/400|bench_markov_step/2000|bench_markov_multi_step/2000/20|bench_markov_multi_step/20000/20' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true | tee notes/perf/20260306-wave3/results/bench_dod_dec_curvature_20260307-152340.txt` + - `ctest --test-dir build --output-on-failure -j8` +- Smoke results: + - `bench_curvature_kernel/400`: `888653 ns` -> `816836 ns` CPU (`-8.08%`) + - `bench_geometry` `Grid 1000x1000` curvature: `5.716 ms` -> `5.358 ms` (`-6.26%`) + - `bench_geometry` `Grid 1000x1000` structure: `14.516 ms` -> `13.661 ms` (`-5.89%`) + - `bench_flow_kernel/400`: `196297 ns` -> `195466 ns` CPU (`-0.42%`) + - `bench_markov_multi_step/20000/20`: `4.356 ms` -> `4.099 ms` (`-5.88%`, guard unchanged by unrelated variance) +- Test results: + - `test_ops_curvature_flow`: pass + - full `ctest`: `14/14` pass +- Decision: `kept` +- Notes: + - The first single `bench_geometry` rerun was noisy on the 1e6-vertex case; follow-up repetitions settled in below the 
pre-change baseline, so the isolated DOD kernel and large-grid geometry guard both moved in the right direction. + +## Hypothesis: hoist CPU Markov-step dispatch and parallelize repeated row sweeps +- Timestamp: 2026-03-07T15:43:00-07:00 +- Commit: n/a +- Primary target: + - `bench_markov_multi_step/20000/20` + - `bench_pipeline_diffusion_main/20` + - `bench_pipeline_diffusion_main/100` +- Hypothesis: + - Move the CPU CSR row sweep into a shared helper, compute backend/worker selection once per repeated Markov call, and enable parallel row execution only when `row_step_work` is large enough to amortize the worker-pool overhead. +- Files touched: + - `include/igneous/ops/diffusion/geometry.hpp` +- Smoke results: + - `bench_diffusion_build/2000`: `3.438 ms` -> `3.222 ms` + - `bench_markov_step/2000`: `17.938 us` -> `20.352 us` + - `bench_markov_multi_step/2000/20`: `350.765 us` -> `382.900 us` + - `bench_markov_multi_step/20000/20`: `3.881 ms` -> `2.091 ms` + - `bench_pipeline_diffusion_main/20`: `4.187 ms` -> `4.927 ms` + - `bench_pipeline_diffusion_main/100`: `5.711 ms` -> `7.972 ms` +- Test results: + - `test_structure_diffusion_geometry`: pass +- Decision: `rejected` +- Rejected because: + - The helper did improve the largest isolated Markov microbenchmark, but it regressed both end-to-end diffusion pipeline guards badly enough that the trade is unacceptable. + +## Hypothesis: parallelize `carre_du_champ` row sweeps directly +- Timestamp: 2026-03-07T15:44:00-07:00 +- Commit: n/a +- Primary target: + - `bench_hodge_phase_curl_energy` + - `bench_pipeline_hodge_main` + - `bench_diffgeo_pipeline/4000/64/32/32` +- Hypothesis: + - Remove the redundant zero-fill in `carre_du_champ()` and parallelize both the mean-center pass and the final row accumulation above a conservative row threshold. 
+- Files touched: + - `include/igneous/ops/diffusion/geometry.hpp` +- Smoke results before diffgeo abort: + - `bench_pipeline_hodge_main`: `153.294 ms` -> `149.364 ms` + - `bench_hodge_phase_gram`: `1.514 ms` -> `1.409 ms` + - `bench_hodge_phase_weak_derivative`: `1.112 ms` -> `1.112 ms` + - `bench_hodge_phase_curl_energy`: `12.130 ms` -> `10.827 ms` +- Diffgeo behavior: + - `bench_diffgeo` did not return under the modified kernel and had to be terminated. +- Decision: `rejected` +- Rejected because: + - The row-parallel gamma path likely nests into existing worker-pool parallelism in the generic form assembly, which makes it unsafe for the diffgeo pipeline despite the hodge-side movement. + +## Hypothesis: scalarize ambient form reconstruction in diffgeo/hodge apps and benches +- Timestamp: 2026-03-07T15:45:00-07:00 +- Commit: n/a +- Primary target: + - `bench_diffgeo_pipeline/4000/64/32/32` + - `igneous-diffusion-geometry --n-points 1000` +- Hypothesis: + - Replace the per-point `Eigen::Matrix3f` reconstruction path with direct scalar formulas and pointer-based column loads for both 1-form and dual-2-form ambient recovery. +- Files touched: + - `benches/bench_diffgeo.cpp` + - `src/main_diffusion_geometry.cpp` + - `src/main_hodge.cpp` +- Smoke results: + - `bench_diffgeo_pipeline/4000/64/32/32`: `149.864 ms` -> `153.718 ms` + - app guard `igneous-diffusion-geometry --n-points 1000 --output-dir <dir>`: `real 0.07` -> `real 0.09` +- Test results: + - `test_diffgeo_cli_outputs`: pass + - `test_hodge_cli_outputs`: pass +- Decision: `rejected` +- Rejected because: + - The scalar rewrite is mathematically equivalent, but on the current head it regressed both the diffgeo benchmark guard and the app-level wall-time guard. 
+ +## Hypothesis: cache backend/thread env lookups for `parallel_for_index` +- Timestamp: 2026-03-07T15:46:00-07:00 +- Commit: n/a +- Primary target: + - `bench_diffgeo_phase_eigenbasis/4000/64/32/32` + - `bench_hodge_phase_eigenbasis` + - `bench_pipeline_hodge_main` +- Hypothesis: + - Read `IGNEOUS_BACKEND` and `IGNEOUS_NUM_THREADS` once per process instead of reparsing them on every `parallel_for_index()` call, to remove repeated `getenv` and `hardware_concurrency()` overhead from the eigensolve-heavy paths. +- Files touched: + - `include/igneous/core/parallel.hpp` +- Smoke results: + - `bench_eigenbasis/2000/16`: `8.199 ms` -> `8.539 ms` + - `bench_pipeline_spectral_main`: `16.783 ms` -> `18.089 ms` + - `bench_pipeline_hodge_main`: `153.294 ms` -> `160.077 ms` + - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `123.248 ms` + - `bench_diffgeo_pipeline/4000/64/32/32`: `149.864 ms` -> `161.629 ms` + - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `113.714 ms` +- Test results: + - `test_ops_spectral_geometry`: pass + - `test_ops_diffusion_forms`: pass + - `test_ops_hodge`: pass +- Decision: `rejected` +- Rejected because: + - Even if the cached env lookup slightly helps tiny loops, it regresses the actual 64-mode guard path across spectral, hodge, and diffgeo. + +## Profiling: current diffgeo pipeline +- Timestamp: 2026-03-07T15:47:00-07:00 +- Artifact: + - `notes/perf/20260306-wave3/profiles/20260307-154012-diffgeo-pipeline/sample.txt` +- Main observations: + - The diffgeo pipeline is still dominated by the large sparse diffusion eigensolve. + - The same Spectra restart/lanczos path and Eigen dense orthogonalization matvecs remain hot on the current head. + - The remaining `parallel_for_index` env/thread lookup overhead is visible in samples, but it is clearly not large enough to justify the regression from the cached-env experiment. 
diff --git a/notes/perf/20260306-wave3/prior-art.md b/notes/perf/20260306-wave3/prior-art.md index 87a9278..6ca7492 100644 --- a/notes/perf/20260306-wave3/prior-art.md +++ b/notes/perf/20260306-wave3/prior-art.md @@ -38,6 +38,13 @@ This campaign should build on previous accepted optimizations and avoid repeatin - Replacing the symmetric normalized-kernel solver with Spectra's Davidson path; it remained much slower than the accepted Lanczos solver even with a direct CSR adapter. - Matrixizing `Gamma(data_coord, immersion_coord)` for diffgeo reconstruction; the diffgeo pipeline stayed effectively flat, so that precompute is not worth revisiting on the current head. - Matrixizing the mean-centered `DiffusionFormWorkspace` mixed-gamma cache fill; the diffgeo pipeline regressed and the operator-phase movement was too noisy to justify the larger cache rewrite. +- Weighted coordinate seeds for the large-basis symmetric diffusion eigensolve; they regressed the 64-mode hodge/diffgeo guards. +- CPU Markov multi-step copy-avoidance rewrites that change ping/pong ownership; they only help isolated large microcases and regress the higher-priority pipeline guard. +- Early serial short-circuiting inside `parallel_for_index`; the env/backend lookup overhead is not the dominant sparse-eigensolve cost on the current head. +- Hoisting CPU Markov repeated-step dispatch into a parallel row helper; it helps one large microbenchmark but regresses the diffusion pipeline guards. +- Parallel row-sweep versions of `carre_du_champ`; they interact badly with nested worker-pool usage in the generic form path. +- Scalarized ambient reconstruction for diffgeo/hodge export paths; it regressed the diffgeo pipeline and app guard on the current head. +- Caching `IGNEOUS_BACKEND` / `IGNEOUS_NUM_THREADS` once per process; it regressed the 64-mode spectral/hodge/diffgeo guard path. ## New Focus - Benchmark and optimize the generic k-form pipeline, which was added after most of the earlier perf work. 
diff --git a/notes/perf/20260306-wave3/report.md b/notes/perf/20260306-wave3/report.md index 188c675..d6be220 100644 --- a/notes/perf/20260306-wave3/report.md +++ b/notes/perf/20260306-wave3/report.md @@ -120,3 +120,44 @@ Using the Wave 3 artifacts in this directory: - a new eigensolver trade that improves diffusion eigenbasis without shifting too much cost back into diffusion build, - a mathematically exact reduction of the remaining scalar spectrum/reconstruction overhead, - or another order-preserving data-oriented rewrite outside diffusion geometry like the accepted DEC face-incidence pass. + +## Post-PR Continuation + +### Kept Since The PR Was Opened +- `cf653c4` `ci(perf): add diffgeo coverage to smoke and deep reports` +- `59336a7` `perf(dec): specialize curvature traversal for DEC storage` + +### Current Additional Gains +- DEC curvature: + - `bench_curvature_kernel/400`: `888653 ns -> 816836 ns` CPU (`-8.08%`) + - `bench_geometry` `Grid 1000x1000` curvature: `5.716 ms -> 5.358 ms` (`-6.26%`) + - `bench_geometry` `Grid 1000x1000` structure: `14.516 ms -> 13.661 ms` (`-5.89%`) + +### New Rejections +- Weighted coordinate seed for the large-basis symmetric eigensolve. +- CPU Markov multi-step copy-avoidance rewrite. +- Early serial short-circuit in `parallel_for_index`. + +### Current Read Of The Remaining Work +- The large sparse diffusion eigensolve is still the biggest shared runtime cost. +- The geometry/DEC path still has room for order-preserving data-oriented rewrites, but the wins are now single-digit rather than order-of-magnitude. +- GitHub perf smoke/deep will now surface `bench_diffgeo` even when the baseline branch predates that benchmark target. + +## Continuation Outcome + +### Additional Rejections After The DEC Curvature Pass +- Hoisted/parallel CPU Markov repeated-step helper: improved one large microbenchmark but regressed both diffusion pipeline guards. 
+- Parallel `carre_du_champ` row sweeps: helped some hodge-side phases but appear unsafe under diffgeo due to nested worker-pool behavior. +- Scalarized ambient reconstruction in diffgeo/hodge apps and benches: regressed both the diffgeo benchmark guard and the diffgeo app wall-time guard. +- Cached backend/thread env lookups in `parallel_for_index`: regressed the 64-mode spectral/hodge/diffgeo guard path. + +### New Profile Artifact +- Current diffgeo pipeline sample: + - `notes/perf/20260306-wave3/profiles/20260307-154012-diffgeo-pipeline/sample.txt` +- Readout: + - The large sparse diffusion eigensolve is still the dominant remaining shared cost. + - The next most likely wins require a deeper solver/orthogonalization change rather than another local cache or loop rewrite. + +### Stop Condition +- This continuation hit multiple consecutive rejected fronts after the accepted DEC curvature pass. +- The remaining plausible wins are now mostly in the large-basis eigensolver path, where the recent profiler-guided experiments did not clear the guard thresholds.