diff --git a/.github/workflows/perf-deep.yml b/.github/workflows/perf-deep.yml
index 4c9b9f9..5da8e7d 100644
--- a/.github/workflows/perf-deep.yml
+++ b/.github/workflows/perf-deep.yml
@@ -55,7 +55,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake \
             -DVCPKG_TARGET_TRIPLET=x64-linux
-          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines
+          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines bench_diffgeo
 
       - name: Run head deep benchmarks
         shell: bash
@@ -78,6 +78,13 @@ jobs:
             --benchmark_out=artifacts/perf/head/bench_pipelines_head.json \
             --benchmark_out_format=json > artifacts/perf/head/bench_pipelines_head.txt
 
+          ./build-release/bench_diffgeo \
+            --benchmark_min_time=0.2s \
+            --benchmark_repetitions=10 \
+            --benchmark_report_aggregates_only=true \
+            --benchmark_out=artifacts/perf/head/bench_diffgeo_head.json \
+            --benchmark_out_format=json > artifacts/perf/head/bench_diffgeo_head.txt
+
       - name: Build and run baseline deep benchmarks
         shell: bash
         run: |
@@ -114,6 +121,17 @@ jobs:
             echo "bench_pipelines target not available on baseline $BASELINE_SHA" > artifacts/perf/base/bench_pipelines_base.txt
           fi
 
+          if cmake --build baseline/build-release --parallel --target bench_diffgeo; then
+            ./baseline/build-release/bench_diffgeo \
+              --benchmark_min_time=0.2s \
+              --benchmark_repetitions=10 \
+              --benchmark_report_aggregates_only=true \
+              --benchmark_out=artifacts/perf/base/bench_diffgeo_base.json \
+              --benchmark_out_format=json > artifacts/perf/base/bench_diffgeo_base.txt
+          else  # baseline build of bench_diffgeo failed: only a marker .txt is written, no base JSON
+            echo "bench_diffgeo target not available on baseline $BASELINE_SHA" > artifacts/perf/base/bench_diffgeo_base.txt
+          fi
+
       - name: Compare benchmark deltas
         shell: bash
         run: |
@@ -141,6 +159,24 @@ jobs:
             --output-markdown artifacts/perf/reports/bench_pipelines_deep.md \
             --output-json artifacts/perf/reports/bench_pipelines_deep.json
 
+          if [[ -s artifacts/perf/base/bench_diffgeo_base.json ]]; then  # non-empty baseline JSON exists
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.json \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "$BASELINE_SHA" \
+              --label "Deep run: bench_diffgeo (baseline $BASELINE_SHA)" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_deep.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_deep.json
+          else  # no baseline JSON: same invocation, but --baseline points at the .txt file instead
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.txt \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "$BASELINE_SHA" \
+              --label "Deep run: bench_diffgeo (baseline $BASELINE_SHA)" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_deep.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_deep.json
+          fi
+
           {
             echo "# Perf Deep Report"
             echo
@@ -151,6 +187,8 @@ jobs:
             cat artifacts/perf/reports/bench_dod_deep.md
             echo
             cat artifacts/perf/reports/bench_pipelines_deep.md
+            echo
+            cat artifacts/perf/reports/bench_diffgeo_deep.md
           } > artifacts/perf/reports/deep-summary.md
 
       - name: Build compact CSV summary
@@ -162,7 +200,7 @@ jobs:
 
           out = Path("artifacts/perf/reports/deep-summary.csv")
           rows = ["suite,benchmark,baseline_ns,current_ns,delta_pct,status"]
-          for suite in ("bench_geometry_deep", "bench_dod_deep", "bench_pipelines_deep"):
+          for suite in ("bench_geometry_deep", "bench_dod_deep", "bench_pipelines_deep", "bench_diffgeo_deep"):
             path = Path(f"artifacts/perf/reports/{suite}.json")
             if not path.exists():
               continue
diff --git a/.github/workflows/perf-smoke.yml b/.github/workflows/perf-smoke.yml
index cbe3135..d1d62a6 100644
--- a/.github/workflows/perf-smoke.yml
+++ b/.github/workflows/perf-smoke.yml
@@ -92,7 +92,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake \
             -DVCPKG_TARGET_TRIPLET=x64-linux
-          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines
+          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines bench_diffgeo
 
       - name: Run head smoke benchmarks
         shell: bash
@@ -117,6 +117,14 @@ jobs:
             --benchmark_out=artifacts/perf/head/bench_pipelines_head.json \
             --benchmark_out_format=json > artifacts/perf/head/bench_pipelines_head.txt
 
+          ./build-release/bench_diffgeo \
+            --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/1000/50/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/1000/50/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' \
+            --benchmark_min_time=0.05s \
+            --benchmark_repetitions=5 \
+            --benchmark_report_aggregates_only=true \
+            --benchmark_out=artifacts/perf/head/bench_diffgeo_head.json \
+            --benchmark_out_format=json > artifacts/perf/head/bench_diffgeo_head.txt
+
       - name: Build and run baseline smoke benchmarks
         shell: bash
         run: |
@@ -155,6 +163,18 @@ jobs:
             echo "bench_pipelines target not available on baseline ${{ steps.baseline.outputs.base_sha }}" > artifacts/perf/base/bench_pipelines_base.txt
           fi
 
+          if cmake --build baseline/build-release --parallel --target bench_diffgeo; then
+            ./baseline/build-release/bench_diffgeo \
+              --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/1000/50/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/1000/50/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' \
+              --benchmark_min_time=0.05s \
+              --benchmark_repetitions=5 \
+              --benchmark_report_aggregates_only=true \
+              --benchmark_out=artifacts/perf/base/bench_diffgeo_base.json \
+              --benchmark_out_format=json > artifacts/perf/base/bench_diffgeo_base.txt
+          else  # baseline build of bench_diffgeo failed: only a marker .txt is written, no base JSON
+            echo "bench_diffgeo target not available on baseline ${{ steps.baseline.outputs.base_sha }}" > artifacts/perf/base/bench_diffgeo_base.txt
+          fi
+
       - name: Compare benchmark deltas
         shell: bash
         run: |
@@ -216,6 +236,24 @@ jobs:
               > artifacts/perf/reports/bench_pipelines_smoke.json
           fi
 
+          if [[ -s artifacts/perf/base/bench_diffgeo_base.json ]]; then  # non-empty baseline JSON exists
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.json \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "${{ steps.baseline.outputs.base_sha }}" \
+              --label "PR smoke: bench_diffgeo (baseline ${{ steps.baseline.outputs.base_sha }})" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_smoke.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_smoke.json
+          else  # no baseline JSON: same invocation, but --baseline points at the .txt file instead
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.txt \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "${{ steps.baseline.outputs.base_sha }}" \
+              --label "PR smoke: bench_diffgeo (baseline ${{ steps.baseline.outputs.base_sha }})" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_smoke.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_smoke.json
+          fi
+
           {
             echo "# Perf Smoke Report"
             echo
@@ -229,6 +267,8 @@ jobs:
             cat artifacts/perf/reports/bench_dod_smoke.md
             echo
             cat artifacts/perf/reports/bench_pipelines_smoke.md
+            echo
+            cat artifacts/perf/reports/bench_diffgeo_smoke.md
           } > artifacts/perf/reports/smoke-summary.md
 
       - name: Publish job summary
diff --git a/include/igneous/ops/dec/curvature.hpp b/include/igneous/ops/dec/curvature.hpp
index 4b8e653..3c5fe54 100644
--- a/include/igneous/ops/dec/curvature.hpp
+++ b/include/igneous/ops/dec/curvature.hpp
@@ -44,6 +44,130 @@ void compute_curvature_measures(const data::Space<StructureT>& space, std::vecto
 
   workspace.face_normals.resize(num_faces);
 
+  if constexpr (std::is_same_v<StructureT, data::DiscreteExteriorCalculus>) {
+    const auto& x = space.x;  // DEC-only path: coordinates are read from separate x/y/z arrays
+    const auto& y = space.y;
+    const auto& z = space.z;
+    const auto& face_v0 = structure.face_v0;  // per-face corner vertex indices
+    const auto& face_v1 = structure.face_v1;
+    const auto& face_v2 = structure.face_v2;
+    const auto& vertex_face_offsets = structure.vertex_face_offsets;  // per-vertex range bounds
+    const auto& vertex_face_data = structure.vertex_face_data;  // incident face indices
+
+    core::parallel_for_index(
+        0, static_cast<int>(num_faces),
+        [&](int face_idx) {  // pass 1: one wedge product per face
+          const size_t f = static_cast<size_t>(face_idx);
+          const uint32_t i0 = face_v0[f];
+          const uint32_t i1 = face_v1[f];
+          const uint32_t i2 = face_v2[f];
+
+          const float e10x = x[i1] - x[i0];  // edge v0 -> v1
+          const float e10y = y[i1] - y[i0];
+          const float e10z = z[i1] - z[i0];
+          const float e20x = x[i2] - x[i0];  // edge v0 -> v2
+          const float e20y = y[i2] - y[i0];
+          const float e20z = z[i2] - z[i0];
+
+          workspace.face_normals[f] = {  // e10 ^ e20, un-normalized; assumes {xy, yz, zx} order
+              e10x * e20y - e10y * e20x,
+              e10y * e20z - e10z * e20y,
+              e10z * e20x - e10x * e20z,
+          };
+        },
+        256);  // NOTE(review): presumably a grain/chunk size for parallel_for_index -- confirm
+
+    core::parallel_for_index(
+        0, static_cast<int>(num_verts),
+        [&](int vertex_idx) {  // pass 2: per-vertex K and H from the incident faces
+          const size_t i = static_cast<size_t>(vertex_idx);
+          const uint32_t begin = vertex_face_offsets[i];
+          const uint32_t end = vertex_face_offsets[i + 1];
+          if (begin == end) {  // no incident faces: K[i] and H[i] are not written
+            return;
+          }
+
+          float angle_sum = 0.0f;
+          float area_sum = 0.0f;
+
+          float n_xy = 0.0f;  // running sum of the incident faces' bivectors
+          float n_yz = 0.0f;
+          float n_zx = 0.0f;
+
+          float sum_x = 0.0f;  // running sum of neighbour positions
+          float sum_y = 0.0f;
+          float sum_z = 0.0f;
+
+          const float px = x[i];
+          const float py = y[i];
+          const float pz = z[i];
+
+          for (uint32_t idx = begin; idx < end; ++idx) {
+            const uint32_t f_idx = vertex_face_data[idx];
+            const core::Bivec3 fn = workspace.face_normals[f_idx];
+            n_xy += fn.xy;
+            n_yz += fn.yz;
+            n_zx += fn.zx;
+
+            const uint32_t i0 = face_v0[f_idx];
+            const uint32_t i1 = face_v1[f_idx];
+            const uint32_t i2 = face_v2[f_idx];
+
+            uint32_t a = i0;  // default: i matches neither i0 nor i1 (presumably i == i2)
+            uint32_t b = i1;
+            if (i0 == i) {  // a, b = the face's other two corners, in cyclic order after i
+              a = i1;
+              b = i2;
+            } else if (i1 == i) {
+              a = i2;
+              b = i0;
+            }
+
+            const float ux = x[a] - px;  // u = a - p
+            const float uy = y[a] - py;
+            const float uz = z[a] - pz;
+            const float vx = x[b] - px;  // v = b - p
+            const float vy = y[b] - py;
+            const float vz = z[b] - pz;
+
+            const float dot = ux * vx + uy * vy + uz * vz;
+            const float wedge_xy = ux * vy - uy * vx;
+            const float wedge_yz = uy * vz - uz * vy;
+            const float wedge_zx = uz * vx - ux * vz;
+            const float wedge_mag =
+                std::sqrt(wedge_xy * wedge_xy + wedge_yz * wedge_yz + wedge_zx * wedge_zx);
+
+            angle_sum += std::atan2(wedge_mag, dot);  // angle between u and v at vertex i
+            area_sum += 0.5f * wedge_mag;  // area of the triangle spanned by u and v
+
+            sum_x += x[a] + x[b];  // two neighbour positions accumulated per incident face
+            sum_y += y[a] + y[b];
+            sum_z += z[a] + z[b];
+          }
+
+          if (area_sum > 1e-12f) {  // K = (2*pi - angle sum) / (incident area / 3)
+            K[i] = static_cast<float>((2.0 * std::numbers::pi_v<double> - angle_sum) /
+                                      (static_cast<double>(area_sum) / 3.0));
+          }  // otherwise K[i] is not written
+
+          const float n_mag_sq = n_xy * n_xy + n_yz * n_yz + n_zx * n_zx;
+          const float n_inv = (n_mag_sq > 1e-12f) ? 1.0f / std::sqrt(n_mag_sq) : 0.0f;
+
+          const float normal_x = n_yz * n_inv;  // (yz, zx, xy) -> (x, y, z); zero when n_inv == 0
+          const float normal_y = n_zx * n_inv;
+          const float normal_z = n_xy * n_inv;
+
+          const float inv_c = 0.5f / static_cast<float>(end - begin);  // 1 / (2 * incident faces)
+          const float laplacian_x = sum_x * inv_c - px;  // mean of accumulated positions minus p
+          const float laplacian_y = sum_y * inv_c - py;
+          const float laplacian_z = sum_z * inv_c - pz;
+
+          H[i] = laplacian_x * normal_x + laplacian_y * normal_y + laplacian_z * normal_z;
+        },
+        128);
+    return;  // DEC handled above; the generic path below is skipped
+  }
+
   const auto get_face_vertex = [&](size_t face_idx, int corner) -> uint32_t {
     if constexpr (std::is_same_v<StructureT, data::DiscreteExteriorCalculus>) {
       if (corner == 0)
diff --git a/notes/perf/20260306-wave3/journal.md b/notes/perf/20260306-wave3/journal.md
index a69cfe3..8cff937 100644
--- a/notes/perf/20260306-wave3/journal.md
+++ b/notes/perf/20260306-wave3/journal.md
@@ -1281,3 +1281,222 @@ Use one entry per optimization hypothesis.
   - The end-to-end diffgeo pipeline regressed and the operator-phase movement was too noisy to justify keeping a larger cache/matrixization rewrite.
 - Notes:
   - The accepted Hodge mixed-gamma matrixization did not carry over cleanly to the generic form workspace; the remaining diffgeo bottleneck is not simply the mixed-gamma cache fill.
+
+## Hypothesis: add diffgeo coverage to CI smoke and deep perf reports
+- Timestamp: 2026-03-07T15:30:00-07:00
+- Commit: `cf653c4`
+- Primary target:
+  - PR smoke/deep visibility for `bench_diffgeo`
+- Hypothesis:
+  - Add `bench_diffgeo` to the GitHub perf workflows and emit a fallback `not comparable` section when the baseline branch predates the target, so CI keeps showing the diffgeo torus timings instead of dropping them.
+- Files touched:
+  - `.github/workflows/perf-smoke.yml`
+  - `.github/workflows/perf-deep.yml`
+- Validation:
+  - Parsed both workflow YAML files locally.
+  - Simulated the baseline-missing-target path through `scripts/perf/compare_against_main.py` and verified that the generated markdown still emits current `bench_diffgeo` values.
+- Decision: `kept`
+- Notes:
+  - This is instrumentation only; it does not change runtime behavior.
+
+## Hypothesis: weighted coordinate seed for large-basis symmetric eigensolve
+- Timestamp: 2026-03-07T15:31:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+- Hypothesis:
+  - Keep the accepted symmetric Lanczos path but seed the large-basis solve with a row-sum-weighted coordinate vector to reduce restart time.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Smoke results:
+  - `bench_eigenbasis/2000/16`: `8.422 ms` -> `8.455 ms`
+  - `bench_pipeline_spectral_main`: `17.435 ms` -> `17.815 ms`
+  - `bench_pipeline_hodge_main`: `148.242 ms` -> `164.046 ms`
+  - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `132.063 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `148.745 ms` -> `149.748 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `111.772 ms`
+- Decision: `rejected`
+- Rejected because:
+  - The actual 64-mode hodge/diffgeo guards regressed materially; the seed quality trade did not pay for the extra movement in restart/orthogonalization.
+
+## Hypothesis: avoid extra copies in CPU Markov multi-step path
+- Timestamp: 2026-03-07T15:32:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_markov_multi_step/20000/20`
+  - `bench_pipeline_diffusion_main/100`
+- Hypothesis:
+  - Remove unnecessary whole-vector copies from repeated CPU Markov steps when the input/output buffers are already distinct.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+- Smoke results:
+  - `bench_diffusion_build/2000`: `3.449 ms` -> `3.273 ms`
+  - `bench_markov_step/2000`: `16.169 us` -> `18.376 us`
+  - `bench_markov_multi_step/2000/20`: `329.497 us` -> `349.644 us`
+  - `bench_markov_multi_step/20000/20`: `4.356 ms` -> `4.142 ms`
+  - `bench_pipeline_diffusion_main/20`: `4.330 ms` -> `4.199 ms`
+  - `bench_pipeline_diffusion_main/100`: `5.344 ms` -> `5.793 ms`
+- Test results:
+  - `test_structure_diffusion_geometry`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The copy-avoidance path only helped one large microbenchmark and regressed the higher-priority `/100` diffusion pipeline guard.
+
+## Hypothesis: short-circuit serial `parallel_for_index` before env/backend lookup
+- Timestamp: 2026-03-07T15:33:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+- Hypothesis:
+  - Skip the backend/env/thread discovery in `parallel_for_index` whenever the loop is already below the serial threshold, because sample traces showed that overhead inside the sparse eigensolve matvec/restart path.
+- Files touched:
+  - `include/igneous/core/parallel.hpp`
+- Profiling notes:
+  - `sample` on `bench_diffgeo_phase_eigenbasis/4000/64/32/32` showed most time in Spectra restart/lanczos code and Eigen dense matvecs, with smaller but visible `parallel_for_index` env/thread lookup overhead.
+- Smoke results:
+  - `bench_eigenbasis/2000/16`: `8.422 ms` -> `8.574 ms`
+  - `bench_pipeline_spectral_main`: `17.435 ms` -> `17.004 ms`
+  - `bench_pipeline_hodge_main`: `148.242 ms` -> `149.110 ms`
+  - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `114.631 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `148.745 ms` -> `155.672 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `108.934 ms`
+- Test results:
+  - `test_ops_spectral_geometry`: pass
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_hodge`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The serial fast-path saved too little in the actual spectral hot loop and regressed the diffgeo eigensolve guard.
+
+## Hypothesis: DEC-specialized curvature kernel over direct SoA/CSR storage
+- Timestamp: 2026-03-07T15:34:00-07:00
+- Commit: `59336a7`
+- Primary target:
+  - `bench_curvature_kernel/400`
+  - `bench_geometry` `Grid 1000x1000`
+- Hypothesis:
+  - Specialize `compute_curvature_measures()` for `DiscreteExteriorCalculus` so it traverses `face_v*`, `vertex_face_*`, and `space.x/y/z` directly, removing repeated `get_vec3()` calls, generic face-corner dispatch, and temporary `Vec3` construction while preserving the same curvature formulas.
+- Files touched:
+  - `include/igneous/ops/dec/curvature.hpp`
+- Benchmark commands:
+  - `ctest --test-dir build --output-on-failure -R test_ops_curvature_flow`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_geometry | tee notes/perf/20260306-wave3/results/bench_geometry_dec_curvature_20260307-152340.txt`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_curvature_kernel/400|bench_flow_kernel/400|bench_mesh_structure_build/400|bench_markov_step/2000|bench_markov_multi_step/2000/20|bench_markov_multi_step/20000/20' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true | tee notes/perf/20260306-wave3/results/bench_dod_dec_curvature_20260307-152340.txt`
+  - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - `bench_curvature_kernel/400`: `888653 ns` -> `816836 ns` CPU (`-8.08%`)
+  - `bench_geometry` `Grid 1000x1000` curvature: `5.716 ms` -> `5.358 ms` (`-6.26%`)
+  - `bench_geometry` `Grid 1000x1000` structure: `14.516 ms` -> `13.661 ms` (`-5.89%`)
+  - `bench_flow_kernel/400`: `196297 ns` -> `195466 ns` CPU (`-0.42%`)
+  - `bench_markov_multi_step/20000/20`: `4.356 ms` -> `4.099 ms` (`-5.88%`; unrelated guard, this change does not touch the Markov path, so the movement is run-to-run variance)
+- Test results:
+  - `test_ops_curvature_flow`: pass
+  - full `ctest`: `14/14` pass
+- Decision: `kept`
+- Notes:
+  - The first single `bench_geometry` rerun was noisy on the 1e6-vertex case; follow-up repetitions settled in below the pre-change baseline, so the isolated DOD kernel and large-grid geometry guard both moved in the right direction.
+
+## Hypothesis: hoist CPU Markov-step dispatch and parallelize repeated row sweeps
+- Timestamp: 2026-03-07T15:43:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_markov_multi_step/20000/20`
+  - `bench_pipeline_diffusion_main/20`
+  - `bench_pipeline_diffusion_main/100`
+- Hypothesis:
+  - Move the CPU CSR row sweep into a shared helper, compute backend/worker selection once per repeated Markov call, and enable parallel row execution only when `row_step_work` is large enough to amortize the worker-pool overhead.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+- Smoke results:
+  - `bench_diffusion_build/2000`: `3.438 ms` -> `3.222 ms`
+  - `bench_markov_step/2000`: `17.938 us` -> `20.352 us`
+  - `bench_markov_multi_step/2000/20`: `350.765 us` -> `382.900 us`
+  - `bench_markov_multi_step/20000/20`: `3.881 ms` -> `2.091 ms`
+  - `bench_pipeline_diffusion_main/20`: `4.187 ms` -> `4.927 ms`
+  - `bench_pipeline_diffusion_main/100`: `5.711 ms` -> `7.972 ms`
+- Test results:
+  - `test_structure_diffusion_geometry`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The helper did improve the largest isolated Markov microbenchmark, but it regressed both end-to-end diffusion pipeline guards badly enough that the trade is unacceptable.
+
+## Hypothesis: parallelize `carre_du_champ` row sweeps directly
+- Timestamp: 2026-03-07T15:44:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_hodge_phase_curl_energy`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Remove the redundant zero-fill in `carre_du_champ()` and parallelize both the mean-center pass and the final row accumulation above a conservative row threshold.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+- Smoke results before diffgeo abort:
+  - `bench_pipeline_hodge_main`: `153.294 ms` -> `149.364 ms`
+  - `bench_hodge_phase_gram`: `1.514 ms` -> `1.409 ms`
+  - `bench_hodge_phase_weak_derivative`: `1.112 ms` -> `1.112 ms`
+  - `bench_hodge_phase_curl_energy`: `12.130 ms` -> `10.827 ms`
+- Diffgeo behavior:
+  - `bench_diffgeo` did not return under the modified kernel and had to be terminated.
+- Decision: `rejected`
+- Rejected because:
+  - The row-parallel gamma path likely nests into existing worker-pool parallelism in the generic form assembly, which makes it unsafe for the diffgeo pipeline (`bench_diffgeo` hung) despite the hodge-side improvements.
+
+## Hypothesis: scalarize ambient form reconstruction in diffgeo/hodge apps and benches
+- Timestamp: 2026-03-07T15:45:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+  - `igneous-diffusion-geometry --n-points 1000`
+- Hypothesis:
+  - Replace the per-point `Eigen::Matrix3f` reconstruction path with direct scalar formulas and pointer-based column loads for both 1-form and dual-2-form ambient recovery.
+- Files touched:
+  - `benches/bench_diffgeo.cpp`
+  - `src/main_diffusion_geometry.cpp`
+  - `src/main_hodge.cpp`
+- Smoke results:
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `149.864 ms` -> `153.718 ms`
+  - app guard `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.07` -> `real 0.09`
+- Test results:
+  - `test_diffgeo_cli_outputs`: pass
+  - `test_hodge_cli_outputs`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The scalar rewrite is mathematically equivalent, but on the current head it regressed both the diffgeo benchmark guard and the app-level wall-time guard.
+
+## Hypothesis: cache backend/thread env lookups for `parallel_for_index`
+- Timestamp: 2026-03-07T15:46:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_pipeline_hodge_main`
+- Hypothesis:
+  - Read `IGNEOUS_BACKEND` and `IGNEOUS_NUM_THREADS` once per process instead of reparsing them on every `parallel_for_index()` call, to remove repeated `getenv` and `hardware_concurrency()` overhead from the eigensolve-heavy paths.
+- Files touched:
+  - `include/igneous/core/parallel.hpp`
+- Smoke results:
+  - `bench_eigenbasis/2000/16`: `8.199 ms` -> `8.539 ms`
+  - `bench_pipeline_spectral_main`: `16.783 ms` -> `18.089 ms`
+  - `bench_pipeline_hodge_main`: `153.294 ms` -> `160.077 ms`
+  - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `123.248 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `149.864 ms` -> `161.629 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `113.714 ms`
+- Test results:
+  - `test_ops_spectral_geometry`: pass
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_hodge`: pass
+- Decision: `rejected`
+- Rejected because:
+  - Even if the cached env lookup slightly helps tiny loops, it regresses the actual 64-mode guard path across spectral, hodge, and diffgeo.
+
+## Profiling: current diffgeo pipeline
+- Timestamp: 2026-03-07T15:47:00-07:00
+- Artifact:
+  - `notes/perf/20260306-wave3/profiles/20260307-154012-diffgeo-pipeline/sample.txt`
+- Main observations:
+  - The diffgeo pipeline is still dominated by the large sparse diffusion eigensolve.
+  - The same Spectra restart/lanczos path and Eigen dense orthogonalization matvecs remain hot on the current head.
+  - The remaining `parallel_for_index` env/thread lookup overhead is visible in samples, but it is clearly not large enough to justify the regression from the cached-env experiment.
diff --git a/notes/perf/20260306-wave3/prior-art.md b/notes/perf/20260306-wave3/prior-art.md
index 87a9278..6ca7492 100644
--- a/notes/perf/20260306-wave3/prior-art.md
+++ b/notes/perf/20260306-wave3/prior-art.md
@@ -38,6 +38,13 @@ This campaign should build on previous accepted optimizations and avoid repeatin
 - Replacing the symmetric normalized-kernel solver with Spectra's Davidson path; it remained much slower than the accepted Lanczos solver even with a direct CSR adapter.
 - Matrixizing `Gamma(data_coord, immersion_coord)` for diffgeo reconstruction; the diffgeo pipeline stayed effectively flat, so that precompute is not worth revisiting on the current head.
 - Matrixizing the mean-centered `DiffusionFormWorkspace` mixed-gamma cache fill; the diffgeo pipeline regressed and the operator-phase movement was too noisy to justify the larger cache rewrite.
+- Weighted coordinate seeds for the large-basis symmetric diffusion eigensolve; they regressed the 64-mode hodge/diffgeo guards.
+- CPU Markov multi-step copy-avoidance rewrites that change ping/pong ownership; they only help isolated large microbenchmarks and regress the higher-priority pipeline guard.
+- Early serial short-circuiting inside `parallel_for_index`; the env/backend lookup overhead is not the dominant sparse-eigensolve cost on the current head.
+- Hoisting CPU Markov repeated-step dispatch into a parallel row helper; it helps one large microbenchmark but regresses the diffusion pipeline guards.
+- Parallel row-sweep versions of `carre_du_champ`; they interact badly with nested worker-pool usage in the generic form path.
+- Scalarized ambient reconstruction for diffgeo/hodge export paths; it regressed the diffgeo pipeline and app guard on the current head.
+- Caching `IGNEOUS_BACKEND` / `IGNEOUS_NUM_THREADS` once per process; it regressed the 64-mode spectral/hodge/diffgeo guard path.
 
 ## New Focus
 - Benchmark and optimize the generic k-form pipeline, which was added after most of the earlier perf work.
diff --git a/notes/perf/20260306-wave3/report.md b/notes/perf/20260306-wave3/report.md
index 188c675..d6be220 100644
--- a/notes/perf/20260306-wave3/report.md
+++ b/notes/perf/20260306-wave3/report.md
@@ -120,3 +120,44 @@ Using the Wave 3 artifacts in this directory:
   - a new eigensolver trade that improves diffusion eigenbasis without shifting too much cost back into diffusion build,
   - a mathematically exact reduction of the remaining scalar spectrum/reconstruction overhead,
   - or another order-preserving data-oriented rewrite outside diffusion geometry like the accepted DEC face-incidence pass.
+
+## Post-PR Continuation
+
+### Kept Since The PR Was Opened
+- `cf653c4` `ci(perf): add diffgeo coverage to smoke and deep reports`
+- `59336a7` `perf(dec): specialize curvature traversal for DEC storage`
+
+### Current Additional Gains
+- DEC curvature:
+  - `bench_curvature_kernel/400`: `888653 ns -> 816836 ns` CPU (`-8.08%`)
+  - `bench_geometry` `Grid 1000x1000` curvature: `5.716 ms -> 5.358 ms` (`-6.26%`)
+  - `bench_geometry` `Grid 1000x1000` structure: `14.516 ms -> 13.661 ms` (`-5.89%`)
+
+### New Rejections
+- Weighted coordinate seed for the large-basis symmetric eigensolve.
+- CPU Markov multi-step copy-avoidance rewrite.
+- Early serial short-circuit in `parallel_for_index`.
+
+### Current Read Of The Remaining Work
+- The large sparse diffusion eigensolve is still the biggest shared runtime cost.
+- The geometry/DEC path still has room for order-preserving data-oriented rewrites, but the wins are now single-digit rather than order-of-magnitude.
+- GitHub perf smoke/deep will now surface `bench_diffgeo` even when the baseline branch predates that benchmark target.
+
+## Continuation Outcome
+
+### Additional Rejections After The DEC Curvature Pass
+- Hoisted/parallel CPU Markov repeated-step helper: improved one large microbenchmark but regressed both diffusion pipeline guards.
+- Parallel `carre_du_champ` row sweeps: helped some hodge-side phases but appears unsafe under diffgeo due to nested worker-pool behavior.
+- Scalarized ambient reconstruction in diffgeo/hodge apps and benches: regressed both the diffgeo benchmark guard and the diffgeo app wall-time guard.
+- Cached backend/thread env lookups in `parallel_for_index`: regressed the 64-mode spectral/hodge/diffgeo guard path.
+
+### New Profile Artifact
+- Current diffgeo pipeline sample:
+  - `notes/perf/20260306-wave3/profiles/20260307-154012-diffgeo-pipeline/sample.txt`
+- Readout:
+  - The large sparse diffusion eigensolve is still the dominant remaining shared cost.
+  - The next most likely wins require a deeper solver/orthogonalization change rather than another local cache or loop rewrite.
+
+### Stop Condition
+- This continuation hit multiple consecutive rejected fronts after the accepted DEC curvature pass.
+- The remaining plausible wins are now mostly in the large-basis eigensolver path, where the recent profiler-guided experiments did not clear the guard thresholds.