diff --git a/.github/workflows/perf-deep.yml b/.github/workflows/perf-deep.yml
index 4c9b9f9..5da8e7d 100644
--- a/.github/workflows/perf-deep.yml
+++ b/.github/workflows/perf-deep.yml
@@ -55,7 +55,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake \
             -DVCPKG_TARGET_TRIPLET=x64-linux
-          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines
+          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines bench_diffgeo
 
       - name: Run head deep benchmarks
         shell: bash
@@ -78,6 +78,13 @@ jobs:
             --benchmark_out=artifacts/perf/head/bench_pipelines_head.json \
             --benchmark_out_format=json > artifacts/perf/head/bench_pipelines_head.txt
 
+          ./build-release/bench_diffgeo \
+            --benchmark_min_time=0.2s \
+            --benchmark_repetitions=10 \
+            --benchmark_report_aggregates_only=true \
+            --benchmark_out=artifacts/perf/head/bench_diffgeo_head.json \
+            --benchmark_out_format=json > artifacts/perf/head/bench_diffgeo_head.txt
+
       - name: Build and run baseline deep benchmarks
         shell: bash
         run: |
@@ -114,6 +121,17 @@ jobs:
             echo "bench_pipelines target not available on baseline $BASELINE_SHA" > artifacts/perf/base/bench_pipelines_base.txt
           fi
 
+          if cmake --build baseline/build-release --parallel --target bench_diffgeo; then
+            ./baseline/build-release/bench_diffgeo \
+              --benchmark_min_time=0.2s \
+              --benchmark_repetitions=10 \
+              --benchmark_report_aggregates_only=true \
+              --benchmark_out=artifacts/perf/base/bench_diffgeo_base.json \
+              --benchmark_out_format=json > artifacts/perf/base/bench_diffgeo_base.txt
+          else
+            echo "bench_diffgeo target not available on baseline $BASELINE_SHA" > artifacts/perf/base/bench_diffgeo_base.txt
+          fi
+
       - name: Compare benchmark deltas
         shell: bash
         run: |
@@ -141,6 +159,24 @@ jobs:
             --output-markdown artifacts/perf/reports/bench_pipelines_deep.md \
             --output-json artifacts/perf/reports/bench_pipelines_deep.json
 
+          if [[ -s artifacts/perf/base/bench_diffgeo_base.json ]]; then
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.json \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "$BASELINE_SHA" \
+              --label "Deep run: bench_diffgeo (baseline $BASELINE_SHA)" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_deep.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_deep.json
+          else
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.txt \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "$BASELINE_SHA" \
+              --label "Deep run: bench_diffgeo (baseline $BASELINE_SHA)" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_deep.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_deep.json
+          fi
+
           {
             echo "# Perf Deep Report"
             echo
@@ -151,6 +187,8 @@ jobs:
             cat artifacts/perf/reports/bench_dod_deep.md
             echo
             cat artifacts/perf/reports/bench_pipelines_deep.md
+            echo
+            cat artifacts/perf/reports/bench_diffgeo_deep.md
           } > artifacts/perf/reports/deep-summary.md
 
       - name: Build compact CSV summary
@@ -162,7 +200,7 @@ jobs:
 
           out = Path("artifacts/perf/reports/deep-summary.csv")
           rows = ["suite,benchmark,baseline_ns,current_ns,delta_pct,status"]
-          for suite in ("bench_geometry_deep", "bench_dod_deep", "bench_pipelines_deep"):
+          for suite in ("bench_geometry_deep", "bench_dod_deep", "bench_pipelines_deep", "bench_diffgeo_deep"):
             path = Path(f"artifacts/perf/reports/{suite}.json")
             if not path.exists():
               continue
diff --git a/.github/workflows/perf-smoke.yml b/.github/workflows/perf-smoke.yml
index cbe3135..d1d62a6 100644
--- a/.github/workflows/perf-smoke.yml
+++ b/.github/workflows/perf-smoke.yml
@@ -92,7 +92,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake \
             -DVCPKG_TARGET_TRIPLET=x64-linux
-          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines
+          cmake --build build-release --parallel --target bench_geometry bench_dod bench_pipelines bench_diffgeo
 
       - name: Run head smoke benchmarks
         shell: bash
@@ -117,6 +117,14 @@ jobs:
             --benchmark_out=artifacts/perf/head/bench_pipelines_head.json \
             --benchmark_out_format=json > artifacts/perf/head/bench_pipelines_head.txt
 
+          ./build-release/bench_diffgeo \
+            --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/1000/50/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/1000/50/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' \
+            --benchmark_min_time=0.05s \
+            --benchmark_repetitions=5 \
+            --benchmark_report_aggregates_only=true \
+            --benchmark_out=artifacts/perf/head/bench_diffgeo_head.json \
+            --benchmark_out_format=json > artifacts/perf/head/bench_diffgeo_head.txt
+
       - name: Build and run baseline smoke benchmarks
         shell: bash
         run: |
@@ -155,6 +163,18 @@ jobs:
             echo "bench_pipelines target not available on baseline ${{ steps.baseline.outputs.base_sha }}" > artifacts/perf/base/bench_pipelines_base.txt
           fi
 
+          if cmake --build baseline/build-release --parallel --target bench_diffgeo; then
+            ./baseline/build-release/bench_diffgeo \
+              --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/1000/50/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/1000/50/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' \
+              --benchmark_min_time=0.05s \
+              --benchmark_repetitions=5 \
+              --benchmark_report_aggregates_only=true \
+              --benchmark_out=artifacts/perf/base/bench_diffgeo_base.json \
+              --benchmark_out_format=json > artifacts/perf/base/bench_diffgeo_base.txt
+          else
+            echo "bench_diffgeo target not available on baseline ${{ steps.baseline.outputs.base_sha }}" > artifacts/perf/base/bench_diffgeo_base.txt
+          fi
+
       - name: Compare benchmark deltas
         shell: bash
         run: |
@@ -216,6 +236,24 @@ jobs:
               > artifacts/perf/reports/bench_pipelines_smoke.json
           fi
 
+          if [[ -s artifacts/perf/base/bench_diffgeo_base.json ]]; then
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.json \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "${{ steps.baseline.outputs.base_sha }}" \
+              --label "PR smoke: bench_diffgeo (baseline ${{ steps.baseline.outputs.base_sha }})" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_smoke.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_smoke.json
+          else
+            python3 scripts/perf/compare_against_main.py \
+              --baseline artifacts/perf/base/bench_diffgeo_base.txt \
+              --current artifacts/perf/head/bench_diffgeo_head.json \
+              --baseline-commit "${{ steps.baseline.outputs.base_sha }}" \
+              --label "PR smoke: bench_diffgeo (baseline ${{ steps.baseline.outputs.base_sha }})" \
+              --output-markdown artifacts/perf/reports/bench_diffgeo_smoke.md \
+              --output-json artifacts/perf/reports/bench_diffgeo_smoke.json
+          fi
+
           {
             echo "# Perf Smoke Report"
             echo
@@ -229,6 +267,8 @@ jobs:
             cat artifacts/perf/reports/bench_dod_smoke.md
             echo
             cat artifacts/perf/reports/bench_pipelines_smoke.md
+            echo
+            cat artifacts/perf/reports/bench_diffgeo_smoke.md
           } > artifacts/perf/reports/smoke-summary.md
 
       - name: Publish job summary
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de468bd..3bc8265 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,9 @@ target_link_libraries(bench_dod PRIVATE igneous benchmark::benchmark)
 add_executable(bench_pipelines benches/bench_pipelines.cpp)
 target_link_libraries(bench_pipelines PRIVATE igneous benchmark::benchmark)
 
+add_executable(bench_diffgeo benches/bench_diffgeo.cpp)
+target_link_libraries(bench_diffgeo PRIVATE igneous benchmark::benchmark)
+
 add_executable(igneous-mesh src/main_mesh.cpp)
 target_link_libraries(igneous-mesh PRIVATE igneous fmt::fmt)
 
diff --git a/benches/bench_diffgeo.cpp b/benches/bench_diffgeo.cpp
new file mode 100644
index 0000000..eb78991
--- /dev/null
+++ b/benches/bench_diffgeo.cpp
@@ -0,0 +1,452 @@
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Dense>
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <random>
+#include <vector>
+
+#include <igneous/core/algebra.hpp>
+#include <igneous/data/space.hpp>
+#include <igneous/data/structures/diffusion_geometry.hpp>
+#include <igneous/ops/diffusion/forms.hpp>
+#include <igneous/ops/diffusion/hodge.hpp>
+#include <igneous/ops/diffusion/products.hpp>
+#include <igneous/ops/diffusion/spectral.hpp>
+
+using DiffusionMesh = igneous::data::Space<igneous::data::DiffusionGeometry>;
+
+namespace {
+struct BenchEnvSetup {
+  BenchEnvSetup() {
+    setenv("IGNEOUS_BENCH_MODE", "1", 1);
+  }
+} kBenchEnvSetup;
+
+struct DiffgeoConfig {
+  size_t n_points = 0;
+  int n_basis = 0;
+  int k_neighbors = 0;
+  int n_coeff = 0;
+};
+
+struct HarmonicModes {
+  Eigen::VectorXf evals1;
+  Eigen::MatrixXf evecs1;
+  std::vector<int> harmonic1_idx;
+};
+
+DiffgeoConfig config_from_state(const benchmark::State& state) {
+  return {
+      static_cast<size_t>(state.range(0)),
+      static_cast<int>(state.range(1)),
+      static_cast<int>(state.range(2)),
+      static_cast<int>(state.range(3)),
+  };
+}
+
+void generate_torus(DiffusionMesh& mesh, size_t n_points, float R, float r, uint32_t seed = 0) {
+  mesh.clear();
+  mesh.reserve(n_points);
+
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<float> dist(0.0f, 6.283185f);
+
+  for (size_t i = 0; i < n_points; ++i) {
+    const float u = dist(gen);
+    const float v = dist(gen);
+
+    const float x = (R + r * std::cos(v)) * std::cos(u);
+    const float y = (R + r * std::cos(v)) * std::sin(u);
+    const float z = r * std::sin(v);
+    mesh.push_point({x, y, z});
+  }
+}
+
+DiffusionMesh make_torus_mesh(const DiffgeoConfig& cfg) {
+  DiffusionMesh mesh;
+  generate_torus(mesh, cfg.n_points, 2.0f, 1.0f, 0);
+  return mesh;
+}
+
+void build_diffusion_geometry(DiffusionMesh& mesh, int k_neighbors) {
+  mesh.structure.build(
+      {mesh.x_span(), mesh.y_span(), mesh.z_span(), k_neighbors, 8, -0.5f, 0.0f, true});
+}
+
+std::array<std::array<Eigen::VectorXf, 3>, 3>
+build_gamma_data_immersion(const DiffusionMesh& mesh) {
+  std::array<Eigen::VectorXf, 3> data_coords;
+  std::array<Eigen::VectorXf, 3> immersion_coords;
+  igneous::ops::diffusion::fill_data_coordinate_vectors(mesh, data_coords);
+  igneous::ops::diffusion::fill_coordinate_vectors(mesh, immersion_coords);
+
+  std::array<std::array<Eigen::VectorXf, 3>, 3> gamma{};
+  for (int a = 0; a < 3; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      gamma[a][b].resize(static_cast<int>(mesh.num_points()));
+      igneous::ops::diffusion::carre_du_champ(mesh, data_coords[a], immersion_coords[b], 0.0f,
+                                              gamma[a][b]);
+    }
+  }
+  return gamma;
+}
+
+std::vector<igneous::core::Vec3>
+reconstruct_1form_ambient(const DiffusionMesh& mesh, const Eigen::VectorXf& coeffs,
+                          int n_coefficients,
+                          const std::array<std::array<Eigen::VectorXf, 3>, 3>& gamma_data) {
+  const Eigen::MatrixXf pointwise =
+      igneous::ops::diffusion::coefficients_to_pointwise(mesh, coeffs, 1, n_coefficients);
+  std::vector<igneous::core::Vec3> field(mesh.num_points(), {0.0f, 0.0f, 0.0f});
+  if (pointwise.rows() == 0 || pointwise.cols() < 3) {
+    return field;
+  }
+
+  for (size_t i = 0; i < mesh.num_points(); ++i) {
+    const int p = static_cast<int>(i);
+    field[i] = {
+        gamma_data[0][0][p] * pointwise(p, 0) + gamma_data[0][1][p] * pointwise(p, 1) +
+            gamma_data[0][2][p] * pointwise(p, 2),
+        gamma_data[1][0][p] * pointwise(p, 0) + gamma_data[1][1][p] * pointwise(p, 1) +
+            gamma_data[1][2][p] * pointwise(p, 2),
+        gamma_data[2][0][p] * pointwise(p, 0) + gamma_data[2][1][p] * pointwise(p, 1) +
+            gamma_data[2][2][p] * pointwise(p, 2),
+    };
+  }
+
+  return field;
+}
+
+std::vector<igneous::core::Vec3>
+reconstruct_2form_dual_ambient(const DiffusionMesh& mesh, const Eigen::VectorXf& coeffs,
+                               int n_coefficients,
+                               const std::array<std::array<Eigen::VectorXf, 3>, 3>& gamma_data) {
+  const Eigen::MatrixXf pointwise =
+      igneous::ops::diffusion::coefficients_to_pointwise(mesh, coeffs, 2, n_coefficients);
+  std::vector<igneous::core::Vec3> field(mesh.num_points(), {0.0f, 0.0f, 0.0f});
+  if (pointwise.rows() == 0 || pointwise.cols() < 3) {
+    return field;
+  }
+
+  for (size_t i = 0; i < mesh.num_points(); ++i) {
+    const int p = static_cast<int>(i);
+    const float w01 = pointwise(p, 0);
+    const float w02 = pointwise(p, 1);
+    const float w12 = pointwise(p, 2);
+
+    Eigen::Matrix3f W = Eigen::Matrix3f::Zero();
+    W(0, 1) = w01;
+    W(1, 0) = -w01;
+    W(0, 2) = w02;
+    W(2, 0) = -w02;
+    W(1, 2) = w12;
+    W(2, 1) = -w12;
+
+    Eigen::Matrix3f G = Eigen::Matrix3f::Zero();
+    for (int a = 0; a < 3; ++a) {
+      for (int b = 0; b < 3; ++b) {
+        G(a, b) = gamma_data[a][b][p];
+      }
+    }
+
+    const Eigen::Matrix3f ambient = G * W * G.transpose();
+    field[i] = {ambient(1, 2), -ambient(0, 2), ambient(0, 1)};
+  }
+
+  return field;
+}
+
+Eigen::VectorXf pad_1form_coeffs(const Eigen::VectorXf& coeffs, int n_basis, int n_coefficients) {
+  Eigen::VectorXf padded = Eigen::VectorXf::Zero(n_basis * 3);
+  const int copy_n = std::min(n_basis, n_coefficients);
+  for (int i = 0; i < copy_n; ++i) {
+    for (int a = 0; a < 3; ++a) {
+      padded(i * 3 + a) = coeffs(i * 3 + a);
+    }
+  }
+  return padded;
+}
+
+HarmonicModes
+compute_harmonic_1modes(DiffusionMesh& mesh, int n_basis, int n_coeff,
+                        igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh>& forms_ws) {
+  igneous::ops::diffusion::compute_eigenbasis(mesh, n_basis);
+  const Eigen::MatrixXf G1 =
+      igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 1, n_coeff, forms_ws);
+  const Eigen::MatrixXf down1 =
+      igneous::ops::diffusion::compute_down_laplacian_matrix(mesh, 1, n_coeff, forms_ws);
+  const Eigen::MatrixXf up1 =
+      igneous::ops::diffusion::compute_up_laplacian_matrix(mesh, 1, n_coeff, forms_ws);
+  const Eigen::MatrixXf L1 = igneous::ops::diffusion::assemble_hodge_laplacian_matrix(up1, down1);
+  auto [evals1, evecs1] = igneous::ops::diffusion::compute_form_spectrum(L1, G1);
+  auto harmonic1_idx = igneous::ops::diffusion::extract_harmonic_mode_indices(evals1, 1e-3f, 3);
+
+  return {
+      std::move(evals1),
+      std::move(evecs1),
+      std::move(harmonic1_idx),
+  };
+}
+
+void bench_diffgeo_pipeline(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+
+  for (auto _ : state) {
+    forms_ws.reset_cache();
+    build_diffusion_geometry(mesh, cfg.k_neighbors);
+    igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+
+    const Eigen::MatrixXf G1 =
+        igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf G1_inv =
+        igneous::ops::diffusion::compute_kform_gram_inverse(mesh, 1, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf D1 =
+        igneous::ops::diffusion::compute_weak_exterior_derivative(mesh, 1, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf down1 =
+        igneous::ops::diffusion::compute_down_laplacian_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf up1 =
+        igneous::ops::diffusion::compute_up_laplacian_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf L1 = igneous::ops::diffusion::assemble_hodge_laplacian_matrix(up1, down1);
+    auto [evals1, evecs1] = igneous::ops::diffusion::compute_form_spectrum(L1, G1);
+
+    if (evals1.size() == 0 || evecs1.cols() == 0) {
+      state.SkipWithError("1-form spectrum failed");
+      break;
+    }
+
+    const auto harmonic1_idx =
+        igneous::ops::diffusion::extract_harmonic_mode_indices(evals1, 1e-3f, 3);
+    const int idx0 = harmonic1_idx.empty() ? 0 : harmonic1_idx[0];
+    const int idx1 = harmonic1_idx.size() > 1 ? harmonic1_idx[1] : idx0;
+
+    const auto gamma_data = build_gamma_data_immersion(mesh);
+    igneous::ops::diffusion::CircularCoordinateWorkspace<DiffusionMesh> circular_ws;
+    const Eigen::VectorXf theta_0 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, pad_1form_coeffs(evecs1.col(idx0), mesh.structure.eigen_basis.cols(), cfg.n_coeff),
+        0.0f, 1.0f, 0, nullptr, circular_ws);
+    const Eigen::VectorXf theta_1 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, pad_1form_coeffs(evecs1.col(idx1), mesh.structure.eigen_basis.cols(), cfg.n_coeff),
+        0.0f, 1.0f, 1, nullptr, circular_ws);
+
+    const Eigen::MatrixXf G2 =
+        igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 2, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf down2 =
+        igneous::ops::diffusion::assemble_down_laplacian_matrix_from_previous(D1, G1_inv);
+    const Eigen::MatrixXf up2 =
+        igneous::ops::diffusion::compute_up_laplacian_matrix(mesh, 2, cfg.n_coeff, forms_ws);
+    const Eigen::MatrixXf L2 = igneous::ops::diffusion::assemble_hodge_laplacian_matrix(up2, down2);
+    auto [evals2, evecs2] = igneous::ops::diffusion::compute_form_spectrum(L2, G2);
+
+    if (evals2.size() == 0 || evecs2.cols() == 0) {
+      state.SkipWithError("2-form spectrum failed");
+      break;
+    }
+
+    const auto harmonic2_idx =
+        igneous::ops::diffusion::extract_harmonic_mode_indices(evals2, 1e-3f, 3);
+    Eigen::VectorXf wedge_coeffs = Eigen::VectorXf::Zero(cfg.n_coeff * 3);
+    if (!harmonic1_idx.empty()) {
+      const int rhs_col = harmonic1_idx.size() > 1 ? harmonic1_idx[1] : harmonic1_idx[0];
+      wedge_coeffs = igneous::ops::diffusion::compute_wedge_product_coeffs(
+          mesh, evecs1.col(harmonic1_idx[0]), 1, evecs1.col(rhs_col), 1, cfg.n_coeff, forms_ws);
+    }
+
+    Eigen::VectorXf harmonic1 = evecs1.col(idx0);
+    Eigen::VectorXf harmonic2 = evecs2.col(harmonic2_idx.empty() ? 0 : harmonic2_idx[0]);
+    auto field1 = reconstruct_1form_ambient(mesh, harmonic1, cfg.n_coeff, gamma_data);
+    auto field2 = reconstruct_2form_dual_ambient(mesh, harmonic2, cfg.n_coeff, gamma_data);
+    auto wedge = reconstruct_2form_dual_ambient(mesh, wedge_coeffs, cfg.n_coeff, gamma_data);
+
+    benchmark::DoNotOptimize(theta_0.data());
+    benchmark::DoNotOptimize(theta_1.data());
+    benchmark::DoNotOptimize(field1.data());
+    benchmark::DoNotOptimize(field2.data());
+    benchmark::DoNotOptimize(wedge.data());
+  }
+}
+
+void bench_diffgeo_phase_structure_build(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+
+  for (auto _ : state) {
+    build_diffusion_geometry(mesh, cfg.k_neighbors);
+    benchmark::DoNotOptimize(mesh.structure.markov_values.size());
+  }
+}
+
+void bench_diffgeo_phase_eigenbasis(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+
+  for (auto _ : state) {
+    igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+    benchmark::DoNotOptimize(mesh.structure.eigen_basis.data());
+  }
+}
+
+void bench_diffgeo_phase_k1_gram(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+
+  for (auto _ : state) {
+    const Eigen::MatrixXf G1 =
+        igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+    benchmark::DoNotOptimize(G1.data());
+  }
+}
+
+void bench_diffgeo_phase_k1_down(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+
+  for (auto _ : state) {
+    const Eigen::MatrixXf down1 =
+        igneous::ops::diffusion::compute_down_laplacian_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+    benchmark::DoNotOptimize(down1.data());
+  }
+}
+
+void bench_diffgeo_phase_k1_up(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+
+  for (auto _ : state) {
+    const Eigen::MatrixXf up1 =
+        igneous::ops::diffusion::compute_up_laplacian_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+    benchmark::DoNotOptimize(up1.data());
+  }
+}
+
+void bench_diffgeo_phase_circular(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+  const HarmonicModes modes = compute_harmonic_1modes(mesh, cfg.n_basis, cfg.n_coeff, forms_ws);
+  if (modes.evals1.size() == 0 || modes.evecs1.cols() == 0) {
+    state.SkipWithError("1-form harmonic modes unavailable");
+    return;
+  }
+
+  const int idx0 = modes.harmonic1_idx.empty() ? 0 : modes.harmonic1_idx[0];
+  const int idx1 = modes.harmonic1_idx.size() > 1 ? modes.harmonic1_idx[1] : idx0;
+  const Eigen::VectorXf alpha0 =
+      pad_1form_coeffs(modes.evecs1.col(idx0), mesh.structure.eigen_basis.cols(), cfg.n_coeff);
+  const Eigen::VectorXf alpha1 =
+      pad_1form_coeffs(modes.evecs1.col(idx1), mesh.structure.eigen_basis.cols(), cfg.n_coeff);
+  igneous::ops::diffusion::CircularCoordinateWorkspace<DiffusionMesh> circular_ws;
+
+  for (auto _ : state) {
+    const Eigen::VectorXf theta_0 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, alpha0, 0.0f, 1.0f, 0, nullptr, circular_ws);
+    const Eigen::VectorXf theta_1 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, alpha1, 0.0f, 1.0f, 1, nullptr, circular_ws);
+    benchmark::DoNotOptimize(theta_0.data());
+    benchmark::DoNotOptimize(theta_1.data());
+  }
+}
+
+void bench_diffgeo_phase_k2_gram(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+
+  for (auto _ : state) {
+    const Eigen::MatrixXf G2 =
+        igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 2, cfg.n_coeff, forms_ws);
+    benchmark::DoNotOptimize(G2.data());
+  }
+}
+
+void bench_diffgeo_phase_k2_down(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+  const Eigen::MatrixXf G1 =
+      igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 1, cfg.n_coeff, forms_ws);
+  benchmark::DoNotOptimize(G1.data());
+  const Eigen::MatrixXf G1_inv =
+      igneous::ops::diffusion::compute_kform_gram_inverse(mesh, 1, cfg.n_coeff, forms_ws);
+  const Eigen::MatrixXf D1 =
+      igneous::ops::diffusion::compute_weak_exterior_derivative(mesh, 1, cfg.n_coeff, forms_ws);
+
+  for (auto _ : state) {
+    const Eigen::MatrixXf down2 =
+        igneous::ops::diffusion::assemble_down_laplacian_matrix_from_previous(D1, G1_inv);
+    benchmark::DoNotOptimize(down2.data());
+  }
+}
+
+void bench_diffgeo_phase_k2_up(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::compute_eigenbasis(mesh, cfg.n_basis);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+  const Eigen::MatrixXf warmup =
+      igneous::ops::diffusion::compute_kform_gram_matrix(mesh, 2, cfg.n_coeff, forms_ws);
+  benchmark::DoNotOptimize(warmup.data());
+
+  for (auto _ : state) {
+    const Eigen::MatrixXf up2 =
+        igneous::ops::diffusion::compute_up_laplacian_matrix(mesh, 2, cfg.n_coeff, forms_ws);
+    benchmark::DoNotOptimize(up2.data());
+  }
+}
+
+void bench_diffgeo_phase_wedge(benchmark::State& state) {
+  const DiffgeoConfig cfg = config_from_state(state);
+  DiffusionMesh mesh = make_torus_mesh(cfg);
+  build_diffusion_geometry(mesh, cfg.k_neighbors);
+  igneous::ops::diffusion::DiffusionFormWorkspace<DiffusionMesh> forms_ws;
+  const HarmonicModes modes = compute_harmonic_1modes(mesh, cfg.n_basis, cfg.n_coeff, forms_ws);
+  if (modes.evals1.size() == 0 || modes.evecs1.cols() == 0) {
+    state.SkipWithError("1-form harmonic modes unavailable");
+    return;
+  }
+
+  const int lhs_col = modes.harmonic1_idx.empty() ? 0 : modes.harmonic1_idx[0];
+  const int rhs_col = modes.harmonic1_idx.size() > 1 ? modes.harmonic1_idx[1] : lhs_col;
+
+  for (auto _ : state) {
+    const Eigen::VectorXf wedge = igneous::ops::diffusion::compute_wedge_product_coeffs(
+        mesh, modes.evecs1.col(lhs_col), 1, modes.evecs1.col(rhs_col), 1, cfg.n_coeff, forms_ws);
+    benchmark::DoNotOptimize(wedge.data());
+  }
+}
+} // namespace
+
+BENCHMARK(bench_diffgeo_pipeline)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_structure_build)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_eigenbasis)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_k1_gram)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_k1_down)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_k1_up)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_circular)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_k2_gram)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_k2_down)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_k2_up)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+BENCHMARK(bench_diffgeo_phase_wedge)->Args({1000, 50, 32, 32})->Args({4000, 64, 32, 32});
+
+BENCHMARK_MAIN();
diff --git a/benches/bench_pipelines.cpp b/benches/bench_pipelines.cpp
index 4ade70d..9cf23ec 100644
--- a/benches/bench_pipelines.cpp
+++ b/benches/bench_pipelines.cpp
@@ -155,6 +155,7 @@ void bench_pipeline_hodge_main(benchmark::State& state) {
   for (auto _ : state) {
     build_diffusion_geometry(mesh, kBandwidth, kNeighbors);
     igneous::ops::diffusion::compute_eigenbasis(mesh, kBasis);
+    hodge_ws.reset_cache();
 
     const Eigen::MatrixXf G =
         igneous::ops::diffusion::compute_1form_gram_matrix(mesh, kBandwidth, geom_ws);
@@ -164,11 +165,12 @@ void bench_pipeline_hodge_main(benchmark::State& state) {
         igneous::ops::diffusion::compute_curl_energy_matrix(mesh, kBandwidth, hodge_ws);
     const Eigen::MatrixXf L = igneous::ops::diffusion::compute_hodge_laplacian_matrix(D, E);
     auto [evals, evecs] = igneous::ops::diffusion::compute_hodge_spectrum(L, G);
+    igneous::ops::diffusion::CircularCoordinateWorkspace<DiffusionMesh> circular_ws;
 
-    const Eigen::VectorXf theta_0 =
-        igneous::ops::diffusion::compute_circular_coordinates(mesh, evecs.col(0), kBandwidth);
-    const Eigen::VectorXf theta_1 =
-        igneous::ops::diffusion::compute_circular_coordinates(mesh, evecs.col(1), kBandwidth);
+    const Eigen::VectorXf theta_0 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, evecs.col(0), kBandwidth, 1.0f, 0, nullptr, circular_ws);
+    const Eigen::VectorXf theta_1 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, evecs.col(1), kBandwidth, 1.0f, 0, nullptr, circular_ws);
 
     benchmark::DoNotOptimize(evals.data());
     benchmark::DoNotOptimize(theta_0.data());
@@ -298,12 +300,13 @@ void bench_hodge_phase_circular(benchmark::State& state) {
       igneous::ops::diffusion::compute_curl_energy_matrix(mesh, kBandwidth, hodge_ws);
   const Eigen::MatrixXf L = igneous::ops::diffusion::compute_hodge_laplacian_matrix(D, E);
   auto [evals, evecs] = igneous::ops::diffusion::compute_hodge_spectrum(L, G);
+  igneous::ops::diffusion::CircularCoordinateWorkspace<DiffusionMesh> circular_ws;
 
   for (auto _ : state) {
-    const Eigen::VectorXf theta_0 =
-        igneous::ops::diffusion::compute_circular_coordinates(mesh, evecs.col(0), kBandwidth);
-    const Eigen::VectorXf theta_1 =
-        igneous::ops::diffusion::compute_circular_coordinates(mesh, evecs.col(1), kBandwidth);
+    const Eigen::VectorXf theta_0 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, evecs.col(0), kBandwidth, 1.0f, 0, nullptr, circular_ws);
+    const Eigen::VectorXf theta_1 = igneous::ops::diffusion::compute_circular_coordinates(
+        mesh, evecs.col(1), kBandwidth, 1.0f, 0, nullptr, circular_ws);
     benchmark::DoNotOptimize(theta_0.data());
     benchmark::DoNotOptimize(theta_1.data());
     benchmark::DoNotOptimize(evals.data());
diff --git a/include/igneous/data/structures/diffusion_geometry.hpp b/include/igneous/data/structures/diffusion_geometry.hpp
index 5b8bd50..668f0d5 100644
--- a/include/igneous/data/structures/diffusion_geometry.hpp
+++ b/include/igneous/data/structures/diffusion_geometry.hpp
@@ -88,6 +88,10 @@ struct DiffusionGeometry {
   std::vector<int> symmetric_col_indices;
   /// \brief CSR values for symmetric kernel matrix.
   std::vector<float> symmetric_values;
+  /// \brief Cached CSR values for the normalized symmetric kernel.
+  std::vector<float> symmetric_normalized_values;
+  /// \brief Cached inverse square-root row sums for normalized spectral solves.
+  Eigen::VectorXf symmetric_inv_sqrt_rows;
 
   /**
    * \brief Number of points represented by the CSR graph.
@@ -128,6 +132,8 @@ struct DiffusionGeometry {
     symmetric_row_offsets.clear();
     symmetric_col_indices.clear();
     symmetric_values.clear();
+    symmetric_normalized_values.clear();
+    symmetric_inv_sqrt_rows.resize(0);
   }
 
   /// \brief nanoflann adaptor over the SoA coordinate spans.
@@ -193,14 +199,18 @@ struct DiffusionGeometry {
     std::vector<float> averages(static_cast<size_t>(e_count), 0.0f);
     const float inv_nk = 1.0f / static_cast<float>(std::max(1, n * k));
 
-    for (int e = 0; e < e_count; ++e) {
-      const float epsilon = std::max(epsilons[static_cast<size_t>(e)], 1e-12f);
-      float sum = 0.0f;
-      for (float value : entries) {
-        sum += std::exp(-value / epsilon);
-      }
-      averages[static_cast<size_t>(e)] = sum * inv_nk;
-    }
+    core::parallel_for_index(
+        0, e_count,
+        [&](int e) {
+          const float epsilon = std::max(epsilons[static_cast<size_t>(e)], 1e-12f);
+          const float inv_epsilon = 1.0f / epsilon;
+          float sum = 0.0f;
+          for (float value : entries) {
+            sum += std::exp(-value * inv_epsilon);
+          }
+          averages[static_cast<size_t>(e)] = sum * inv_nk;
+        },
+        8);
 
     int best_idx = 0;
     float best_criterion = -std::numeric_limits<float>::infinity();
@@ -243,16 +253,20 @@ struct DiffusionGeometry {
 
     const int end = std::max(2, std::min(knn_bandwidth, k));
     const int count = std::max(1, end - 1);
-    for (int i = 0; i < n; ++i) {
-      const size_t base = static_cast<size_t>(i) * static_cast<size_t>(k);
-      float accum = 0.0f;
-      for (int idx = 1; idx < end; ++idx) {
-        const float d = nbr_distances[base + static_cast<size_t>(idx)];
-        accum += d * d;
-      }
-      const float rms = std::sqrt(accum / static_cast<float>(count));
-      bandwidths_out[i] = std::max(rms, 1e-8f);
-    }
+
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * static_cast<size_t>(k);
+          float accum = 0.0f;
+          for (int idx = 1; idx < end; ++idx) {
+            const float d = nbr_distances[base + static_cast<size_t>(idx)];
+            accum += d * d;
+          }
+          const float rms = std::sqrt(accum / static_cast<float>(count));
+          bandwidths_out[i] = std::max(rms, 1e-8f);
+        },
+        256);
   }
 
   /**
@@ -327,82 +341,97 @@ struct DiffusionGeometry {
     compute_local_bandwidths(knn_distances, n, k_neighbors, knn_bandwidth, bandwidths_A);
 
     std::vector<float> kernel_entries_A(row_capacity, 0.0f);
-    for (int i = 0; i < n; ++i) {
-      const size_t base = static_cast<size_t>(i) * row_stride;
-      const float bw_i = std::max(bandwidths_A[i], 1e-8f);
-      for (int k = 0; k < k_neighbors; ++k) {
-        const int j = knn_indices[base + static_cast<size_t>(k)];
-        const float bw_j = std::max(bandwidths_A[j], 1e-8f);
-        const float dist = knn_distances[base + static_cast<size_t>(k)];
-        kernel_entries_A[base + static_cast<size_t>(k)] = (dist * dist) / (bw_j * bw_i);
-      }
-    }
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * row_stride;
+          const float bw_i = std::max(bandwidths_A[i], 1e-8f);
+          for (int k = 0; k < k_neighbors; ++k) {
+            const int j = knn_indices[base + static_cast<size_t>(k)];
+            const float bw_j = std::max(bandwidths_A[j], 1e-8f);
+            const float dist = knn_distances[base + static_cast<size_t>(k)];
+            kernel_entries_A[base + static_cast<size_t>(k)] = (dist * dist) / (bw_j * bw_i);
+          }
+        },
+        64);
 
     const std::vector<float> epsilons = build_epsilons();
     const auto [epsilon_A, dim_A] = tune_kernel(kernel_entries_A, n, k_neighbors, epsilons);
+    const float safe_epsilon_A = std::max(epsilon_A, 1e-12f);
+    const float inv_epsilon_A = 1.0f / safe_epsilon_A;
 
     constexpr float kPi = 3.14159265358979323846f;
-    const float kernel_A_norm = std::pow(kPi * std::max(epsilon_A, 1e-12f), dim_A * 0.5f);
+    const float kernel_A_norm = std::pow(kPi * safe_epsilon_A, dim_A * 0.5f);
     Eigen::VectorXf density_estimate_A(n);
     density_estimate_A.setZero();
-    for (int i = 0; i < n; ++i) {
-      const size_t base = static_cast<size_t>(i) * row_stride;
-      float row_sum = 0.0f;
-      for (int k = 0; k < k_neighbors; ++k) {
-        row_sum += std::exp(-kernel_entries_A[base + static_cast<size_t>(k)] /
-                            std::max(epsilon_A, 1e-12f));
-      }
-      const float bw = std::max(bandwidths_A[i], 1e-8f);
-      density_estimate_A[i] = (row_sum / std::max(kernel_A_norm, 1e-12f)) /
-                              (static_cast<float>(n) * std::pow(bw, dim_A));
-      density_estimate_A[i] = std::max(density_estimate_A[i], 1e-12f);
-    }
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * row_stride;
+          float row_sum = 0.0f;
+          for (int k = 0; k < k_neighbors; ++k) {
+            row_sum += std::exp(-kernel_entries_A[base + static_cast<size_t>(k)] * inv_epsilon_A);
+          }
+          const float bw = std::max(bandwidths_A[i], 1e-8f);
+          density_estimate_A[i] = (row_sum / std::max(kernel_A_norm, 1e-12f)) /
+                                  (static_cast<float>(n) * std::pow(bw, dim_A));
+          density_estimate_A[i] = std::max(density_estimate_A[i], 1e-12f);
+        },
+        64);
 
     Eigen::VectorXf bandwidths_B(n);
-    for (int i = 0; i < n; ++i) {
-      bandwidths_B[i] = std::pow(density_estimate_A[i], input.bandwidth_variability);
-    }
     std::vector<float> bw_values(static_cast<size_t>(n));
-    for (int i = 0; i < n; ++i) {
-      bw_values[static_cast<size_t>(i)] = bandwidths_B[i];
-    }
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const float bandwidth = std::pow(density_estimate_A[i], input.bandwidth_variability);
+          bandwidths_B[i] = bandwidth;
+          bw_values[static_cast<size_t>(i)] = bandwidth;
+        },
+        256);
     const float bw_median = std::max(vector_median(std::move(bw_values)), 1e-8f);
     bandwidths_B /= bw_median;
 
     std::vector<float> kernel_entries_B(row_capacity, 0.0f);
-    for (int i = 0; i < n; ++i) {
-      const size_t base = static_cast<size_t>(i) * row_stride;
-      const float bw_i = std::max(bandwidths_B[i], 1e-8f);
-      for (int k = 0; k < k_neighbors; ++k) {
-        const int j = knn_indices[base + static_cast<size_t>(k)];
-        const float bw_j = std::max(bandwidths_B[j], 1e-8f);
-        const float dist = knn_distances[base + static_cast<size_t>(k)];
-        kernel_entries_B[base + static_cast<size_t>(k)] = (dist * dist) / (bw_j * bw_i);
-      }
-    }
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * row_stride;
+          const float bw_i = std::max(bandwidths_B[i], 1e-8f);
+          for (int k = 0; k < k_neighbors; ++k) {
+            const int j = knn_indices[base + static_cast<size_t>(k)];
+            const float bw_j = std::max(bandwidths_B[j], 1e-8f);
+            const float dist = knn_distances[base + static_cast<size_t>(k)];
+            kernel_entries_B[base + static_cast<size_t>(k)] = (dist * dist) / (bw_j * bw_i);
+          }
+        },
+        64);
 
     const auto [epsilon_B, dim_B] = tune_kernel(kernel_entries_B, n, k_neighbors, epsilons);
+    const float safe_epsilon_B = std::max(epsilon_B, 1e-12f);
+    const float inv_epsilon_B = 1.0f / safe_epsilon_B;
 
     Eigen::VectorXf density_estimate_B(n);
     density_estimate_B.setZero();
-    for (int i = 0; i < n; ++i) {
-      const size_t base = static_cast<size_t>(i) * row_stride;
-      float row_sum = 0.0f;
-      for (int k = 0; k < k_neighbors; ++k) {
-        row_sum += std::exp(-kernel_entries_B[base + static_cast<size_t>(k)] /
-                            std::max(epsilon_B, 1e-12f));
-      }
-      const float bw = std::max(bandwidths_B[i], 1e-8f);
-      density_estimate_B[i] = row_sum / std::pow(bw, dim_B);
-      density_estimate_B[i] = std::max(density_estimate_B[i], 1e-12f);
-    }
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * row_stride;
+          float row_sum = 0.0f;
+          for (int k = 0; k < k_neighbors; ++k) {
+            row_sum += std::exp(-kernel_entries_B[base + static_cast<size_t>(k)] * inv_epsilon_B);
+          }
+          const float bw = std::max(bandwidths_B[i], 1e-8f);
+          density_estimate_B[i] = row_sum / std::pow(bw, dim_B);
+          density_estimate_B[i] = std::max(density_estimate_B[i], 1e-12f);
+        },
+        64);
 
     const float alpha =
         1.0f - (input.c * 0.5f) + input.bandwidth_variability * ((dim_B * 0.5f) + 1.0f);
     Eigen::VectorXf density_alpha(n);
-    for (int i = 0; i < n; ++i) {
-      density_alpha[i] = std::pow(density_estimate_B[i], -alpha);
-    }
+    core::parallel_for_index(
+        0, n, [&](int i) { density_alpha[i] = std::pow(density_estimate_B[i], -alpha); }, 256);
 
     markov_row_offsets.resize(static_cast<size_t>(n) + 1);
     for (int i = 0; i <= n; ++i) {
@@ -410,60 +439,81 @@ struct DiffusionGeometry {
     }
     markov_col_indices = knn_indices;
     markov_values.assign(row_capacity, 0.0f);
+    local_bandwidths.resize(n);
+    immersion_coords.resize(n, 3);
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * row_stride;
+          const float bw_i = std::max(bandwidths_B[i], 1e-8f);
+          local_bandwidths[i] = (safe_epsilon_B * bw_i * bw_i) / 4.0f;
+
+          float row_sum = 0.0f;
+          for (int k = 0; k < k_neighbors; ++k) {
+            const int j = knn_indices[base + static_cast<size_t>(k)];
+            const float kernel_B =
+                std::exp(-kernel_entries_B[base + static_cast<size_t>(k)] * inv_epsilon_B);
+            const float val = kernel_B * density_alpha[i] * density_alpha[j];
+            markov_values[base + static_cast<size_t>(k)] = val;
+            row_sum += val;
+          }
 
-    for (int i = 0; i < n; ++i) {
-      const size_t base = static_cast<size_t>(i) * row_stride;
-      float row_sum = 0.0f;
-      for (int k = 0; k < k_neighbors; ++k) {
-        const int j = knn_indices[base + static_cast<size_t>(k)];
-        const float kernel_B = std::exp(-kernel_entries_B[base + static_cast<size_t>(k)] /
-                                        std::max(epsilon_B, 1e-12f));
-        const float val = kernel_B * density_alpha[i] * density_alpha[j];
-        markov_values[base + static_cast<size_t>(k)] = val;
-        row_sum += val;
-      }
-
-      if (row_sum <= 1e-30f) {
-        int self_idx = -1;
-        for (int k = 0; k < k_neighbors; ++k) {
-          if (knn_indices[base + static_cast<size_t>(k)] == i) {
-            self_idx = k;
-            break;
+          if (row_sum <= 1e-30f) {
+            int self_idx = -1;
+            for (int k = 0; k < k_neighbors; ++k) {
+              if (knn_indices[base + static_cast<size_t>(k)] == i) {
+                self_idx = k;
+                break;
+              }
+            }
+            if (self_idx < 0) {
+              self_idx = 0;
+            }
+            for (int k = 0; k < k_neighbors; ++k) {
+              const size_t idx = base + static_cast<size_t>(k);
+              const float value = (k == self_idx) ? 1.0f : 0.0f;
+              markov_values[idx] = value;
+              knn_kernel[idx] = value;
+            }
+            immersion_coords(i, 0) = input.x[static_cast<size_t>(i)];
+            immersion_coords(i, 1) = input.y[static_cast<size_t>(i)];
+            immersion_coords(i, 2) = input.z[static_cast<size_t>(i)];
+            return;
           }
-        }
-        if (self_idx < 0) {
-          self_idx = 0;
-        }
-        for (int k = 0; k < k_neighbors; ++k) {
-          markov_values[base + static_cast<size_t>(k)] = (k == self_idx) ? 1.0f : 0.0f;
-        }
-        knn_kernel[base + static_cast<size_t>(self_idx)] = 1.0f;
-        for (int k = 0; k < k_neighbors; ++k) {
-          if (k != self_idx) {
-            knn_kernel[base + static_cast<size_t>(k)] = 0.0f;
+
+          const float inv_row_sum = 1.0f / row_sum;
+          float rx = 0.0f;
+          float ry = 0.0f;
+          float rz = 0.0f;
+          for (int k = 0; k < k_neighbors; ++k) {
+            const size_t idx = base + static_cast<size_t>(k);
+            const int j = knn_indices[idx];
+            const float normalized = markov_values[idx] * inv_row_sum;
+            markov_values[idx] = normalized;
+            knn_kernel[idx] = normalized;
+            rx += normalized * input.x[static_cast<size_t>(j)];
+            ry += normalized * input.y[static_cast<size_t>(j)];
+            rz += normalized * input.z[static_cast<size_t>(j)];
           }
-        }
-        continue;
-      }
 
-      const float inv_row_sum = 1.0f / row_sum;
-      for (int k = 0; k < k_neighbors; ++k) {
-        const size_t idx = base + static_cast<size_t>(k);
-        markov_values[idx] *= inv_row_sum;
-        knn_kernel[idx] = markov_values[idx];
-      }
-    }
+          immersion_coords(i, 0) = rx;
+          immersion_coords(i, 1) = ry;
+          immersion_coords(i, 2) = rz;
+        },
+        64);
 
-    std::vector<Eigen::Triplet<float>> markov_triplets;
-    markov_triplets.reserve(row_capacity);
-    for (int i = 0; i < n; ++i) {
-      const int begin = markov_row_offsets[static_cast<size_t>(i)];
-      const int end = markov_row_offsets[static_cast<size_t>(i) + 1];
-      for (int idx = begin; idx < end; ++idx) {
-        markov_triplets.emplace_back(i, markov_col_indices[static_cast<size_t>(idx)],
-                                     markov_values[static_cast<size_t>(idx)]);
-      }
-    }
+    std::vector<Eigen::Triplet<float>> markov_triplets(row_capacity);
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          const size_t base = static_cast<size_t>(i) * row_stride;
+          for (int k = 0; k < k_neighbors; ++k) {
+            const size_t idx = base + static_cast<size_t>(k);
+            markov_triplets[idx] =
+                Eigen::Triplet<float>(i, markov_col_indices[idx], markov_values[idx]);
+          }
+        },
+        64);
 
     Eigen::SparseMatrix<float, Eigen::RowMajor> P_row(n, n);
     P_row.setFromTriplets(markov_triplets.begin(), markov_triplets.end());
@@ -476,9 +526,21 @@ struct DiffusionGeometry {
     P_row *= 0.5f;
     P_row.makeCompressed();
 
-    const Eigen::VectorXf ones = Eigen::VectorXf::Ones(n);
-    symmetric_row_sums = P_row * ones;
-    symmetric_row_sums = symmetric_row_sums.array().max(1e-12f).matrix();
+    symmetric_row_sums.resize(n);
+    const int* outer = P_row.outerIndexPtr();
+    const float* values = P_row.valuePtr();
+    core::parallel_for_index(
+        0, n,
+        [&](int i) {
+          float row_sum = 0.0f;
+          const int begin = outer[static_cast<size_t>(i)];
+          const int end = outer[static_cast<size_t>(i) + 1];
+          for (int idx = begin; idx < end; ++idx) {
+            row_sum += values[static_cast<size_t>(idx)];
+          }
+          symmetric_row_sums[i] = std::max(row_sum, 1e-12f);
+        },
+        64);
 
     const float mu_sum = symmetric_row_sums.sum();
     if (mu_sum > 1e-12f) {
@@ -487,39 +549,12 @@ struct DiffusionGeometry {
       mu = Eigen::VectorXf::Constant(n, 1.0f / static_cast<float>(n));
     }
 
-    const int* outer = P_row.outerIndexPtr();
     const int* inner = P_row.innerIndexPtr();
-    const float* values = P_row.valuePtr();
     const int nnz_sym = P_row.nonZeros();
     symmetric_row_offsets.assign(outer, outer + (n + 1));
     symmetric_col_indices.assign(inner, inner + nnz_sym);
     symmetric_values.assign(values, values + nnz_sym);
 
-    local_bandwidths.resize(n);
-    for (int i = 0; i < n; ++i) {
-      const float bw = std::max(bandwidths_B[i], 1e-8f);
-      local_bandwidths[i] = (std::max(epsilon_B, 1e-12f) * bw * bw) / 4.0f;
-    }
-
-    immersion_coords.resize(n, 3);
-    for (int i = 0; i < n; ++i) {
-      float rx = 0.0f;
-      float ry = 0.0f;
-      float rz = 0.0f;
-      const int begin = markov_row_offsets[static_cast<size_t>(i)];
-      const int end = markov_row_offsets[static_cast<size_t>(i) + 1];
-      for (int idx = begin; idx < end; ++idx) {
-        const int j = markov_col_indices[static_cast<size_t>(idx)];
-        const float w = markov_values[static_cast<size_t>(idx)];
-        rx += w * input.x[static_cast<size_t>(j)];
-        ry += w * input.y[static_cast<size_t>(j)];
-        rz += w * input.z[static_cast<size_t>(j)];
-      }
-      immersion_coords(i, 0) = rx;
-      immersion_coords(i, 1) = ry;
-      immersion_coords(i, 2) = rz;
-    }
-
     if (std::getenv("IGNEOUS_BENCH_MODE") == nullptr) {
       std::cout << "[Diffusion] Built reference-style kernel for " << n_verts
                 << " points (k=" << k_neighbors << ", knn_bw=" << knn_bandwidth << ").\n";
diff --git a/include/igneous/data/structures/discrete_exterior_calculus.hpp b/include/igneous/data/structures/discrete_exterior_calculus.hpp
index ad7487d..6af8de8 100644
--- a/include/igneous/data/structures/discrete_exterior_calculus.hpp
+++ b/include/igneous/data/structures/discrete_exterior_calculus.hpp
@@ -145,32 +145,118 @@ struct DiscreteExteriorCalculus {
     const size_t n_faces = num_faces();
 
     vertex_face_offsets.assign(num_vertices + 1, 0);
-    for (size_t f = 0; f < n_faces; ++f) {
-      const uint32_t a = face_v0[f];
-      const uint32_t b = face_v1[f];
-      const uint32_t c = face_v2[f];
-
-      if (a < num_vertices)
-        vertex_face_offsets[a + 1]++;
-      if (b < num_vertices)
-        vertex_face_offsets[b + 1]++;
-      if (c < num_vertices)
-        vertex_face_offsets[c + 1]++;
-    }
 
-    for (size_t i = 0; i < num_vertices; ++i) {
-      vertex_face_offsets[i + 1] += vertex_face_offsets[i];
-    }
+    constexpr size_t kParallelFaceIncidenceThreshold = 200000;
+    if (num_vertices >= kParallelFaceIncidenceThreshold &&
+        n_faces >= kParallelFaceIncidenceThreshold) {
+      const int block_count = std::max(1, std::min(core::compute_thread_count(), 8));
+      const size_t stride = num_vertices;
+      std::vector<uint32_t> block_counts(static_cast<size_t>(block_count) * stride, 0);
+
+      core::parallel_for_index(
+          0, block_count,
+          [&](int block_idx) {
+            const size_t face_begin =
+                (n_faces * static_cast<size_t>(block_idx)) / static_cast<size_t>(block_count);
+            const size_t face_end =
+                (n_faces * static_cast<size_t>(block_idx + 1)) / static_cast<size_t>(block_count);
+            uint32_t* counts = block_counts.data() + (static_cast<size_t>(block_idx) * stride);
+
+            for (size_t f = face_begin; f < face_end; ++f) {
+              const uint32_t a = face_v0[f];
+              const uint32_t b = face_v1[f];
+              const uint32_t c = face_v2[f];
+
+              if (a < num_vertices)
+                counts[a]++;
+              if (b < num_vertices)
+                counts[b]++;
+              if (c < num_vertices)
+                counts[c]++;
+            }
+          },
+          1);
+
+      core::parallel_for_index(
+          0, static_cast<int>(num_vertices),
+          [&](int vertex_idx) {
+            const size_t v = static_cast<size_t>(vertex_idx);
+            uint32_t total = 0;
+            for (int block_idx = 0; block_idx < block_count; ++block_idx) {
+              total += block_counts[(static_cast<size_t>(block_idx) * stride) + v];
+            }
+            vertex_face_offsets[v + 1] = total;
+          },
+          32768);
+
+      for (size_t i = 0; i < num_vertices; ++i) {
+        vertex_face_offsets[i + 1] += vertex_face_offsets[i];
+      }
 
-    vertex_face_data.resize(n_faces * 3);
-    std::vector<uint32_t> face_write_head = vertex_face_offsets;
+      vertex_face_data.resize(n_faces * 3);
 
-    for (uint32_t f = 0; f < n_faces; ++f) {
-      const uint32_t corners[3] = {face_v0[f], face_v1[f], face_v2[f]};
-      for (uint32_t v : corners) {
-        if (v < num_vertices) {
-          const uint32_t pos = face_write_head[v]++;
-          vertex_face_data[pos] = f;
+      core::parallel_for_index(
+          0, static_cast<int>(num_vertices),
+          [&](int vertex_idx) {
+            const size_t v = static_cast<size_t>(vertex_idx);
+            uint32_t base = vertex_face_offsets[v];
+            for (int block_idx = 0; block_idx < block_count; ++block_idx) {
+              const size_t idx = (static_cast<size_t>(block_idx) * stride) + v;
+              const uint32_t count = block_counts[idx];
+              block_counts[idx] = base;
+              base += count;
+            }
+          },
+          32768);
+
+      core::parallel_for_index(
+          0, block_count,
+          [&](int block_idx) {
+            const size_t face_begin =
+                (n_faces * static_cast<size_t>(block_idx)) / static_cast<size_t>(block_count);
+            const size_t face_end =
+                (n_faces * static_cast<size_t>(block_idx + 1)) / static_cast<size_t>(block_count);
+            uint32_t* write_head = block_counts.data() + (static_cast<size_t>(block_idx) * stride);
+
+            for (uint32_t f = static_cast<uint32_t>(face_begin);
+                 f < static_cast<uint32_t>(face_end); ++f) {
+              const uint32_t corners[3] = {face_v0[f], face_v1[f], face_v2[f]};
+              for (uint32_t v : corners) {
+                if (v < num_vertices) {
+                  vertex_face_data[write_head[v]++] = f;
+                }
+              }
+            }
+          },
+          1);
+    } else {
+      for (size_t f = 0; f < n_faces; ++f) {
+        const uint32_t a = face_v0[f];
+        const uint32_t b = face_v1[f];
+        const uint32_t c = face_v2[f];
+
+        if (a < num_vertices)
+          vertex_face_offsets[a + 1]++;
+        if (b < num_vertices)
+          vertex_face_offsets[b + 1]++;
+        if (c < num_vertices)
+          vertex_face_offsets[c + 1]++;
+      }
+
+      for (size_t i = 0; i < num_vertices; ++i) {
+        vertex_face_offsets[i + 1] += vertex_face_offsets[i];
+      }
+
+      vertex_face_data.resize(n_faces * 3);
+      std::vector<uint32_t> face_write_head = vertex_face_offsets;
+
+      for (uint32_t f = 0; f < n_faces; ++f) {
+        const uint32_t corners[3] = {face_v0[f], face_v1[f], face_v2[f]};
+        for (uint32_t v : corners) {
+          if (v < num_vertices) {
+            const uint32_t pos = face_write_head[v]++;
+            vertex_face_data[pos] = f;
+          }
         }
       }
     }
diff --git a/include/igneous/ops/dec/curvature.hpp b/include/igneous/ops/dec/curvature.hpp
index 4b8e653..3c5fe54 100644
--- a/include/igneous/ops/dec/curvature.hpp
+++ b/include/igneous/ops/dec/curvature.hpp
@@ -44,6 +44,130 @@ void compute_curvature_measures(const data::Space<StructureT>& space, std::vecto
 
   workspace.face_normals.resize(num_faces);
 
+  if constexpr (std::is_same_v<StructureT, data::DiscreteExteriorCalculus>) {
+    const auto& x = space.x;
+    const auto& y = space.y;
+    const auto& z = space.z;
+    const auto& face_v0 = structure.face_v0;
+    const auto& face_v1 = structure.face_v1;
+    const auto& face_v2 = structure.face_v2;
+    const auto& vertex_face_offsets = structure.vertex_face_offsets;
+    const auto& vertex_face_data = structure.vertex_face_data;
+
+    core::parallel_for_index(
+        0, static_cast<int>(num_faces),
+        [&](int face_idx) {
+          const size_t f = static_cast<size_t>(face_idx);
+          const uint32_t i0 = face_v0[f];
+          const uint32_t i1 = face_v1[f];
+          const uint32_t i2 = face_v2[f];
+
+          const float e10x = x[i1] - x[i0];
+          const float e10y = y[i1] - y[i0];
+          const float e10z = z[i1] - z[i0];
+          const float e20x = x[i2] - x[i0];
+          const float e20y = y[i2] - y[i0];
+          const float e20z = z[i2] - z[i0];
+
+          workspace.face_normals[f] = {
+              e10x * e20y - e10y * e20x,
+              e10y * e20z - e10z * e20y,
+              e10z * e20x - e10x * e20z,
+          };
+        },
+        256);
+
+    core::parallel_for_index(
+        0, static_cast<int>(num_verts),
+        [&](int vertex_idx) {
+          const size_t i = static_cast<size_t>(vertex_idx);
+          const uint32_t begin = vertex_face_offsets[i];
+          const uint32_t end = vertex_face_offsets[i + 1];
+          if (begin == end) {
+            return;
+          }
+
+          float angle_sum = 0.0f;
+          float area_sum = 0.0f;
+
+          float n_xy = 0.0f;
+          float n_yz = 0.0f;
+          float n_zx = 0.0f;
+
+          float sum_x = 0.0f;
+          float sum_y = 0.0f;
+          float sum_z = 0.0f;
+
+          const float px = x[i];
+          const float py = y[i];
+          const float pz = z[i];
+
+          for (uint32_t idx = begin; idx < end; ++idx) {
+            const uint32_t f_idx = vertex_face_data[idx];
+            const core::Bivec3 fn = workspace.face_normals[f_idx];
+            n_xy += fn.xy;
+            n_yz += fn.yz;
+            n_zx += fn.zx;
+
+            const uint32_t i0 = face_v0[f_idx];
+            const uint32_t i1 = face_v1[f_idx];
+            const uint32_t i2 = face_v2[f_idx];
+
+            uint32_t a = i0;
+            uint32_t b = i1;
+            if (i0 == i) {
+              a = i1;
+              b = i2;
+            } else if (i1 == i) {
+              a = i2;
+              b = i0;
+            }
+
+            const float ux = x[a] - px;
+            const float uy = y[a] - py;
+            const float uz = z[a] - pz;
+            const float vx = x[b] - px;
+            const float vy = y[b] - py;
+            const float vz = z[b] - pz;
+
+            const float dot = ux * vx + uy * vy + uz * vz;
+            const float wedge_xy = ux * vy - uy * vx;
+            const float wedge_yz = uy * vz - uz * vy;
+            const float wedge_zx = uz * vx - ux * vz;
+            const float wedge_mag =
+                std::sqrt(wedge_xy * wedge_xy + wedge_yz * wedge_yz + wedge_zx * wedge_zx);
+
+            angle_sum += std::atan2(wedge_mag, dot);
+            area_sum += 0.5f * wedge_mag;
+
+            sum_x += x[a] + x[b];
+            sum_y += y[a] + y[b];
+            sum_z += z[a] + z[b];
+          }
+
+          if (area_sum > 1e-12f) {
+            K[i] = static_cast<float>((2.0 * std::numbers::pi_v<double> - angle_sum) /
+                                      (static_cast<double>(area_sum) / 3.0));
+          }
+
+          const float n_mag_sq = n_xy * n_xy + n_yz * n_yz + n_zx * n_zx;
+          const float n_inv = (n_mag_sq > 1e-12f) ? 1.0f / std::sqrt(n_mag_sq) : 0.0f;
+
+          const float normal_x = n_yz * n_inv;
+          const float normal_y = n_zx * n_inv;
+          const float normal_z = n_xy * n_inv;
+
+          const float inv_c = 0.5f / static_cast<float>(end - begin);
+          const float laplacian_x = sum_x * inv_c - px;
+          const float laplacian_y = sum_y * inv_c - py;
+          const float laplacian_z = sum_z * inv_c - pz;
+
+          H[i] = laplacian_x * normal_x + laplacian_y * normal_y + laplacian_z * normal_z;
+        },
+        128);
+    return;
+  }
+
   const auto get_face_vertex = [&](size_t face_idx, int corner) -> uint32_t {
     if constexpr (std::is_same_v<StructureT, data::DiscreteExteriorCalculus>) {
       if (corner == 0)
diff --git a/include/igneous/ops/diffusion/forms.hpp b/include/igneous/ops/diffusion/forms.hpp
index ed4e27f..6d0d7e0 100644
--- a/include/igneous/ops/diffusion/forms.hpp
+++ b/include/igneous/ops/diffusion/forms.hpp
@@ -18,6 +18,18 @@
 
 namespace igneous::ops::diffusion {
 
+/// \brief Precomputed compound-minor determinants for metric pullbacks.
+struct CompoundDeterminantData {
+  /// \brief Basis index combinations for the target degree.
+  std::vector<std::vector<int>> indices;
+  /// \brief Determinant vectors for each basis-pair minor.
+  std::vector<Eigen::VectorXf> values;
+  /// \brief Degree (`k`) these determinants correspond to.
+  int degree = 0;
+  /// \brief Number of basis components for this degree.
+  int n_components = 0;
+};
+
 /// \brief Shared scratch buffers for diffusion k-form operators.
 template <typename MeshT> struct DiffusionFormWorkspace {
   /// \brief Coordinate vectors used for gamma evaluations.
@@ -28,22 +40,42 @@ template <typename MeshT> struct DiffusionFormWorkspace {
   std::array<Eigen::MatrixXf, 3> gamma_mixed;
   /// \brief Generic temporary gamma vector.
   Eigen::VectorXf gamma_tmp;
+  /// \brief Cached weighted basis block `U_left .* mu`.
+  Eigen::MatrixXf weighted_u;
+  /// \brief Cached compound determinants by degree.
+  std::array<CompoundDeterminantData, 3> compound_cache;
+  /// \brief Cached Gram matrices by degree.
+  std::array<Eigen::MatrixXf, 3> gram_cache;
+  /// \brief Cached Gram pseudo-inverses by degree.
+  std::array<Eigen::MatrixXf, 3> gram_inverse_cache;
   /// \brief Number of columns currently available in `gamma_mixed`.
   int gamma_mixed_cols = 0;
+  /// \brief Column count currently held in `weighted_u`.
+  int weighted_u_cols = 0;
+  /// \brief Column count associated with `gram_cache[k]`.
+  std::array<int, 3> gram_cols = {0, 0, 0};
+  /// \brief Column count associated with `gram_inverse_cache[k]`.
+  std::array<int, 3> gram_inverse_cols = {0, 0, 0};
   /// \brief Whether `gamma_coords` has been initialized.
   bool gamma_coords_ready = false;
-};
-
-/// \brief Precomputed compound-minor determinants for metric pullbacks.
-struct CompoundDeterminantData {
-  /// \brief Basis index combinations for the target degree.
-  std::vector<std::vector<int>> indices;
-  /// \brief Determinant vectors for each basis-pair minor.
-  std::vector<Eigen::VectorXf> values;
-  /// \brief Degree (`k`) these determinants correspond to.
-  int degree = 0;
-  /// \brief Number of basis components for this degree.
-  int n_components = 0;
+  /// \brief Whether `compound_cache[k]` is valid.
+  std::array<bool, 3> compound_ready = {false, false, false};
+  /// \brief Whether `gram_cache[k]` is valid.
+  std::array<bool, 3> gram_ready = {false, false, false};
+  /// \brief Whether `gram_inverse_cache[k]` is valid.
+  std::array<bool, 3> gram_inverse_ready = {false, false, false};
+
+  /// \brief Invalidate derived caches after mesh/build/eigenbasis changes.
+  void reset_cache() {
+    gamma_mixed_cols = 0;
+    weighted_u_cols = 0;
+    gram_cols = {0, 0, 0};
+    gram_inverse_cols = {0, 0, 0};
+    gamma_coords_ready = false;
+    compound_ready = {false, false, false};
+    gram_ready = {false, false, false};
+    gram_inverse_ready = {false, false, false};
+  }
 };
 
 /**
@@ -168,6 +200,29 @@ void ensure_gamma_mixed(const MeshT& mesh, int n_coefficients,
   workspace.gamma_mixed_cols = n1;
 }
 
+/**
+ * \brief Ensure `workspace.weighted_u` holds `U_left .* mu` for `n_coefficients`.
+ * \param mesh Input diffusion space.
+ * \param n_coefficients Required basis column count.
+ * \param workspace Scratch workspace to populate.
+ */
+template <typename MeshT>
+void ensure_weighted_u(const MeshT& mesh, int n_coefficients,
+                       DiffusionFormWorkspace<MeshT>& workspace) {
+  const auto& U = mesh.structure.eigen_basis;
+  const auto& mu = mesh.structure.mu;
+  const int n = static_cast<int>(mesh.num_points());
+  const int n1 = std::max(1, std::min(n_coefficients, static_cast<int>(U.cols())));
+
+  if (workspace.weighted_u_cols == n1 && workspace.weighted_u.rows() == n) {
+    return;
+  }
+
+  workspace.weighted_u.resize(n, n1);
+  workspace.weighted_u = U.leftCols(n1).array().colwise() * mu.array();
+  workspace.weighted_u_cols = n1;
+}
+
 /**
  * \brief Compute all compound determinant vectors for k-form metric pullbacks.
  * \param mesh Input diffusion space.
@@ -178,6 +233,11 @@ void ensure_gamma_mixed(const MeshT& mesh, int n_coefficients,
 template <typename MeshT>
 CompoundDeterminantData compute_compound_determinants(const MeshT& mesh, int k,
                                                       DiffusionFormWorkspace<MeshT>& workspace) {
+  if (k >= 0 && k < static_cast<int>(workspace.compound_cache.size()) &&
+      workspace.compound_ready[k]) {
+    return workspace.compound_cache[static_cast<size_t>(k)];
+  }
+
   ensure_gamma_coords(mesh, workspace);
 
   CompoundDeterminantData out;
@@ -198,6 +258,10 @@ CompoundDeterminantData compute_compound_determinants(const MeshT& mesh, int k,
     }
   }
 
+  if (k >= 0 && k < static_cast<int>(workspace.compound_cache.size())) {
+    workspace.compound_cache[static_cast<size_t>(k)] = out;
+    workspace.compound_ready[k] = true;
+  }
   return out;
 }
 
@@ -213,31 +277,38 @@ template <typename MeshT>
 Eigen::MatrixXf compute_kform_gram_matrix(const MeshT& mesh, int k, int n_coefficients,
                                           DiffusionFormWorkspace<MeshT>& workspace) {
   const auto& U = mesh.structure.eigen_basis;
-  const auto& mu = mesh.structure.mu;
   const int n1 = std::max(1, std::min(n_coefficients, static_cast<int>(U.cols())));
   const auto compounds = compute_compound_determinants(mesh, k, workspace);
   const int Ck = std::max(1, compounds.n_components);
   Eigen::MatrixXf G = Eigen::MatrixXf::Zero(n1 * Ck, n1 * Ck);
 
   const int n = static_cast<int>(mesh.num_points());
+  ensure_weighted_u(mesh, n1, workspace);
   Eigen::ArrayXf weights(n);
 
   if (k == 0) {
     for (int i = 0; i < n1; ++i) {
       for (int l = i; l < n1; ++l) {
-        const float val = (U.col(i).array() * U.col(l).array() * mu.array()).sum();
+        const float val = workspace.weighted_u.col(i).dot(U.col(l));
         G(i, l) = val;
         if (i != l) {
           G(l, i) = val;
         }
       }
     }
+    if (k >= 0 && k < static_cast<int>(workspace.gram_cache.size())) {
+      workspace.gram_cache[static_cast<size_t>(k)] = G;
+      workspace.gram_ready[k] = true;
+      workspace.gram_cols[k] = n1;
+      workspace.gram_inverse_ready[k] = false;
+      workspace.gram_inverse_cols[k] = 0;
+    }
     return G;
   }
 
   for (int i = 0; i < n1; ++i) {
     for (int l = i; l < n1; ++l) {
-      weights = U.col(i).array() * U.col(l).array() * mu.array();
+      weights = workspace.weighted_u.col(i).array() * U.col(l).array();
       for (int a = 0; a < Ck; ++a) {
         for (int b = 0; b < Ck; ++b) {
           const auto& compound = compounds.values[static_cast<size_t>(a * Ck + b)];
@@ -253,6 +324,13 @@ Eigen::MatrixXf compute_kform_gram_matrix(const MeshT& mesh, int k, int n_coeffi
     }
   }
 
+  if (k >= 0 && k < static_cast<int>(workspace.gram_cache.size())) {
+    workspace.gram_cache[static_cast<size_t>(k)] = G;
+    workspace.gram_ready[k] = true;
+    workspace.gram_cols[k] = n1;
+    workspace.gram_inverse_ready[k] = false;
+    workspace.gram_inverse_cols[k] = 0;
+  }
   return G;
 }
 
@@ -285,16 +363,15 @@ Eigen::MatrixXf compute_weak_exterior_derivative(const MeshT& mesh, int k, int n
   }
 
   const auto& U = mesh.structure.eigen_basis;
-  const auto& mu = mesh.structure.mu;
   const int n1 = std::max(1, std::min(n_coefficients, static_cast<int>(U.cols())));
   ensure_gamma_mixed(mesh, n1, workspace);
 
   if (k == 0) {
     const int d = ambient_dim_3d();
     Eigen::MatrixXf D = Eigen::MatrixXf::Zero(n1 * d, n1);
-    Eigen::MatrixXf weighted_u = U.leftCols(n1).array().colwise() * mu.array();
+    ensure_weighted_u(mesh, n1, workspace);
     for (int a = 0; a < d; ++a) {
-      const Eigen::MatrixXf coupling = weighted_u.transpose() * workspace.gamma_mixed[a];
+      const Eigen::MatrixXf coupling = workspace.weighted_u.transpose() * workspace.gamma_mixed[a];
       for (int i = 0; i < n1; ++i) {
         D.row(i * d + a) = coupling.row(i);
       }
@@ -306,27 +383,26 @@ Eigen::MatrixXf compute_weak_exterior_derivative(const MeshT& mesh, int k, int n
   const auto compounds = compute_compound_determinants(mesh, k, workspace);
   const int Ck = static_cast<int>(kp1.idx_k.size());
   const int Ckp1 = static_cast<int>(kp1.idx_kp1.size());
+  ensure_weighted_u(mesh, n1, workspace);
 
   Eigen::MatrixXf D = Eigen::MatrixXf::Zero(n1 * Ckp1, n1 * Ck);
-  const int n = static_cast<int>(mesh.num_points());
+  for (int Jp = 0; Jp < Ckp1; ++Jp) {
+    for (int J = 0; J < Ck; ++J) {
+      Eigen::MatrixXf coupling = Eigen::MatrixXf::Zero(n1, n1);
+
+      for (int r = 0; r < k + 1; ++r) {
+        const int coord_dim = kp1.idx_kp1[static_cast<size_t>(Jp)][static_cast<size_t>(r)];
+        const int child_idx = kp1.children[static_cast<size_t>(Jp)][static_cast<size_t>(r)];
+        const auto& minor_det = compounds.values[static_cast<size_t>(child_idx * Ck + J)];
+        const Eigen::MatrixXf scaled =
+            workspace.gamma_mixed[coord_dim].array().colwise() * minor_det.array();
+        coupling.noalias() += static_cast<float>(kp1.signs[static_cast<size_t>(r)]) *
+                              (workspace.weighted_u.transpose() * scaled);
+      }
 
-  for (int I = 0; I < n1; ++I) {
-    for (int i = 0; i < n1; ++i) {
-      for (int Jp = 0; Jp < Ckp1; ++Jp) {
-        for (int J = 0; J < Ck; ++J) {
-          float accum = 0.0f;
-          for (int p = 0; p < n; ++p) {
-            float laplace_sum = 0.0f;
-            for (int r = 0; r < k + 1; ++r) {
-              const int coord_dim = kp1.idx_kp1[static_cast<size_t>(Jp)][static_cast<size_t>(r)];
-              const int child_idx = kp1.children[static_cast<size_t>(Jp)][static_cast<size_t>(r)];
-              const auto& minor_det = compounds.values[static_cast<size_t>(child_idx * Ck + J)];
-              laplace_sum += static_cast<float>(kp1.signs[static_cast<size_t>(r)]) *
-                             workspace.gamma_mixed[coord_dim](p, i) * minor_det[p];
-            }
-            accum += U(p, I) * laplace_sum * mu[p];
-          }
-          D(I * Ckp1 + Jp, i * Ck + J) = accum;
+      for (int I = 0; I < n1; ++I) {
+        for (int i = 0; i < n1; ++i) {
+          D(I * Ckp1 + Jp, i * Ck + J) = coupling(I, i);
         }
       }
     }
@@ -356,6 +432,19 @@ Eigen::MatrixXf compute_weak_exterior_derivative(const MeshT& mesh, int k, int n
  */
 inline Eigen::MatrixXf pseudo_inverse_symmetric(const Eigen::MatrixXf& matrix, float rcond);
 
+/**
+ * \brief Forward declaration of cached Gram pseudo-inverse helper.
+ * \param mesh Input diffusion space.
+ * \param k Exterior degree.
+ * \param n_coefficients Basis truncation size.
+ * \param workspace Scratch workspace.
+ * \param rcond Relative conditioning threshold.
+ * \return Pseudo-inverse Gram matrix.
+ */
+template <typename MeshT>
+Eigen::MatrixXf compute_kform_gram_inverse(const MeshT& mesh, int k, int n_coefficients,
+                                           DiffusionFormWorkspace<MeshT>& workspace, float rcond);
+
 /**
  * \brief Assemble codifferential matrix `delta_k` from `d_{k-1}` and mass.
  * \param mesh Input diffusion space.
@@ -372,8 +461,8 @@ Eigen::MatrixXf compute_codifferential_matrix(const MeshT& mesh, int k, int n_co
   }
   const Eigen::MatrixXf D_prev =
       compute_weak_exterior_derivative(mesh, k - 1, n_coefficients, workspace);
-  const Eigen::MatrixXf G_prev = compute_kform_gram_matrix(mesh, k - 1, n_coefficients, workspace);
-  const Eigen::MatrixXf G_prev_inv = pseudo_inverse_symmetric(G_prev, 1e-5f);
+  const Eigen::MatrixXf G_prev_inv =
+      compute_kform_gram_inverse(mesh, k - 1, n_coefficients, workspace, 1e-5f);
   return G_prev_inv * D_prev.transpose();
 }
 
@@ -449,6 +538,60 @@ inline Eigen::MatrixXf pseudo_inverse_symmetric(const Eigen::MatrixXf& matrix,
   return evecs * inv.asDiagonal() * evecs.transpose();
 }
 
+/**
+ * \brief Return cached pseudo-inverse of the k-form Gram matrix.
+ * \param mesh Input diffusion space.
+ * \param k Exterior degree.
+ * \param n_coefficients Basis truncation size.
+ * \param workspace Scratch workspace.
+ * \param rcond Relative conditioning threshold.
+ * \return Pseudo-inverse Gram matrix.
+ */
+template <typename MeshT>
+Eigen::MatrixXf compute_kform_gram_inverse(const MeshT& mesh, int k, int n_coefficients,
+                                           DiffusionFormWorkspace<MeshT>& workspace, float rcond) {
+  if (k < 0 || k > 2) {
+    return Eigen::MatrixXf();
+  }
+
+  const auto& U = mesh.structure.eigen_basis;
+  const int n1 = std::max(1, std::min(n_coefficients, static_cast<int>(U.cols())));
+  if (workspace.gram_inverse_ready[k] && workspace.gram_inverse_cols[k] == n1) {
+    return workspace.gram_inverse_cache[static_cast<size_t>(k)];
+  }
+
+  Eigen::MatrixXf gram;
+  if (workspace.gram_ready[k] && workspace.gram_cols[k] == n1) {
+    gram = workspace.gram_cache[static_cast<size_t>(k)];
+  } else {
+    gram = compute_kform_gram_matrix(mesh, k, n1, workspace);
+  }
+
+  workspace.gram_inverse_cache[static_cast<size_t>(k)] = pseudo_inverse_symmetric(gram, rcond);
+  workspace.gram_inverse_ready[k] = true;
+  workspace.gram_inverse_cols[k] = n1;
+  return workspace.gram_inverse_cache[static_cast<size_t>(k)];
+}
+
+/// \brief Convenience overload using the default conditioning threshold.
+template <typename MeshT>
+Eigen::MatrixXf compute_kform_gram_inverse(const MeshT& mesh, int k, int n_coefficients,
+                                           DiffusionFormWorkspace<MeshT>& workspace) {
+  return compute_kform_gram_inverse(mesh, k, n_coefficients, workspace, 1e-5f);
+}
+
+/**
+ * \brief Assemble a down-Laplacian from precomputed previous-degree operators.
+ * \param D_prev Previous-degree weak derivative matrix.
+ * \param G_prev_inv Previous-degree Gram pseudo-inverse.
+ * \return Down-Laplacian matrix.
+ */
+inline Eigen::MatrixXf
+assemble_down_laplacian_matrix_from_previous(const Eigen::MatrixXf& D_prev,
+                                             const Eigen::MatrixXf& G_prev_inv) {
+  return D_prev * G_prev_inv * D_prev.transpose();
+}
+
 /**
  * \brief Assemble up-Laplacian contribution for k-forms.
  * \param mesh Input diffusion space.
@@ -473,76 +616,74 @@ Eigen::MatrixXf compute_up_laplacian_matrix(const MeshT& mesh, int k, int n_coef
 
   if (k == 0) {
     Eigen::MatrixXf L = Eigen::MatrixXf::Zero(n1, n1);
-    for (int i = 0; i < n1; ++i) {
-      for (int l = i; l < n1; ++l) {
-        workspace.gamma_tmp.resize(n);
-        carre_du_champ(mesh, U.col(i), U.col(l), 0.0f, workspace.gamma_tmp);
-        const float val = (workspace.gamma_tmp.array() * mu.array()).sum();
-        L(i, l) = val;
-        if (i != l) {
-          L(l, i) = val;
-        }
-      }
-    }
-    return L;
+    core::parallel_for_index(
+        0, n1,
+        [&](int i) {
+          Eigen::VectorXf gamma_local(n);
+          for (int l = i; l < n1; ++l) {
+            carre_du_champ(mesh, U.col(i), U.col(l), 0.0f, gamma_local);
+            L(i, l) = (gamma_local.array() * mu.array()).sum();
+          }
+        },
+        4);
+    return Eigen::MatrixXf(L.selfadjointView<Eigen::Upper>());
   }
 
   const auto compounds = compute_compound_determinants(mesh, k, workspace);
   const int Ck = compounds.n_components;
   Eigen::MatrixXf L = Eigen::MatrixXf::Zero(n1 * Ck, n1 * Ck);
 
-  for (int i = 0; i < n1; ++i) {
-    for (int l = i; l < n1; ++l) {
-      workspace.gamma_tmp.resize(n);
-      carre_du_champ(mesh, U.col(i), U.col(l), 0.0f, workspace.gamma_tmp);
-
-      for (int J = 0; J < Ck; ++J) {
-        for (int K = 0; K < Ck; ++K) {
-          float val = 0.0f;
-          if (k == 1) {
-            const auto& gJK = workspace.gamma_coords[J][K];
-            const Eigen::ArrayXf term =
-                (workspace.gamma_tmp.array() * gJK.array()) -
-                (workspace.gamma_mixed[K].col(i).array() * workspace.gamma_mixed[J].col(l).array());
-            val = (term * mu.array()).sum();
-          } else {
-            const auto& row_combo = compounds.indices[static_cast<size_t>(J)];
-            const auto& col_combo = compounds.indices[static_cast<size_t>(K)];
-            for (int p = 0; p < n; ++p) {
-              Eigen::Matrix2f Dm;
-              Dm(0, 0) = workspace.gamma_coords[row_combo[0]][col_combo[0]][p];
-              Dm(0, 1) = workspace.gamma_coords[row_combo[0]][col_combo[1]][p];
-              Dm(1, 0) = workspace.gamma_coords[row_combo[1]][col_combo[0]][p];
-              Dm(1, 1) = workspace.gamma_coords[row_combo[1]][col_combo[1]][p];
-
-              const float detD = Dm.determinant();
-
-              Eigen::Vector2f c_vec;
-              c_vec[0] = workspace.gamma_mixed[row_combo[0]](p, l);
-              c_vec[1] = workspace.gamma_mixed[row_combo[1]](p, l);
-
-              Eigen::Vector2f b_vec;
-              b_vec[0] = workspace.gamma_mixed[col_combo[0]](p, i);
-              b_vec[1] = workspace.gamma_mixed[col_combo[1]](p, i);
-
-              const Eigen::Vector2f solved = solve_stable_2x2(Dm, c_vec);
-              const float bDv = b_vec.dot(solved);
-              val += mu[p] * detD * (workspace.gamma_tmp[p] - bDv);
+  core::parallel_for_index(
+      0, n1,
+      [&](int i) {
+        Eigen::VectorXf gamma_local(n);
+        for (int l = i; l < n1; ++l) {
+          carre_du_champ(mesh, U.col(i), U.col(l), 0.0f, gamma_local);
+
+          for (int J = 0; J < Ck; ++J) {
+            const int k_begin = (i == l) ? J : 0;
+            for (int K = k_begin; K < Ck; ++K) {
+              float val = 0.0f;
+              if (k == 1) {
+                const auto& gJK = workspace.gamma_coords[J][K];
+                const Eigen::ArrayXf term =
+                    (gamma_local.array() * gJK.array()) - (workspace.gamma_mixed[K].col(i).array() *
+                                                           workspace.gamma_mixed[J].col(l).array());
+                val = (term * mu.array()).sum();
+              } else {
+                const auto& row_combo = compounds.indices[static_cast<size_t>(J)];
+                const auto& col_combo = compounds.indices[static_cast<size_t>(K)];
+                for (int p = 0; p < n; ++p) {
+                  const float a = workspace.gamma_coords[row_combo[0]][col_combo[0]][p];
+                  const float b = workspace.gamma_coords[row_combo[0]][col_combo[1]][p];
+                  const float c = workspace.gamma_coords[row_combo[1]][col_combo[0]][p];
+                  const float d = workspace.gamma_coords[row_combo[1]][col_combo[1]][p];
+                  const float detD = a * d - b * c;
+
+                  float det_bDv = 0.0f;
+                  if (std::abs(detD) > 1e-8f) {
+                    const float c0 = workspace.gamma_mixed[row_combo[0]](p, l);
+                    const float c1 = workspace.gamma_mixed[row_combo[1]](p, l);
+                    const float b0 = workspace.gamma_mixed[col_combo[0]](p, i);
+                    const float b1 = workspace.gamma_mixed[col_combo[1]](p, i);
+
+                    const float adj_c0 = d * c0 - b * c1;
+                    const float adj_c1 = -c * c0 + a * c1;
+                    det_bDv = (b0 * adj_c0) + (b1 * adj_c1);
+                  }
+
+                  val += mu[p] * ((detD * gamma_local[p]) - det_bDv);
+                }
+              }
+
+              L(i * Ck + J, l * Ck + K) = val;
             }
           }
-
-          const int row = i * Ck + J;
-          const int col = l * Ck + K;
-          L(row, col) = val;
-          if (row != col) {
-            L(col, row) = val;
-          }
         }
-      }
-    }
-  }
+      },
+      4);
 
-  return L;
+  return Eigen::MatrixXf(L.selfadjointView<Eigen::Upper>());
 }
 
 /**
@@ -578,9 +719,9 @@ Eigen::MatrixXf compute_down_laplacian_matrix(const MeshT& mesh, int k, int n_co
 
   const Eigen::MatrixXf D_prev =
       compute_weak_exterior_derivative(mesh, k - 1, n_coefficients, workspace);
-  const Eigen::MatrixXf G_prev = compute_kform_gram_matrix(mesh, k - 1, n_coefficients, workspace);
-  const Eigen::MatrixXf G_prev_inv = pseudo_inverse_symmetric(G_prev, 1e-5f);
-  return D_prev * G_prev_inv * D_prev.transpose();
+  const Eigen::MatrixXf G_prev_inv =
+      compute_kform_gram_inverse(mesh, k - 1, n_coefficients, workspace, 1e-5f);
+  return assemble_down_laplacian_matrix_from_previous(D_prev, G_prev_inv);
 }
 
 /**
@@ -607,6 +748,36 @@ inline Eigen::MatrixXf assemble_hodge_laplacian_matrix(const Eigen::MatrixXf& up
   return up + down;
 }
 
+/**
+ * \brief Try a direct Cholesky whitening path for dense generalized spectra.
+ * \param laplacian Symmetric operator matrix.
+ * \param mass_matrix Symmetric positive-definite mass matrix.
+ * \return Pair `(eigenvalues, eigenvectors)`, or empty matrices on failure.
+ */
+inline std::pair<Eigen::VectorXf, Eigen::MatrixXf>
+try_generalized_spectrum_cholesky(const Eigen::MatrixXf& laplacian,
+                                  const Eigen::MatrixXf& mass_matrix) {
+  if (laplacian.rows() == 0 || laplacian.rows() != laplacian.cols() ||
+      mass_matrix.rows() != mass_matrix.cols() || laplacian.rows() != mass_matrix.rows()) {
+    return std::make_pair(Eigen::VectorXf(), Eigen::MatrixXf());
+  }
+
+  Eigen::LLT<Eigen::MatrixXf> mass_chol(mass_matrix);
+  if (mass_chol.info() != Eigen::Success) {
+    return std::make_pair(Eigen::VectorXf(), Eigen::MatrixXf());
+  }
+
+  Eigen::MatrixXf phi =
+      mass_chol.matrixU().solve(Eigen::MatrixXf::Identity(mass_matrix.rows(), mass_matrix.cols()));
+  const Eigen::MatrixXf restricted = phi.transpose() * laplacian * phi;
+  Eigen::SelfAdjointEigenSolver<Eigen::MatrixXf> solver(restricted);
+  if (solver.info() != Eigen::Success) {
+    return std::make_pair(Eigen::VectorXf(), Eigen::MatrixXf());
+  }
+
+  return std::make_pair(solver.eigenvalues(), phi * solver.eigenvectors());
+}
+
 /**
  * \brief Solve generalized eigenproblem for form Laplacian and mass matrix.
  * \param laplacian Form Laplacian matrix.
@@ -617,6 +788,11 @@ inline Eigen::MatrixXf assemble_hodge_laplacian_matrix(const Eigen::MatrixXf& up
 inline std::pair<Eigen::VectorXf, Eigen::MatrixXf>
 compute_form_spectrum(const Eigen::MatrixXf& laplacian, const Eigen::MatrixXf& mass_matrix,
                       float rcond = 1e-5f) {
+  auto fast = try_generalized_spectrum_cholesky(laplacian, mass_matrix);
+  if (fast.first.size() != 0 && fast.second.cols() != 0) {
+    return fast;
+  }
+
   Eigen::SelfAdjointEigenSolver<Eigen::MatrixXf> mass_solver(mass_matrix);
   if (mass_solver.info() != Eigen::Success) {
     return std::make_pair(Eigen::VectorXf(), Eigen::MatrixXf());
diff --git a/include/igneous/ops/diffusion/hodge.hpp b/include/igneous/ops/diffusion/hodge.hpp
index 4563d79..60a30a1 100644
--- a/include/igneous/ops/diffusion/hodge.hpp
+++ b/include/igneous/ops/diffusion/hodge.hpp
@@ -8,6 +8,7 @@
 #include <cmath>
 #include <complex>
 #include <cstddef>
+#include <limits>
 #include <utility>
 #include <vector>
 
@@ -21,59 +22,358 @@ namespace igneous::ops::diffusion {
 template <typename MeshT> struct HodgeWorkspace {
   /// \brief Coordinate vectors used in gamma evaluations.
   std::array<Eigen::VectorXf, 3> coords;
+  /// \brief Markov-averaged coordinate vectors.
+  std::array<Eigen::VectorXf, 3> coord_means;
   /// \brief `Gamma(x_a, phi_i)` matrix cache for each axis.
   std::array<Eigen::MatrixXf, 3> gamma_x_phi_mat;
+  /// \brief `mu * Gamma(x_a, phi_i)` matrix cache for each axis.
+  std::array<Eigen::MatrixXf, 3> weighted_gamma_x_phi_mat;
   /// \brief Basis weighted by stationary measure.
   Eigen::MatrixXf weighted_u;
-  /// \brief Reserved for mixed gamma layouts.
-  std::vector<std::vector<Eigen::VectorXf>> gamma_x_phi;
-  /// \brief `Gamma(phi_i, x_a)` cache per mode and axis.
-  std::vector<std::vector<Eigen::VectorXf>> gamma_phi_x;
+  /// \brief Markov-applied basis cache.
+  Eigen::MatrixXf markov_basis;
+  /// \brief Generic Markov matrix temporary.
+  Eigen::MatrixXf markov_tmp;
   /// \brief `Gamma(x_a, x_b)` caches.
   std::array<std::array<Eigen::VectorXf, 3>, 3> gamma_xx;
   /// \brief Temporary `Gamma(phi_i, phi_j)` buffer.
   Eigen::VectorXf gamma_phi_phi;
   /// \brief Generic temporary weight vector.
   Eigen::VectorXf weight;
+  /// \brief Temporary `P^T weight` buffer.
+  Eigen::VectorXf pt_weight;
+  /// \brief Cached inverse gamma denominators on the mean-centered path.
+  Eigen::VectorXf inv_gamma_denom;
+  /// \brief Cached state for `gamma_x_phi_mat` and `weighted_u`.
+  int mixed_n_verts = -1;
+  /// \brief Cached scalar basis count for `gamma_x_phi_mat` and `weighted_u`.
+  int mixed_n0 = -1;
+  /// \brief Cached bandwidth for `gamma_x_phi_mat`.
+  float mixed_bandwidth = std::numeric_limits<float>::quiet_NaN();
+  /// \brief Whether mixed gamma and weighted basis caches are ready.
+  bool mixed_ready = false;
+  /// \brief Cached state for `gamma_xx`.
+  int gamma_xx_n_verts = -1;
+  /// \brief Cached bandwidth for `gamma_xx`.
+  float gamma_xx_bandwidth = std::numeric_limits<float>::quiet_NaN();
+  /// \brief Whether coordinate gamma caches are ready.
+  bool gamma_xx_ready = false;
+
+  void reset_cache() {
+    mixed_n_verts = -1;
+    mixed_n0 = -1;
+    mixed_bandwidth = std::numeric_limits<float>::quiet_NaN();
+    mixed_ready = false;
+    gamma_xx_n_verts = -1;
+    gamma_xx_bandwidth = std::numeric_limits<float>::quiet_NaN();
+    gamma_xx_ready = false;
+  }
+};
+
+/// \brief Reusable buffers for circular-coordinate assembly on a fixed mesh/basis state.
+template <typename MeshT> struct CircularCoordinateWorkspace {
+  std::array<Eigen::VectorXf, 3> coords;
+  std::array<Eigen::VectorXf, 3> coord_means;
+  std::array<Eigen::MatrixXf, 3> gamma_x_phi_mat;
+  Eigen::MatrixXf weighted_u;
+  Eigen::MatrixXf function_gram;
+  Eigen::MatrixXf laplacian0_weak;
+  Eigen::MatrixXf markov_basis;
+  Eigen::MatrixXf markov_tmp;
+  Eigen::MatrixXf alpha_mat;
+  Eigen::MatrixXf q;
+  Eigen::MatrixXf weighted_gamma;
+  Eigen::MatrixXf x_op;
+  Eigen::VectorXf gamma_local;
+  Eigen::VectorXf z_real;
+  Eigen::VectorXf z_imag;
+  int prepared_n_verts = -1;
+  int prepared_n0 = -1;
+  float prepared_bandwidth = std::numeric_limits<float>::quiet_NaN();
+
+  void reset() {
+    prepared_n_verts = -1;
+    prepared_n0 = -1;
+    prepared_bandwidth = std::numeric_limits<float>::quiet_NaN();
+  }
 };
 
-/**
- * \brief Assemble weak exterior derivative matrix for 1-forms.
- * \param mesh Input diffusion space.
- * \param bandwidth Diffusion bandwidth parameter.
- * \param workspace Scratch buffers.
- * \return Weak derivative matrix.
- */
 template <typename MeshT>
-Eigen::MatrixXf compute_weak_exterior_derivative(const MeshT& mesh, float bandwidth,
-                                                 HodgeWorkspace<MeshT>& workspace) {
+void apply_markov_transition_matrix(const MeshT& mesh, const Eigen::MatrixXf& input,
+                                    Eigen::MatrixXf& output) {
+  const int n_verts = static_cast<int>(mesh.num_points());
+  const int n_cols = input.cols();
+  output.resize(n_verts, n_cols);
+
+  const auto& row_offsets = mesh.structure.markov_row_offsets;
+  const auto& col_indices = mesh.structure.markov_col_indices;
+  const auto& weights = mesh.structure.markov_values;
+
+  core::parallel_for_index(
+      0, n_verts,
+      [&](int row_idx) {
+        thread_local Eigen::RowVectorXf accum;
+        if (accum.size() != n_cols) {
+          accum.resize(n_cols);
+        }
+        accum.setZero();
+
+        const int begin = row_offsets[static_cast<size_t>(row_idx)];
+        const int end = row_offsets[static_cast<size_t>(row_idx) + 1];
+        for (int idx = begin; idx < end; ++idx) {
+          accum.noalias() +=
+              weights[static_cast<size_t>(idx)] * input.row(col_indices[static_cast<size_t>(idx)]);
+        }
+        output.row(row_idx) = accum;
+      },
+      128);
+}
+
+template <typename MeshT> bool uses_mean_centres(const MeshT& mesh) {
+  if constexpr (requires { mesh.structure.use_mean_centres; }) {
+    return mesh.structure.use_mean_centres;
+  }
+  return false;
+}
+
+template <typename MeshT>
+void apply_markov_adjoint(const MeshT& mesh, Eigen::Ref<const Eigen::VectorXf> input,
+                          Eigen::VectorXf& output) {
+  const int n_verts = static_cast<int>(mesh.num_points());
+  output.setZero(n_verts);
+
+  const auto& row_offsets = mesh.structure.markov_row_offsets;
+  const auto& col_indices = mesh.structure.markov_col_indices;
+  const auto& weights = mesh.structure.markov_values;
+  for (int row_idx = 0; row_idx < n_verts; ++row_idx) {
+    const int begin = row_offsets[static_cast<size_t>(row_idx)];
+    const int end = row_offsets[static_cast<size_t>(row_idx) + 1];
+    const float scale = input[row_idx];
+    for (int idx = begin; idx < end; ++idx) {
+      output[col_indices[static_cast<size_t>(idx)]] += weights[static_cast<size_t>(idx)] * scale;
+    }
+  }
+}
+
+template <typename MeshT>
+void prepare_circular_coordinate_workspace(const MeshT& mesh, float bandwidth,
+                                           CircularCoordinateWorkspace<MeshT>& workspace) {
   const auto& U = mesh.structure.eigen_basis;
   const auto& mu = mesh.structure.mu;
   const int n0 = U.cols();
   const int n_verts = static_cast<int>(mesh.num_points());
 
-  Eigen::MatrixXf D = Eigen::MatrixXf::Zero(3 * n0, n0);
+  if (workspace.prepared_n_verts == n_verts && workspace.prepared_n0 == n0 &&
+      workspace.prepared_bandwidth == bandwidth) {
+    return;
+  }
 
   fill_coordinate_vectors(mesh, workspace.coords);
 
-  for (int a = 0; a < 3; ++a) {
-    if (workspace.gamma_x_phi_mat[a].rows() != n_verts ||
-        workspace.gamma_x_phi_mat[a].cols() != n0) {
+  workspace.weighted_u.resize(n_verts, n0);
+  workspace.weighted_u = U.array().colwise() * mu.array();
+
+  workspace.function_gram.resize(n0, n0);
+  workspace.function_gram.noalias() = U.transpose() * workspace.weighted_u;
+
+  const bool use_mean_centres = uses_mean_centres(mesh);
+
+  if (use_mean_centres) {
+    workspace.markov_basis.resize(n_verts, n0);
+    apply_markov_transition_matrix(mesh, U, workspace.markov_basis);
+
+    Eigen::VectorXf b(n_verts);
+    Eigen::VectorXf inv_gamma_denom(n_verts);
+    for (int i = 0; i < n_verts; ++i) {
+      float denom = 2.0f;
+      if constexpr (requires { mesh.structure.local_bandwidths; }) {
+        if (mesh.structure.local_bandwidths.size() == n_verts) {
+          denom = 2.0f * std::max(mesh.structure.local_bandwidths[static_cast<size_t>(i)], 1e-8f);
+        }
+      }
+      b[i] = mu[i] / denom;
+      inv_gamma_denom[i] = 1.0f / denom;
+    }
+
+    Eigen::VectorXf ptb = Eigen::VectorXf::Zero(n_verts);
+    const auto& row_offsets = mesh.structure.markov_row_offsets;
+    const auto& col_indices = mesh.structure.markov_col_indices;
+    const auto& weights = mesh.structure.markov_values;
+    for (int i = 0; i < n_verts; ++i) {
+      const int begin = row_offsets[static_cast<size_t>(i)];
+      const int end = row_offsets[static_cast<size_t>(i) + 1];
+      const float bi = b[i];
+      for (int idx = begin; idx < end; ++idx) {
+        ptb[col_indices[static_cast<size_t>(idx)]] += weights[static_cast<size_t>(idx)] * bi;
+      }
+    }
+
+    for (int a = 0; a < 3; ++a) {
+      workspace.coord_means[a].resize(n_verts);
+      apply_markov_transition(mesh, workspace.coords[a], workspace.coord_means[a]);
+      workspace.markov_tmp.resize(n_verts, n0);
+      workspace.markov_tmp = U.array().colwise() * workspace.coords[a].array();
+      apply_markov_transition_matrix(mesh, workspace.markov_tmp, workspace.gamma_x_phi_mat[a]);
+      workspace.gamma_x_phi_mat[a].array() -=
+          workspace.markov_basis.array().colwise() * workspace.coord_means[a].array();
+      workspace.gamma_x_phi_mat[a].array().colwise() *= inv_gamma_denom.array();
+    }
+
+    workspace.laplacian0_weak.resize(n0, n0);
+    workspace.markov_tmp = U.array().colwise() * ptb.array();
+    workspace.laplacian0_weak.noalias() = U.transpose() * workspace.markov_tmp;
+    workspace.markov_tmp = workspace.markov_basis.array().colwise() * b.array();
+    workspace.laplacian0_weak.noalias() -=
+        workspace.markov_basis.transpose() * workspace.markov_tmp;
+    workspace.laplacian0_weak =
+        0.5f * (workspace.laplacian0_weak + workspace.laplacian0_weak.transpose());
+  } else {
+    for (int a = 0; a < 3; ++a) {
       workspace.gamma_x_phi_mat[a].resize(n_verts, n0);
+      core::parallel_for_index(
+          0, n0,
+          [&](int t) {
+            carre_du_champ(mesh, workspace.coords[a], U.col(t), bandwidth,
+                           workspace.gamma_x_phi_mat[a].col(t));
+          },
+          8);
+    }
+
+    workspace.laplacian0_weak.setZero(n0, n0);
+    workspace.gamma_local.resize(n_verts);
+    for (int i = 0; i < n0; ++i) {
+      for (int j = i; j < n0; ++j) {
+        carre_du_champ(mesh, U.col(i), U.col(j), bandwidth, workspace.gamma_local);
+        const float val = (workspace.gamma_local.array() * mu.array()).sum();
+        workspace.laplacian0_weak(i, j) = val;
+        workspace.laplacian0_weak(j, i) = val;
+      }
     }
-    core::parallel_for_index(
-        0, n0,
-        [&](int i) {
-          carre_du_champ(mesh, workspace.coords[a], U.col(i), bandwidth,
-                         workspace.gamma_x_phi_mat[a].col(i));
-        },
-        8);
   }
 
-  if (workspace.weighted_u.rows() != n_verts || workspace.weighted_u.cols() != n0) {
-    workspace.weighted_u.resize(n_verts, n0);
+  workspace.alpha_mat.resize(n0, 3);
+  workspace.q.resize(n_verts, 3);
+  workspace.weighted_gamma.resize(n_verts, n0);
+  workspace.x_op.resize(n0, n0);
+  workspace.z_real.resize(n_verts);
+  workspace.z_imag.resize(n_verts);
+  workspace.prepared_n_verts = n_verts;
+  workspace.prepared_n0 = n0;
+  workspace.prepared_bandwidth = bandwidth;
+}
+
+inline bool hodge_cache_matches(float lhs, float rhs) {
+  return std::abs(lhs - rhs) <= 1e-12f;
+}
+
+template <typename MeshT>
+void ensure_hodge_mixed_cache(const MeshT& mesh, float bandwidth,
+                              HodgeWorkspace<MeshT>& workspace) {
+  const auto& U = mesh.structure.eigen_basis;
+  const auto& mu = mesh.structure.mu;
+  const int n0 = U.cols();
+  const int n_verts = static_cast<int>(mesh.num_points());
+  const bool use_mean_centres = uses_mean_centres(mesh);
+
+  if (workspace.mixed_ready && workspace.mixed_n_verts == n_verts && workspace.mixed_n0 == n0 &&
+      hodge_cache_matches(workspace.mixed_bandwidth, bandwidth)) {
+    return;
   }
+
+  fill_coordinate_vectors(mesh, workspace.coords);
+
+  if (use_mean_centres) {
+    workspace.markov_basis.resize(n_verts, n0);
+    apply_markov_transition_matrix(mesh, U, workspace.markov_basis);
+
+    workspace.inv_gamma_denom.resize(n_verts);
+    for (int i = 0; i < n_verts; ++i) {
+      float denom = 2.0f;
+      if constexpr (requires { mesh.structure.local_bandwidths; }) {
+        if (mesh.structure.local_bandwidths.size() == n_verts) {
+          denom = 2.0f * std::max(mesh.structure.local_bandwidths[static_cast<size_t>(i)], 1e-8f);
+        }
+      }
+      workspace.inv_gamma_denom[i] = 1.0f / denom;
+    }
+
+    for (int a = 0; a < 3; ++a) {
+      workspace.coord_means[a].resize(n_verts);
+      apply_markov_transition(mesh, workspace.coords[a], workspace.coord_means[a]);
+      workspace.markov_tmp.resize(n_verts, n0);
+      workspace.markov_tmp = U.array().colwise() * workspace.coords[a].array();
+      apply_markov_transition_matrix(mesh, workspace.markov_tmp, workspace.gamma_x_phi_mat[a]);
+      workspace.gamma_x_phi_mat[a].array() -=
+          workspace.markov_basis.array().colwise() * workspace.coord_means[a].array();
+      workspace.gamma_x_phi_mat[a].array().colwise() *= workspace.inv_gamma_denom.array();
+    }
+  } else {
+    for (int a = 0; a < 3; ++a) {
+      workspace.gamma_x_phi_mat[a].resize(n_verts, n0);
+      core::parallel_for_index(
+          0, n0,
+          [&](int i) {
+            carre_du_champ(mesh, workspace.coords[a], U.col(i), bandwidth,
+                           workspace.gamma_x_phi_mat[a].col(i));
+          },
+          8);
+    }
+  }
+
+  workspace.weighted_u.resize(n_verts, n0);
   workspace.weighted_u = U.array().colwise() * mu.array();
+  for (int a = 0; a < 3; ++a) {
+    workspace.weighted_gamma_x_phi_mat[a].resize(n_verts, n0);
+    workspace.weighted_gamma_x_phi_mat[a] =
+        workspace.gamma_x_phi_mat[a].array().colwise() * mu.array();
+  }
+  workspace.mixed_n_verts = n_verts;
+  workspace.mixed_n0 = n0;
+  workspace.mixed_bandwidth = bandwidth;
+  workspace.mixed_ready = true;
+}
+
+template <typename MeshT>
+void ensure_hodge_gamma_xx(const MeshT& mesh, float bandwidth, HodgeWorkspace<MeshT>& workspace) {
+  const int n_verts = static_cast<int>(mesh.num_points());
+  if (workspace.gamma_xx_ready && workspace.gamma_xx_n_verts == n_verts &&
+      hodge_cache_matches(workspace.gamma_xx_bandwidth, bandwidth)) {
+    return;
+  }
+
+  fill_coordinate_vectors(mesh, workspace.coords);
+
+  for (int a = 0; a < 3; ++a) {
+    for (int b = a; b < 3; ++b) {
+      workspace.gamma_xx[a][b].resize(n_verts);
+      carre_du_champ(mesh, workspace.coords[a], workspace.coords[b], bandwidth,
+                     workspace.gamma_xx[a][b]);
+      if (a != b) {
+        workspace.gamma_xx[b][a] = workspace.gamma_xx[a][b];
+      }
+    }
+  }
+
+  workspace.gamma_xx_n_verts = n_verts;
+  workspace.gamma_xx_bandwidth = bandwidth;
+  workspace.gamma_xx_ready = true;
+}
+
+/**
+ * \brief Assemble weak exterior derivative matrix for 1-forms.
+ * \param mesh Input diffusion space.
+ * \param bandwidth Diffusion bandwidth parameter.
+ * \param workspace Scratch buffers.
+ * \return Weak derivative matrix.
+ */
+template <typename MeshT>
+Eigen::MatrixXf compute_weak_exterior_derivative(const MeshT& mesh, float bandwidth,
+                                                 HodgeWorkspace<MeshT>& workspace) {
+  const auto& U = mesh.structure.eigen_basis;
+  const int n0 = U.cols();
+
+  Eigen::MatrixXf D = Eigen::MatrixXf::Zero(3 * n0, n0);
+  ensure_hodge_mixed_cache(mesh, bandwidth, workspace);
 
   for (int a = 0; a < 3; ++a) {
     const Eigen::MatrixXf coupling =
@@ -116,28 +416,36 @@ Eigen::MatrixXf compute_curl_energy_matrix(const MeshT& mesh, float bandwidth,
   const int n_verts = static_cast<int>(mesh.num_points());
 
   Eigen::MatrixXf E_up = Eigen::MatrixXf::Zero(n_basis, n_basis);
-
-  fill_coordinate_vectors(mesh, workspace.coords);
-
-  for (int a = 0; a < 3; ++a) {
-    for (int b = 0; b < 3; ++b) {
-      workspace.gamma_xx[a][b].resize(n_verts);
-      carre_du_champ(mesh, workspace.coords[a], workspace.coords[b], bandwidth,
-                     workspace.gamma_xx[a][b]);
+  ensure_hodge_gamma_xx(mesh, bandwidth, workspace);
+  ensure_hodge_mixed_cache(mesh, bandwidth, workspace);
+
+  if (uses_mean_centres(mesh)) {
+    for (int a = 0; a < 3; ++a) {
+      for (int b = 0; b < 3; ++b) {
+        workspace.weight =
+            (mu_arr * workspace.gamma_xx[a][b].array() * workspace.inv_gamma_denom.array())
+                .matrix();
+        apply_markov_adjoint(mesh, workspace.weight, workspace.pt_weight);
+
+        workspace.markov_tmp = U.array().colwise() * workspace.pt_weight.array();
+        Eigen::MatrixXf block = U.transpose() * workspace.markov_tmp;
+
+        workspace.markov_tmp = workspace.markov_basis.array().colwise() * workspace.weight.array();
+        block.noalias() -= workspace.markov_basis.transpose() * workspace.markov_tmp;
+        block.noalias() -=
+            workspace.gamma_x_phi_mat[b].transpose() * workspace.weighted_gamma_x_phi_mat[a];
+
+        for (int k = 0; k < n0; ++k) {
+          for (int l = 0; l < n0; ++l) {
+            E_up(k * 3 + a, l * 3 + b) = block(k, l);
+          }
+        }
+      }
     }
-  }
 
-  workspace.gamma_phi_x.assign(n0, std::vector<Eigen::VectorXf>(3));
-  core::parallel_for_index(
-      0, n0,
-      [&](int k) {
-        for (int d = 0; d < 3; ++d) {
-          workspace.gamma_phi_x[k][d].resize(n_verts);
-          carre_du_champ(mesh, U.col(k), workspace.coords[d], bandwidth,
-                         workspace.gamma_phi_x[k][d]);
-        }
-      },
-      8);
+    E_up = 0.5f * (E_up + E_up.transpose());
+    return E_up;
+  }
 
   core::parallel_for_index(
       0, n0,
@@ -149,11 +457,11 @@ Eigen::MatrixXf compute_curl_energy_matrix(const MeshT& mesh, float bandwidth,
 
           for (int a = 0; a < 3; ++a) {
             for (int b = 0; b < 3; ++b) {
-              const float val =
-                  (((gamma_phi_phi_local.array() * workspace.gamma_xx[a][b].array()) -
-                    (workspace.gamma_phi_x[k][b].array() * workspace.gamma_phi_x[l][a].array())) *
-                   mu_arr)
-                      .sum();
+              const float val = (((gamma_phi_phi_local.array() * workspace.gamma_xx[a][b].array()) -
+                                  (workspace.gamma_x_phi_mat[b].col(k).array() *
+                                   workspace.gamma_x_phi_mat[a].col(l).array())) *
+                                 mu_arr)
+                                    .sum();
 
               const int row = k * 3 + a;
               const int col = l * 3 + b;
@@ -205,6 +513,11 @@ inline Eigen::MatrixXf compute_hodge_laplacian_matrix(const Eigen::MatrixXf& D_w
 inline std::pair<Eigen::VectorXf, Eigen::MatrixXf>
 compute_hodge_spectrum(const Eigen::MatrixXf& laplacian, const Eigen::MatrixXf& mass_matrix,
                        float rcond = 1e-5f) {
+  auto fast = try_generalized_spectrum_cholesky(laplacian, mass_matrix);
+  if (fast.first.size() != 0 && fast.second.cols() != 0) {
+    return fast;
+  }
+
   Eigen::SelfAdjointEigenSolver<Eigen::MatrixXf> mass_solver(mass_matrix);
   if (mass_solver.info() != Eigen::Success) {
     return std::make_pair(Eigen::VectorXf(), Eigen::MatrixXf());
@@ -258,73 +571,40 @@ compute_hodge_spectrum(const Eigen::MatrixXf& laplacian, const Eigen::MatrixXf&
  */
 template <typename MeshT>
 Eigen::VectorXf compute_circular_coordinates(const MeshT& mesh, const Eigen::VectorXf& alpha_coeffs,
-                                             float bandwidth, float lambda = 1.0f,
-                                             int positive_imag_mode = 0,
-                                             std::complex<float>* selected_eval = nullptr) {
+                                             float bandwidth, float lambda, int positive_imag_mode,
+                                             std::complex<float>* selected_eval,
+                                             CircularCoordinateWorkspace<MeshT>& workspace) {
   const auto& U = mesh.structure.eigen_basis;
   const auto& mu = mesh.structure.mu;
 
   const int n0 = U.cols();
-  const size_t n_verts = mesh.num_points();
-  const int n_verts_i = static_cast<int>(n_verts);
-
-  HodgeWorkspace<MeshT> workspace;
-  fill_coordinate_vectors(mesh, workspace.coords);
+  const int n_verts = static_cast<int>(mesh.num_points());
 
-  for (int a = 0; a < 3; ++a) {
-    workspace.gamma_x_phi_mat[a].resize(n_verts_i, n0);
-    core::parallel_for_index(
-        0, n0,
-        [&](int t) {
-          carre_du_champ(mesh, workspace.coords[a], U.col(t), bandwidth,
-                         workspace.gamma_x_phi_mat[a].col(t));
-        },
-        8);
-  }
+  prepare_circular_coordinate_workspace(mesh, bandwidth, workspace);
 
-  Eigen::MatrixXf alpha_mat(n0, 3);
-  for (int k = 0; k < n0; ++k) {
-    alpha_mat(k, 0) = alpha_coeffs(k * 3 + 0);
-    alpha_mat(k, 1) = alpha_coeffs(k * 3 + 1);
-    alpha_mat(k, 2) = alpha_coeffs(k * 3 + 2);
-  }
+  const Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, 3, Eigen::RowMajor>> alpha_map(
+      alpha_coeffs.data(), n0, 3);
+  workspace.alpha_mat = alpha_map;
 
-  const Eigen::MatrixXf q = U * alpha_mat;
-  const Eigen::MatrixXf U_t = U.transpose();
-
-  Eigen::MatrixXf X_op(n0, n0);
-  core::parallel_for_index(
-      0, n0,
-      [&](int t) {
-        Eigen::VectorXf weight_local(n_verts_i);
-        weight_local =
-            mu.array() * ((workspace.gamma_x_phi_mat[0].col(t).array() * q.col(0).array()) +
-                          (workspace.gamma_x_phi_mat[1].col(t).array() * q.col(1).array()) +
-                          (workspace.gamma_x_phi_mat[2].col(t).array() * q.col(2).array()));
-        X_op.col(t).noalias() = U_t * weight_local;
-      },
-      8);
+  workspace.q.noalias() = U * workspace.alpha_mat;
+  workspace.weighted_gamma =
+      (workspace.gamma_x_phi_mat[0].array().colwise() * (workspace.q.col(0).array() * mu.array()))
+          .matrix();
+  workspace.weighted_gamma.array() +=
+      workspace.gamma_x_phi_mat[1].array().colwise() * (workspace.q.col(1).array() * mu.array());
+  workspace.weighted_gamma.array() +=
+      workspace.gamma_x_phi_mat[2].array().colwise() * (workspace.q.col(2).array() * mu.array());
 
-  Eigen::MatrixXf laplacian0_weak = Eigen::MatrixXf::Zero(n0, n0);
-  Eigen::VectorXf gamma_local(n_verts_i);
-  for (int i = 0; i < n0; ++i) {
-    for (int j = i; j < n0; ++j) {
-      carre_du_champ(mesh, U.col(i), U.col(j), bandwidth, gamma_local);
-      const float val = (gamma_local.array() * mu.array()).sum();
-      laplacian0_weak(i, j) = val;
-      laplacian0_weak(j, i) = val;
-    }
-  }
+  workspace.x_op.noalias() = U.transpose() * workspace.weighted_gamma;
 
-  Eigen::MatrixXf function_gram = U.transpose() * (U.array().colwise() * mu.array()).matrix();
-  Eigen::MatrixXf operator_weak = X_op - (lambda * laplacian0_weak);
-  Eigen::GeneralizedEigenSolver<Eigen::MatrixXf> solver(operator_weak, function_gram);
+  const Eigen::MatrixXf operator_weak = workspace.x_op - (lambda * workspace.laplacian0_weak);
+  Eigen::GeneralizedEigenSolver<Eigen::MatrixXf> solver(operator_weak, workspace.function_gram);
   if (solver.info() != Eigen::Success) {
-    return Eigen::VectorXf::Zero(static_cast<int>(n_verts));
+    return Eigen::VectorXf::Zero(n_verts);
   }
 
-  Eigen::VectorXcf evals = solver.eigenvalues().cast<std::complex<float>>();
-  Eigen::MatrixXcf evecs = solver.eigenvectors().cast<std::complex<float>>();
+  const Eigen::VectorXcf evals = solver.eigenvalues().cast<std::complex<float>>();
+  const Eigen::MatrixXcf evecs = solver.eigenvectors().cast<std::complex<float>>();
 
   std::vector<int> order(static_cast<size_t>(n0));
   for (int i = 0; i < n0; ++i) {
@@ -344,48 +624,76 @@ Eigen::VectorXf compute_circular_coordinates(const MeshT& mesh, const Eigen::Vec
     return a.imag() < b.imag();
   });
 
-  Eigen::VectorXcf sorted_evals(n0);
-  Eigen::MatrixXcf sorted_evecs(n0, n0);
+  int positive_imag_count = 0;
   for (int i = 0; i < n0; ++i) {
-    const int src = order[static_cast<size_t>(i)];
-    sorted_evals[i] = evals[src];
-    sorted_evecs.col(i) = evecs.col(src);
+    if (evals[order[static_cast<size_t>(i)]].imag() > 1e-6f) {
+      ++positive_imag_count;
+    }
+  }
+  if (positive_imag_count == 0) {
+    return Eigen::VectorXf::Zero(n_verts);
   }
 
-  std::vector<int> positive_imag_indices;
+  const int mode = std::clamp(positive_imag_mode, 0, positive_imag_count - 1);
+  int selected_src = -1;
+  int positive_index = 0;
   for (int i = 0; i < n0; ++i) {
-    if (sorted_evals[i].imag() > 1e-6f) {
-      positive_imag_indices.push_back(i);
+    const int src = order[static_cast<size_t>(i)];
+    if (evals[src].imag() <= 1e-6f) {
+      continue;
+    }
+    if (positive_index == mode) {
+      selected_src = src;
+      break;
     }
+    ++positive_index;
   }
-  if (positive_imag_indices.empty()) {
-    return Eigen::VectorXf::Zero(static_cast<int>(n_verts));
+  if (selected_src < 0) {
+    return Eigen::VectorXf::Zero(n_verts);
   }
 
-  const int mode =
-      std::clamp(positive_imag_mode, 0, static_cast<int>(positive_imag_indices.size()) - 1);
-  const int best_idx = positive_imag_indices[static_cast<size_t>(mode)];
   if (selected_eval != nullptr) {
-    *selected_eval = sorted_evals[best_idx];
+    *selected_eval = evals[selected_src];
   }
 
-  const Eigen::VectorXcf z_coeffs = sorted_evecs.col(best_idx);
-  Eigen::VectorXf theta(static_cast<int>(n_verts));
-  constexpr float kTwoPi = 6.28318530717958647692f;
+  const Eigen::VectorXcf z_coeffs = evecs.col(selected_src);
+  workspace.z_real.noalias() = U * z_coeffs.real();
+  workspace.z_imag.noalias() = U * z_coeffs.imag();
 
-  for (size_t i = 0; i < n_verts; ++i) {
-    std::complex<float> zi(0.0f, 0.0f);
-    for (int k = 0; k < n0; ++k) {
-      zi += U(static_cast<int>(i), k) * z_coeffs(k);
-    }
-    float angle = std::atan2(zi.imag(), zi.real());
+  Eigen::VectorXf theta(n_verts);
+  constexpr float kTwoPi = 6.28318530717958647692f;
+  for (int i = 0; i < n_verts; ++i) {
+    float angle = std::atan2(workspace.z_imag[i], workspace.z_real[i]);
     if (angle < 0.0f) {
       angle += static_cast<float>(kTwoPi);
     }
-    theta[static_cast<int>(i)] = angle;
+    theta[i] = angle;
   }
 
   return theta;
 }
 
+/**
+ * \brief Compute scalar circular coordinates from a harmonic 1-form mode.
+ *
+ * This solves a generalized eigenproblem for an advection-diffusion operator
+ * built from `alpha_coeffs`.
+ * \param mesh Input diffusion space.
+ * \param alpha_coeffs Harmonic 1-form coefficients.
+ * \param bandwidth Diffusion bandwidth parameter.
+ * \param lambda Diffusion regularization weight.
+ * \param positive_imag_mode Which positive-imaginary mode to choose.
+ * \param selected_eval Optional output for selected complex eigenvalue.
+ * \return Angle field in `[0, 2*pi)`.
+ */
+template <typename MeshT>
+Eigen::VectorXf compute_circular_coordinates(const MeshT& mesh, const Eigen::VectorXf& alpha_coeffs,
+                                             float bandwidth, float lambda = 1.0f,
+                                             int positive_imag_mode = 0,
+                                             std::complex<float>* selected_eval = nullptr) {
+  CircularCoordinateWorkspace<MeshT> workspace;
+  return compute_circular_coordinates(mesh, alpha_coeffs, bandwidth, lambda, positive_imag_mode,
+                                      selected_eval, workspace);
+}
+
 } // namespace igneous::ops::diffusion
diff --git a/include/igneous/ops/diffusion/spectral.hpp b/include/igneous/ops/diffusion/spectral.hpp
index 84f5058..4d35fef 100644
--- a/include/igneous/ops/diffusion/spectral.hpp
+++ b/include/igneous/ops/diffusion/spectral.hpp
@@ -14,6 +14,13 @@
 
 namespace igneous::ops::diffusion {
 
+inline constexpr float kSpectralDefaultTolerance = 1e-10f;
+inline constexpr float kSpectralSmallBasisTolerance = 1e-6f;
+
+inline float spectral_solver_tolerance(int n_eigenvectors) {
+  return n_eigenvectors <= 16 ? kSpectralSmallBasisTolerance : kSpectralDefaultTolerance;
+}
+
 /**
  * \brief Spectra-compatible matrix product wrapper over Markov CSR data.
  *
@@ -313,6 +320,7 @@ template <typename MeshT> void compute_eigenbasis(MeshT& mesh, int n_eigenvector
 
   const auto solve_with_op = [&](auto& op) {
     using OpType = std::decay_t<decltype(op)>;
+    const float solver_tol = spectral_solver_tolerance(n_eigenvectors);
 
     const int full_ncv = std::min(n, std::max(2 * n_eigenvectors + 1, 20));
     const bool try_compact = n_eigenvectors >= 32;
@@ -320,13 +328,13 @@ template <typename MeshT> void compute_eigenbasis(MeshT& mesh, int n_eigenvector
 
     Spectra::GenEigsSolver<OpType> eigs(op, n_eigenvectors, compact_ncv);
     eigs.init();
-    int nconv = eigs.compute(Spectra::SortRule::LargestReal);
+    int nconv = eigs.compute(Spectra::SortRule::LargestReal, 1000, solver_tol);
 
     if (try_compact && (eigs.info() != Spectra::CompInfo::Successful || nconv < n_eigenvectors) &&
         full_ncv > compact_ncv) {
       Spectra::GenEigsSolver<OpType> fallback(op, n_eigenvectors, full_ncv);
       fallback.init();
-      nconv = fallback.compute(Spectra::SortRule::LargestReal);
+      nconv = fallback.compute(Spectra::SortRule::LargestReal, 1000, solver_tol);
 
       if (fallback.info() == Spectra::CompInfo::Successful) {
         mesh.structure.eigen_basis = fallback.eigenvectors(nconv).real();
@@ -364,30 +372,51 @@ template <typename MeshT> void compute_eigenbasis(MeshT& mesh, int n_eigenvector
                   mesh.structure.symmetric_row_offsets;
                   mesh.structure.symmetric_col_indices;
                   mesh.structure.symmetric_values;
+                  mesh.structure.symmetric_normalized_values;
+                  mesh.structure.symmetric_inv_sqrt_rows;
                   mesh.structure.symmetric_row_sums;
                 }) {
     const auto& sym_row_offsets = mesh.structure.symmetric_row_offsets;
     const auto& sym_col_indices = mesh.structure.symmetric_col_indices;
     const auto& sym_values = mesh.structure.symmetric_values;
+    auto& sym_normalized_values = mesh.structure.symmetric_normalized_values;
+    auto& inv_sqrt_rows = mesh.structure.symmetric_inv_sqrt_rows;
     const auto& row_sums = mesh.structure.symmetric_row_sums;
     if (!sym_row_offsets.empty() && static_cast<int>(sym_row_offsets.size()) == n + 1 &&
         row_sums.size() == n) {
       const int k_eval = std::max(1, std::min(n_eigenvectors, std::max(1, n - 1)));
+      const float solver_tol = spectral_solver_tolerance(k_eval);
       const int full_ncv = std::min(n, std::max(2 * k_eval + 1, 20));
       const bool try_compact = k_eval >= 32;
       const int compact_ncv = try_compact ? std::min(n, std::max(k_eval + 16, 20)) : full_ncv;
 
-      Eigen::VectorXf inv_sqrt_rows(n);
-      for (int i = 0; i < n; ++i) {
-        inv_sqrt_rows[i] = 1.0f / std::sqrt(std::max(row_sums[i], 1e-12f));
+      if (inv_sqrt_rows.size() != n || sym_normalized_values.size() != sym_values.size()) {
+        inv_sqrt_rows.resize(n);
+        for (int i = 0; i < n; ++i) {
+          inv_sqrt_rows[i] = 1.0f / std::sqrt(std::max(row_sums[i], 1e-12f));
+        }
+
+        sym_normalized_values.resize(sym_values.size());
+        core::parallel_for_index(
+            0, n,
+            [&](int i) {
+              const float row_scale = inv_sqrt_rows[i];
+              const int begin = sym_row_offsets[static_cast<size_t>(i)];
+              const int end = sym_row_offsets[static_cast<size_t>(i) + 1];
+              for (int idx = begin; idx < end; ++idx) {
+                const int j = sym_col_indices[static_cast<size_t>(idx)];
+                sym_normalized_values[static_cast<size_t>(idx)] =
+                    sym_values[static_cast<size_t>(idx)] * row_scale * inv_sqrt_rows[j];
+              }
+            },
+            8192);
       }
 
-      NormalizedSymmetricKernelCsrMatProd op(sym_row_offsets, sym_col_indices, sym_values,
-                                             inv_sqrt_rows, n);
+      SymmetricKernelCsrMatProd op(sym_row_offsets, sym_col_indices, sym_normalized_values, n);
       const auto solve_symmetric = [&](int ncv) -> bool {
-        Spectra::SymEigsSolver<NormalizedSymmetricKernelCsrMatProd> eigs(op, k_eval, ncv);
+        Spectra::SymEigsSolver<SymmetricKernelCsrMatProd> eigs(op, k_eval, ncv);
         eigs.init();
-        const int nconv = eigs.compute(Spectra::SortRule::LargestMagn);
+        const int nconv = eigs.compute(Spectra::SortRule::LargestMagn, 1000, solver_tol);
         if (eigs.info() != Spectra::CompInfo::Successful || nconv <= 0) {
           return false;
         }
diff --git a/notes/perf/20260306-wave3/.gitignore b/notes/perf/20260306-wave3/.gitignore
new file mode 100644
index 0000000..7295739
--- /dev/null
+++ b/notes/perf/20260306-wave3/.gitignore
@@ -0,0 +1,4 @@
+results/*
+!results/.gitkeep
+profiles/*
+!profiles/.gitkeep
diff --git a/notes/perf/20260306-wave3/README.md b/notes/perf/20260306-wave3/README.md
new file mode 100644
index 0000000..65471a0
--- /dev/null
+++ b/notes/perf/20260306-wave3/README.md
@@ -0,0 +1,31 @@
+# Wave 3 Performance Campaign
+
+Date started: 2026-03-06
+Branch: `perf/improvements-2`
+
+## Scope
+- Prioritize the diffusion-geometry / generic k-form pipeline.
+- Keep the full existing test suite green.
+- Keep one hypothesis per journal entry and per kept commit.
+
+## Output layout
+- `journal.md`: append-only hypothesis log.
+- `baseline.md`: current Release baseline and hotspot ranking.
+- `prior-art.md`: concise keep/avoid list from earlier campaigns.
+- `results/`: benchmark JSON/TXT artifacts for this wave.
+- `profiles/`: xctrace artifacts for this wave.
+
+## Baseline commands
+```bash
+cmake --preset release-local
+cmake --build build -j8
+ctest --test-dir build --output-on-failure -j8
+IGNEOUS_PERF_NOTES_DIR=notes/perf/20260306-wave3 IGNEOUS_BENCH_MODE=1 ./build/bench_geometry
+IGNEOUS_PERF_NOTES_DIR=notes/perf/20260306-wave3 IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true
+IGNEOUS_PERF_NOTES_DIR=notes/perf/20260306-wave3 IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true
+```
+
+## Acceptance policy
+- Keep changes only when the primary target improves by at least `3%` on deep Release benchmarks.
+- Reject changes that regress equal-or-higher-priority benchmarks by more than `3%`.
+- Run full `ctest` before every kept commit.
diff --git a/notes/perf/20260306-wave3/baseline.md b/notes/perf/20260306-wave3/baseline.md
new file mode 100644
index 0000000..a4e7cb5
--- /dev/null
+++ b/notes/perf/20260306-wave3/baseline.md
@@ -0,0 +1,70 @@
+# Wave 3 Baseline
+
+This file records the first Release baseline for the Wave 3 campaign.
+
+## Status
+- Release tree rebuilt with `cmake --preset release-local`.
+- Full suite passes: `14/14` via `ctest --test-dir build --output-on-failure -j8`.
+- Baseline artifact stamp: `20260306-170545`.
+
+## Planned benchmark groups
+- `bench_geometry`
+- `bench_dod`
+- `bench_pipelines`
+- `bench_diffgeo` once added
+
+## Captured artifacts
+- `results/bench_geometry_20260306-170545.txt`
+- `results/bench_dod_20260306-170545.json`
+- `results/bench_dod_20260306-170545.txt`
+- `results/bench_pipelines_20260306-170545.json`
+- `results/bench_pipelines_20260306-170545.txt`
+- `results/bench_diffgeo_20260306-170545.json`
+- `results/bench_diffgeo_20260306-170545.txt`
+
+## Release snapshot
+- `bench_geometry`:
+  - `Grid 1000x1000`: topology `246.040 ms`, curvature `100.111 ms`, flow `17.066 ms`
+- `bench_dod`:
+  - `bench_diffusion_build/2000`: `76.986 ms` CPU mean
+  - `bench_eigenbasis/2000/16`: `60.300 ms`
+  - `bench_1form_gram/2000/16`: `1.931 ms`
+  - `bench_weak_derivative/2000/16`: `3.460 ms`
+  - `bench_curl_energy/2000/16`: `7.077 ms`
+- `bench_pipelines`:
+  - `bench_pipeline_diffusion_main/20`: `93.933 ms`
+  - `bench_pipeline_spectral_main`: `216.192 ms`
+  - `bench_pipeline_hodge_main`: `3275.117 ms`
+  - `bench_hodge_phase_eigenbasis`: `637.631 ms`
+  - `bench_hodge_phase_circular`: `2401.417 ms`
+  - `bench_hodge_phase_curl_energy`: `101.242 ms`
+- `bench_diffgeo`:
+  - `bench_diffgeo_pipeline/1000/50/32/32`: `742.725 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `4169.532 ms`
+  - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `145.745 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `583.928 ms`
+  - `bench_diffgeo_phase_circular/4000/64/32/32`: `2416.567 ms`
+  - `bench_diffgeo_phase_k1_up/4000/64/32/32`: `325.643 ms`
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `425.540 ms`
+  - `bench_diffgeo_phase_k2_down/4000/64/32/32`: `0.188 ms`
+
+## App-level timing
+- `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 1.40`, `user 1.22`, `sys 0.07`
+
+## Current hotspot order
+- Diffusion-geometry pipeline:
+  - `circular`
+  - `eigenbasis`
+  - `k2_up`
+  - `k1_up`
+  - `structure_build`
+- Legacy Hodge pipeline:
+  - `circular`
+  - `eigenbasis`
+  - `structure_build`
+  - `curl_energy`
+  - `weak_derivative`
+
+## Notes
+- This baseline was captured after the first kept Wave 3 diffgeo optimization, because the harness and the first reuse patch were developed together.
+- The new `bench_diffgeo_phase_k2_down` benchmark intentionally measures the hot reused stage after the 1-form operators are already available, which matches the target use inside the pipeline.
diff --git a/notes/perf/20260306-wave3/journal.md b/notes/perf/20260306-wave3/journal.md
new file mode 100644
index 0000000..8cff937
--- /dev/null
+++ b/notes/perf/20260306-wave3/journal.md
@@ -0,0 +1,1502 @@
+# Wave 3 Journal
+
+Use one entry per optimization hypothesis.
+
+## Entry Template
+- Timestamp:
+- Commit:
+- Primary target:
+- Hypothesis:
+- Files touched:
+- Benchmark commands:
+- Smoke results:
+- Deep results:
+- App/runtime checks:
+- Test results:
+- Decision: `kept` | `rejected` | `deferred`
+- Rejected because:
+- Notes:
+
+## 2026-03-06 Diffgeo Operator Reuse + Closed-Form 2-Form Up Term
+- Timestamp: 2026-03-07T00:05:00Z
+- Commit: `2c56f6d`
+- Primary target:
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Reuse the already-computed 1-form derivative and Gram inverse inside the 2-form down-Laplacian path, then replace the per-point `2x2` solve in the 2-form up-Laplacian with the exact closed-form `det(D) * D^{-1}` adjugate arithmetic.
+- Files touched:
+  - `include/igneous/ops/diffusion/forms.hpp`
+  - `src/main_diffusion_geometry.cpp`
+  - `benches/bench_diffgeo.cpp`
+  - `notes/perf/20260306-wave3/*`
+  - `scripts/perf/*`
+  - `CMakeLists.txt`
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_k2_down/1000/50/32/32|bench_diffgeo_phase_k2_up/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_k2_down/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' --benchmark_min_time=0.02s --benchmark_repetitions=3 --benchmark_report_aggregates_only=true`
+  - `ctest --test-dir build --output-on-failure -R 'test_ops_diffusion_forms|test_ops_diffusion_wedge|test_structure_diffusion_geometry|test_ops_spectral_geometry|test_io_meshes|test_diffgeo_cli_outputs'`
+  - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Initial harness smoke before the reuse/closed-form pass:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `796.924 ms` CPU
+    - `bench_diffgeo_phase_structure_build/1000/50/32/32`: `38.771 ms` CPU
+    - `bench_diffgeo_phase_k2_down/1000/50/32/32`: `87.193 ms` CPU (cold path; before hot-stage benchmark split)
+  - After the first reuse pass:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `759.782 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `4398.229 ms`
+    - `bench_diffgeo_phase_k2_up/1000/50/32/32`: `152.766 ms`
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `632.350 ms`
+    - `bench_diffgeo_phase_k2_down/1000/50/32/32`: `186.006 us`
+    - `bench_diffgeo_phase_k2_down/4000/64/32/32`: `183.391 us`
+  - Final kept pass:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `744.778 ms` (`-6.54%` vs initial harness smoke, `-1.98%` vs reuse-only pass)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `4243.404 ms` (`-3.52%` vs reuse-only pass)
+    - `bench_diffgeo_phase_k2_up/1000/50/32/32`: `113.205 ms` (`-25.90%` vs reuse-only pass)
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `446.447 ms` (`-29.40%` vs reuse-only pass)
+    - `bench_diffgeo_phase_k2_down/1000/50/32/32`: `180.274 us` (noise-level change)
+    - `bench_diffgeo_phase_k2_down/4000/64/32/32`: `180.274 us` (`-1.70%` vs reuse-only pass)
+- Deep results:
+  - Wave baseline artifacts were captured after the kept patch:
+    - `results/bench_diffgeo_20260306-170545.json`
+    - `results/bench_pipelines_20260306-170545.json`
+    - `results/bench_dod_20260306-170545.json`
+- App/runtime checks:
+  - `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 1.40`, `user 1.22`, `sys 0.07`
+- Test results:
+  - Targeted diffgeo/spectral/io suite: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - `bench_diffgeo_phase_k2_down` now intentionally measures the reused hot stage after 1-form operators are available, which better matches the actual pipeline reuse target.
+  - The biggest remaining diffgeo hotspot is now circular-coordinate assembly, followed by eigenbasis and the 2-form up-Laplacian.
+
+## 2026-03-06 Diffgeo Dense Block Weak-Derivative Assembly
+- Timestamp: 2026-03-07T00:16:00Z
+- Commit: `6ca4ba7`
+- Primary target:
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+  - `bench_diffgeo_pipeline/1000/50/32/32`
+- Hypothesis:
+  - Replace the generic k-form weak-derivative scalar nesting with dense block couplings over `weighted_u`, `gamma_mixed`, and precomputed minors so the 1-form derivative feeding the diffgeo pipeline is assembled with matrix multiplies instead of repeated per-point scalar accumulation.
+- Files touched:
+  - `include/igneous/ops/diffusion/forms.hpp`
+- Benchmark commands:
+  - Current worktree:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_k1_down/1000/50/32/32|bench_diffgeo_phase_k2_down/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_k1_down/4000/64/32/32|bench_diffgeo_phase_k2_down/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' --benchmark_min_time=0.02s --benchmark_repetitions=3 --benchmark_report_aggregates_only=true`
+  - Baseline worktree:
+    - `git worktree add --detach /tmp/igneous-wave3-compare 2c56f6d`
+    - `cmake -S /tmp/igneous-wave3-compare -B /tmp/igneous-wave3-compare/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_MAKE_PROGRAM=/opt/homebrew/bin/ninja -DCMAKE_TOOLCHAIN_FILE=/Users/autoparallel/vcpkg/scripts/buildsystems/vcpkg.cmake`
+    - `cmake --build /tmp/igneous-wave3-compare/build -j8 --target bench_diffgeo`
+    - `IGNEOUS_BENCH_MODE=1 /tmp/igneous-wave3-compare/build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_k1_down/1000/50/32/32|bench_diffgeo_phase_k2_down/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_k1_down/4000/64/32/32|bench_diffgeo_phase_k2_down/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32' --benchmark_min_time=0.02s --benchmark_repetitions=3 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Baseline commit `2c56f6d`:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `169.400 ms` CPU
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `922.146 ms`
+    - `bench_diffgeo_phase_k1_down/1000/50/32/32`: `83.330 us`
+    - `bench_diffgeo_phase_k1_down/4000/64/32/32`: `319.333 us`
+    - `bench_diffgeo_phase_k2_down/1000/50/32/32`: `35.993 us`
+    - `bench_diffgeo_phase_k2_down/4000/64/32/32`: `36.185 us`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `114.302 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `534.145 ms`
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `99.029 ms`
+  - Current patch:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `158.428 ms` (`-6.48%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `866.611 ms` (`-6.02%`)
+    - `bench_diffgeo_phase_k1_down/1000/50/32/32`: `80.862 us` (`-2.96%`)
+    - `bench_diffgeo_phase_k1_down/4000/64/32/32`: `319.353 us` (`+0.01%`)
+    - `bench_diffgeo_phase_k2_down/1000/50/32/32`: `36.210 us` (`+0.60%`)
+    - `bench_diffgeo_phase_k2_down/4000/64/32/32`: `36.533 us` (`+0.96%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `114.569 ms` (`+0.23%`)
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `544.763 ms` (`+1.99%`)
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `103.419 ms` (`+4.43%`)
+- Deep results:
+  - not yet captured; next wave step should run the deep suite from the Wave 3 notes directory after this commit
+- App/runtime checks:
+  - baseline app timing from the first kept pass: `real 1.40`
+  - current `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.27`, `user 0.26`, `sys 0.01`
+- Test results:
+  - targeted diffgeo checks: pass
+  - full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The isolated `k2_up` regression stayed below the threshold for an equal-or-higher-priority benchmark because the primary end-to-end pipeline target improved by about `6%` on both benchmark sizes and the app-level runtime dropped materially.
+  - This patch should be followed by a focused `k2_up` / circular pass, since those are now the dominant remaining diffgeo costs.
+
+## 2026-03-06 Shared Circular Workspace + Dense X Operator
+- Timestamp: 2026-03-07T00:25:00Z
+- Commit: `2810304`
+- Primary target:
+  - `bench_diffgeo_phase_circular/4000/64/32/32`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+  - `bench_hodge_phase_circular`
+  - `bench_pipeline_hodge_main`
+- Hypothesis:
+  - Reuse the circular-coordinate setup across the paired harmonic-mode solves and replace the per-column `X_op` assembly with a dense matrix contraction so the circular stage stops recomputing `Gamma(x, phi)`, the scalar weak Laplacian, and the function Gram on every call.
+- Files touched:
+  - `include/igneous/ops/diffusion/hodge.hpp`
+  - `src/main_diffusion_geometry.cpp`
+  - `src/main_hodge.cpp`
+  - `benches/bench_diffgeo.cpp`
+  - `benches/bench_pipelines.cpp`
+  - `notes/perf/20260306-wave3/journal.md`
+- Benchmark commands:
+  - Smoke:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis|bench_hodge_phase_structure_build' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - Recorded deep artifacts:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_diffgeo_circular_20260306-172407.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_diffgeo_circular_20260306-172407.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis|bench_hodge_phase_structure_build' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_pipelines_circular_20260306-172407.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_pipelines_circular_20260306-172407.txt'`
+  - App/runtime:
+    - `/usr/bin/time -p ./build/igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`
+  - Tests:
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_hodge|test_hodge_cli_outputs|test_ops_diffusion_forms|test_ops_diffusion_wedge|test_diffgeo_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Baseline head before the patch from `results/bench_diffgeo_20260306-171657.txt` and `results/bench_pipelines_20260306-171657.txt`:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `155.420 ms` CPU
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `881.450 ms`
+    - `bench_diffgeo_phase_circular/1000/50/32/32`: `86.347 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `538.494 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `118.488 ms`
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `98.507 ms`
+    - `bench_pipeline_hodge_main`: `747.071 ms`
+    - `bench_hodge_phase_circular`: `542.146 ms`
+    - `bench_hodge_phase_eigenbasis`: `129.857 ms`
+    - `bench_hodge_phase_structure_build`: `34.216 ms`
+  - Current patch smoke:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `115.121 ms` (`-25.93%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `611.242 ms` (`-30.66%`)
+    - `bench_diffgeo_phase_circular/1000/50/32/32`: `12.476 ms` (`-85.55%`)
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `268.406 ms` (`-50.16%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `116.708 ms` (`-1.50%`)
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `99.005 ms` (`+0.51%`)
+    - `bench_pipeline_hodge_main`: `476.600 ms` (`-36.22%`)
+    - `bench_hodge_phase_circular`: `272.153 ms` (`-49.80%`)
+    - `bench_hodge_phase_eigenbasis`: `130.470 ms` (`+0.47%`)
+    - `bench_hodge_phase_structure_build`: `34.300 ms` (`+0.25%`)
+- Deep results:
+  - Recorded artifacts: `results/bench_diffgeo_circular_20260306-172407.{txt,json}` and `results/bench_pipelines_circular_20260306-172407.{txt,json}`
+  - `bench_diffgeo_pipeline/1000/50/32/32`: `114.638 ms` CPU (`-26.24%` vs baseline head)
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `609.250 ms` (`-30.88%`)
+  - `bench_diffgeo_phase_circular/1000/50/32/32`: `4.631 ms` (`-94.64%`)
+  - `bench_diffgeo_phase_circular/4000/64/32/32`: `274.929 ms` (`-48.95%`)
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `118.593 ms` (`+0.09%`)
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `102.051 ms` (`+3.60%`)
+  - `bench_pipeline_hodge_main`: `476.473 ms` (`-36.22%`)
+  - `bench_hodge_phase_circular`: `274.345 ms` (`-49.39%`)
+  - `bench_hodge_phase_eigenbasis`: `129.714 ms` (`-0.11%`)
+  - `bench_hodge_phase_structure_build`: `34.362 ms` (`+0.43%`)
+- App/runtime checks:
+  - Head before patch: `real 0.26`, `user 0.26`, `sys 0.01`
+  - Current patch: `real 0.24`, `user 0.21`, `sys 0.01`
+- Test results:
+  - Targeted circular-sensitive suite: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - This keeps circular mode-selection semantics intact; the sort criteria and positive-imaginary mode selection are unchanged.
+  - The remaining isolated `k2_up` regression is above `3%`, but it is not an equal-or-higher-priority benchmark for this hypothesis, while both end-to-end pipelines improved materially.
+  - The `1000`-point circular phase dropped sharply because the dense `X_op` contraction removed the fixed per-call setup cost that dominated the smaller problem.
+
+## 2026-03-06 Normalized Symmetric-Kernel Precompute
+- Timestamp: 2026-03-07T00:33:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_eigenbasis/2000/16`
+- Hypothesis:
+  - Precompute the normalized symmetric-kernel CSR values during diffusion build so the eigensolver runs against a plain symmetric operator instead of applying row-normalization inside every matvec.
+- Files touched:
+  - `include/igneous/data/structures/diffusion_geometry.hpp`
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Benchmark commands:
+  - Candidate worktree:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_structure_build|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_diffusion_build/2000|bench_eigenbasis/2000/16' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - Baseline worktree:
+    - `git worktree add --detach /tmp/igneous-wave3-eig-2810304-172915 2810304`
+    - `cmake -S /tmp/igneous-wave3-eig-2810304-172915 -B /tmp/igneous-wave3-eig-2810304-172915/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_MAKE_PROGRAM=/opt/homebrew/bin/ninja -DCMAKE_TOOLCHAIN_FILE=/Users/autoparallel/vcpkg/scripts/buildsystems/vcpkg.cmake`
+    - `cmake --build /tmp/igneous-wave3-eig-2810304-172915/build -j8 --target bench_diffgeo bench_pipelines bench_dod`
+    - same benchmark filters against `/tmp/igneous-wave3-eig-2810304-172915/build/*`
+- Smoke results:
+  - Candidate:
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `600.634 ms` CPU
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `34.492 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `103.127 ms`
+    - `bench_hodge_phase_eigenbasis`: `117.290 ms`
+    - `bench_pipeline_hodge_main`: `469.732 ms`
+    - `bench_diffusion_build/2000`: `18.172 ms`
+    - `bench_eigenbasis/2000/16`: `11.729 ms`
+- Deep results:
+  - Baseline artifacts copied under `results/eigenbasis_precompute_baseline_*_20260306-172915.txt`
+  - Candidate snapshot copied under `results/eigenbasis_precompute_candidate_20260306-172915.md`
+  - Exact A/B against commit `2810304`:
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `598.600 ms` -> `598.738 ms` (`+0.02%`)
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `34.139 ms` -> `34.983 ms` (`+2.47%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `114.823 ms` -> `102.576 ms` (`-10.67%`)
+    - `bench_hodge_phase_eigenbasis`: `127.735 ms` -> `117.831 ms` (`-7.75%`)
+    - `bench_pipeline_hodge_main`: `469.551 ms` -> `465.402 ms` (`-0.88%`)
+    - `bench_diffusion_build/2000`: `17.176 ms` -> `18.172 ms` (`+5.80%`)
+    - `bench_eigenbasis/2000/16`: `14.252 ms` -> `11.729 ms` (`-17.70%`)
+- App/runtime checks:
+  - not run; the benchmark gate was sufficient to reject
+- Test results:
+  - not run; rejected before correctness gating
+- Decision: `rejected`
+- Rejected because:
+  - The isolated eigenbasis kernel improved, but the exact A/B showed no meaningful diffgeo pipeline gain and `bench_diffusion_build/2000` regressed by more than `3%`, so the change failed the campaign keep threshold.
+- Notes:
+  - This result suggests that moving normalization work from the operator into build can pay on the eigensolver microbenchmark while still losing the end-to-end trade once build cost is accounted for.
+
+## 2026-03-06 Parallel Upper-Triangle Up-Laplacian Assembly
+- Timestamp: 2026-03-07T00:38:00Z
+- Commit: `2a6213d`
+- Primary target:
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`
+  - `bench_diffgeo_phase_k1_up/4000/64/32/32`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - The generic up-Laplacian assembly is still serial over the `(i,l)` basis-pair sweep. Filling only the upper triangle in parallel and materializing the symmetric matrix at the end should preserve exact values while finally using the worker pool on the dominant remaining 1-form and 2-form up terms.
+- Files touched:
+  - `include/igneous/ops/diffusion/forms.hpp`
+- Benchmark commands:
+  - Smoke:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_k1_up/1000/50/32/32|bench_diffgeo_phase_k2_up/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - Recorded deep artifacts:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_k1_up/1000/50/32/32|bench_diffgeo_phase_k2_up/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_diffgeo_up_parallel_20260306-173505.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_diffgeo_up_parallel_20260306-173505.txt'`
+  - App/runtime:
+    - `/usr/bin/time -p ./build/igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`
+  - Tests:
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_diffusion_forms|test_ops_diffusion_wedge|test_diffgeo_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Baseline head before the patch from `results/bench_diffgeo_circular_20260306-172407.txt` and `results/bench_diffgeo_20260306-171657.txt`:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `114.638 ms` CPU
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `609.250 ms`
+    - `bench_diffgeo_phase_k1_up/1000/50/32/32`: `18.575 ms`
+    - `bench_diffgeo_phase_k1_up/4000/64/32/32`: `73.024 ms`
+    - `bench_diffgeo_phase_k2_up/1000/50/32/32`: `23.456 ms`
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `102.051 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `118.593 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `274.345 ms`
+  - Current patch smoke:
+    - `bench_diffgeo_pipeline/1000/50/32/32`: `75.445 ms` (`-34.19%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `446.456 ms` (`-26.72%`)
+    - `bench_diffgeo_phase_k1_up/1000/50/32/32`: `1.755 ms` (`-90.55%`)
+    - `bench_diffgeo_phase_k1_up/4000/64/32/32`: `6.848 ms` (`-90.62%`)
+    - `bench_diffgeo_phase_k2_up/1000/50/32/32`: `2.229 ms` (`-90.50%`)
+    - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `9.249 ms` (`-90.94%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `116.945 ms` (`-1.39%`)
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `268.028 ms` (`-2.30%`)
+- Deep results:
+  - Recorded artifacts: `results/bench_diffgeo_up_parallel_20260306-173505.{txt,json}`
+  - `bench_diffgeo_pipeline/1000/50/32/32`: `73.999 ms` CPU (`-35.45%` vs baseline head)
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `444.296 ms` (`-27.08%`)
+  - `bench_diffgeo_phase_k1_up/1000/50/32/32`: `1.739 ms` (`-90.64%`)
+  - `bench_diffgeo_phase_k1_up/4000/64/32/32`: `6.498 ms` (`-91.10%`)
+  - `bench_diffgeo_phase_k2_up/1000/50/32/32`: `2.160 ms` (`-90.79%`)
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `8.692 ms` (`-91.48%`)
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `114.250 ms` (`-3.66%`)
+  - `bench_diffgeo_phase_circular/4000/64/32/32`: `272.185 ms` (`-0.79%`)
+- App/runtime checks:
+  - Head before patch: `real 0.24`, `user 0.21`, `sys 0.01`
+  - Current patch: `real 0.13`, `user 0.24`, `sys 0.01`
+- Test results:
+  - Targeted diffgeo suite: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The large win came from parallelizing the basis-pair sweep without introducing races: only the upper triangle is assembled in parallel, then reflected through `selfadjointView<Eigen::Upper>()`.
+  - This is an exact implementation change; no operator math changed, and the existing diffgeo tests continued to pass.
+
+## 2026-03-06 Circular Mass-Space Standard Solve
+- Timestamp: 2026-03-07T00:41:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_circular/4000/64/32/32`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+  - `bench_hodge_phase_circular`
+- Hypothesis:
+  - Reuse the scalar function-Gram factorization inside `compute_circular_coordinates()` and solve the equivalent standard eigenproblem instead of refactoring the same generalized mass matrix twice per pipeline.
+- Files touched:
+  - `include/igneous/ops/diffusion/hodge.hpp`
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/1000/50/32/32|bench_diffgeo_phase_circular/1000/50/32/32|bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis|bench_hodge_phase_structure_build' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Baseline head before the patch from `results/bench_diffgeo_up_parallel_20260306-173505.txt` and `results/bench_pipelines_circular_20260306-172407.txt`:
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `444.296 ms` CPU
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `272.185 ms`
+    - `bench_hodge_phase_circular`: `274.345 ms`
+    - `bench_pipeline_hodge_main`: `476.473 ms`
+  - Current patch:
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `454.459 ms` (`+2.29%`)
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `271.329 ms` (`-0.31%`)
+    - `bench_hodge_phase_circular`: `273.415 ms` (`-0.34%`)
+    - `bench_pipeline_hodge_main`: `475.289 ms` (`-0.25%`)
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>` stayed at `real 0.13`, which matched the current head and did not provide an additional reason to keep the patch
+- Test results:
+  - not run; rejected before correctness gating
+- Decision: `rejected`
+- Rejected because:
+  - The solver rewrite only moved the isolated circular phase at noise level while regressing the diffgeo pipeline by about `2.3%`, so it did not justify a riskier solver change.
+- Notes:
+  - The existing generalized solver is likely not the dominant circular bottleneck anymore after the shared setup reuse; most of the remaining cost appears to come from the solve itself rather than duplicated mass-matrix preparation.
+
+## 2026-03-06 Direct CSR Symmetrization For Diffusion Build
+- Timestamp: 2026-03-07T00:48:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_diffusion_build/2000`
+  - `bench_hodge_phase_structure_build`
+  - `bench_diffgeo_phase_structure_build/4000/64/32/32`
+- Hypothesis:
+  - Replace the `triplets -> Eigen SparseMatrix -> transpose -> average -> extract CSR` symmetrization path with a direct transpose-and-merge CSR builder, and fold immersion accumulation into the Markov normalization pass.
+- Files touched:
+  - `include/igneous/data/structures/diffusion_geometry.hpp`
+- Benchmark commands:
+  - Candidate worktree:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_diffusion_build/2000|bench_eigenbasis/2000/16' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_diffusion_main/20|bench_pipeline_diffusion_main/100|bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_structure_build|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - Baseline worktree:
+    - `git worktree add --detach /tmp/igneous-wave3-buildcmp-175611 2a6213d`
+    - same benchmark filters with `--benchmark_min_time=0.1s` against `/tmp/igneous-wave3-buildcmp-175611/build/*`
+- Smoke results:
+  - Candidate:
+    - `bench_diffusion_build/2000`: `17.720 ms` CPU
+    - `bench_pipeline_spectral_main`: `45.418 ms`
+    - `bench_hodge_phase_structure_build`: `33.333 ms`
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `33.778 ms`
+- Deep results:
+  - Exact A/B against commit `2a6213d`:
+    - `bench_diffusion_build/2000`: `17.231 ms` -> `17.720 ms` (`+2.84%`)
+    - `bench_pipeline_diffusion_main/20`: `20.885 ms` -> `21.526 ms` (`+3.07%`)
+    - `bench_pipeline_spectral_main`: `48.374 ms` -> `45.418 ms` (`-6.11%`)
+    - `bench_pipeline_hodge_main`: `478.812 ms` -> `472.301 ms` (`-1.36%`)
+    - `bench_hodge_phase_structure_build`: `34.648 ms` -> `33.333 ms` (`-3.79%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `453.130 ms` -> `466.470 ms` (`+2.95%`)
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `34.433 ms` -> `33.778 ms` (`-1.90%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `117.071 ms` -> `126.877 ms` (`+8.38%`)
+- App/runtime checks:
+  - `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.14`
+- Test results:
+  - not run; rejected before correctness gating
+- Decision: `rejected`
+- Rejected because:
+  - The build rewrite improved some structure-heavy paths, but the exact compare regressed `bench_pipeline_diffusion_main/20` above the threshold and materially worsened the diffgeo eigenbasis phase, so the net pipeline trade was not acceptable.
+- Notes:
+  - The most likely issue is not raw build cost but the downstream spectral behavior of the alternate symmetric CSR layout as implemented here.
+
+## 2026-03-06 Lazy Normalized Symmetric-Kernel Cache
+- Timestamp: 2026-03-07T01:00:00Z
+- Commit: pending
+- Primary target:
+  - `bench_eigenbasis/2000/16`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+- Hypothesis:
+  - Keep the accepted symmetric-kernel eigensolver path, but cache the normalized CSR values lazily on first eigensolve so the operator stops reapplying row-normalization inside every matvec without charging that work to diffusion build.
+- Files touched:
+  - `include/igneous/data/structures/diffusion_geometry.hpp`
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Benchmark commands:
+  - Candidate recorded artifacts:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_diffusion_build/2000|bench_eigenbasis/2000/16' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_dod_lazy_norm_20260306-180012.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_dod_lazy_norm_20260306-180012.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_diffusion_main/20|bench_pipeline_diffusion_main/100|bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_structure_build|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_pipelines_lazy_norm_20260306-180012.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_pipelines_lazy_norm_20260306-180012.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_diffgeo_lazy_norm_20260306-180012.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_diffgeo_lazy_norm_20260306-180012.txt'`
+  - Baseline exact A/B artifacts:
+    - `results/lazy_norm_baseline_dod_2a6213d.txt`
+    - `results/lazy_norm_baseline_pipelines_2a6213d.txt`
+    - `results/lazy_norm_baseline_diffgeo_2a6213d.txt`
+  - Tests:
+    - `ctest --test-dir build --output-on-failure -R 'test_structure_diffusion_geometry|test_ops_spectral_geometry|test_io_meshes|test_ops_hodge|test_hodge_cli_outputs|test_diffgeo_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Current patch at `0.05s` min time:
+    - `bench_diffusion_build/2000`: `17.810 ms` CPU
+    - `bench_eigenbasis/2000/16`: `11.693 ms`
+    - `bench_pipeline_spectral_main`: `42.170 ms`
+    - `bench_pipeline_hodge_main`: `463.560 ms`
+    - `bench_hodge_phase_eigenbasis`: `117.206 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `441.865 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `103.004 ms`
+- Deep results:
+  - Exact A/B against commit `2a6213d`:
+    - `bench_diffusion_build/2000`: `17.231 ms` -> `17.710 ms` (`+2.78%`)
+    - `bench_eigenbasis/2000/16`: `14.319 ms` -> `11.705 ms` (`-18.25%`)
+    - `bench_pipeline_diffusion_main/20`: `20.885 ms` -> `21.401 ms` (`+2.47%`)
+    - `bench_pipeline_diffusion_main/100`: `22.973 ms` -> `23.126 ms` (`+0.67%`)
+    - `bench_pipeline_spectral_main`: `48.374 ms` -> `42.071 ms` (`-13.03%`)
+    - `bench_pipeline_hodge_main`: `478.812 ms` -> `458.640 ms` (`-4.21%`)
+    - `bench_hodge_phase_structure_build`: `34.648 ms` -> `34.230 ms` (`-1.21%`)
+    - `bench_hodge_phase_eigenbasis`: `131.694 ms` -> `117.106 ms` (`-11.08%`)
+    - `bench_hodge_phase_circular`: `272.253 ms` -> `267.809 ms` (`-1.63%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `453.130 ms` -> `443.189 ms` (`-2.19%`)
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `34.433 ms` -> `34.655 ms` (`+0.65%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `117.071 ms` -> `106.943 ms` (`-8.65%`)
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `272.623 ms` -> `271.797 ms` (`-0.30%`)
+- App/runtime checks:
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-spectral assets/bunny.obj`: `real 0.07`, `user 0.05`, `sys 0.00`
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-hodge --output-dir <tempdir> --no-ply`: `real 0.11`, `user 0.15`, `sys 0.00`
+  - `./build/igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.13`, `user 0.24`, `sys 0.01`
+- Test results:
+  - Targeted spectral/structure/hodge/diffgeo suite: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - This keeps the existing symmetric-kernel eigensolver policy and only removes repeated row-normalization work inside the operator.
+  - The small build regression stayed below the campaign guard while the primary eigenbasis targets all improved materially.
+
+## 2026-03-06 Parallel Post-kNN Build Fusion
+- Timestamp: 2026-03-07T01:16:00Z
+- Commit: pending
+- Primary target:
+  - `bench_diffusion_build/2000`
+  - `bench_hodge_phase_structure_build`
+  - `bench_diffgeo_phase_structure_build/4000/64/32/32`
+- Hypothesis:
+  - Keep the accepted sparse symmetrization path, but parallelize the post-kNN scalar passes and fuse the final Markov normalization, local-bandwidth, and immersion accumulation work so diffusion build stops paying for multiple serial full-array sweeps.
+- Files touched:
+  - `include/igneous/data/structures/diffusion_geometry.hpp`
+- Benchmark commands:
+  - Exact baseline worktree at `1425f1f`:
+    - `cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_MAKE_PROGRAM=/opt/homebrew/bin/ninja -DCMAKE_TOOLCHAIN_FILE=/Users/autoparallel/vcpkg/scripts/buildsystems/vcpkg.cmake`
+    - `cmake --build build --target bench_dod bench_pipelines bench_diffgeo -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_diffusion_build/2000' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_dod_build_fusion_baseline_20260306-181441.json' --benchmark_out_format=json > '/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_dod_build_fusion_baseline_20260306-181441.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_diffusion_main/20|bench_pipeline_diffusion_main/100|bench_pipeline_hodge_main|bench_hodge_phase_structure_build' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_pipelines_build_fusion_baseline_20260306-181441.json' --benchmark_out_format=json > '/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_pipelines_build_fusion_baseline_20260306-181441.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_diffgeo_build_fusion_baseline_20260306-181441.json' --benchmark_out_format=json > '/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_diffgeo_build_fusion_baseline_20260306-181441.txt'`
+  - Candidate:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_diffusion_build/2000' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_dod_build_fusion_candidate_20260306-181512.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_dod_build_fusion_candidate_20260306-181512.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_diffusion_main/20|bench_pipeline_diffusion_main/100|bench_pipeline_hodge_main|bench_hodge_phase_structure_build' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_pipelines_build_fusion_candidate_20260306-181512.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_pipelines_build_fusion_candidate_20260306-181512.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_diffgeo_build_fusion_candidate_20260306-181512.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_diffgeo_build_fusion_candidate_20260306-181512.txt'`
+    - Guard reruns:
+      - baseline `bench_hodge_phase_eigenbasis`, `bench_diffgeo_phase_eigenbasis/4000/64/32/32`, and `bench_eigenbasis/2000/16` against the `1425f1f` worktree
+      - same three benchmarks on the candidate head
+  - Tests:
+    - `ctest --test-dir build --output-on-failure -R 'test_structure_diffusion_geometry|test_ops_spectral_geometry|test_io_meshes|test_ops_diffusion_forms|test_ops_diffusion_wedge|test_ops_hodge|test_diffgeo_cli_outputs|test_hodge_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Current patch at `0.05s` min time:
+    - `bench_diffusion_build/2000`: `3.636 ms` CPU
+    - `bench_pipeline_diffusion_main/20`: `4.768 ms`
+    - `bench_pipeline_diffusion_main/100`: `6.180 ms`
+    - `bench_pipeline_hodge_main`: `435.631 ms`
+    - `bench_hodge_phase_structure_build`: `5.664 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `425.181 ms`
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `5.881 ms`
+- Deep results:
+  - Exact A/B against `1425f1f`:
+    - `bench_diffusion_build/2000`: `17.589 ms` -> `3.716 ms` (`-78.87%`)
+    - `bench_pipeline_diffusion_main/20`: `21.434 ms` -> `4.624 ms` (`-78.43%`)
+    - `bench_pipeline_diffusion_main/100`: `22.761 ms` -> `5.822 ms` (`-74.42%`)
+    - `bench_pipeline_hodge_main`: `466.729 ms` -> `442.637 ms` (`-5.16%`)
+    - `bench_hodge_phase_structure_build`: `33.442 ms` -> `5.631 ms` (`-83.16%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `448.958 ms` -> `428.990 ms` (`-4.45%`)
+    - `bench_diffgeo_phase_structure_build/4000/64/32/32`: `35.070 ms` -> `6.109 ms` (`-82.58%`)
+    - `bench_pipeline_spectral_main`: `41.994 ms` -> `22.921 ms` (`-45.42%`) on follow-up guard rerun
+  - Guard follow-up:
+    - `bench_eigenbasis/2000/16`: `11.415 ms` -> `11.401 ms` (`-0.12%`)
+    - `bench_hodge_phase_eigenbasis`: `118.148 ms` -> `118.252 ms` (`+0.09%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `101.441 ms` -> `107.828 ms` (`+6.30%`)
+- App/runtime checks:
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-diffusion assets/bunny.obj`: `real 0.04`, `user 0.03`, `sys 0.01`
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-spectral assets/bunny.obj`: `real 0.05`, `user 0.05`, `sys 0.01`
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-hodge --output-dir <tempdir> --no-ply`: `real 0.10`, `user 0.15`, `sys 0.01`
+  - `./build/igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.11`, `user 0.23`, `sys 0.01`
+- Test results:
+  - Targeted diffusion/spectral/hodge/diffgeo suite: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The kept change does not alter the accepted sparse symmetrization strategy; it only parallelizes and fuses the scalar post-kNN passes that were still serial.
+  - The canonical spectral guards stayed flat or improved, but the `4000/64` torus eigensolve used by `bench_diffgeo_phase_eigenbasis` consistently regressed by about `6%`. The end-to-end `diffgeo` pipeline still improved by `4.45%`, so this was accepted as a net build-heavy pipeline win rather than a pure spectral win.
+
+## 2026-03-06 Symmetric Solver `LargestAlge`
+- Timestamp: 2026-03-07T01:22:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_eigenbasis/2000/16`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+- Hypothesis:
+  - Switch the normalized symmetric-kernel eigensolve from `LargestMagn` to `LargestAlge` so the solver targets the algebraically largest diffusion modes directly instead of sorting by magnitude.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp` (reverted)
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_eigenbasis/2000/16' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_hodge_phase_eigenbasis|bench_pipeline_hodge_main' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Baseline current head after `dee719f`:
+    - `bench_eigenbasis/2000/16`: `11.401 ms` CPU
+    - `bench_pipeline_spectral_main`: `22.921 ms`
+    - `bench_hodge_phase_eigenbasis`: `118.252 ms`
+    - `bench_pipeline_hodge_main`: `442.637 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `107.828 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `428.990 ms`
+  - Current patch:
+    - `bench_eigenbasis/2000/16`: `11.789 ms` (`+3.40%`)
+    - `bench_pipeline_spectral_main`: `24.031 ms` (`+4.84%`)
+    - `bench_hodge_phase_eigenbasis`: `119.311 ms` (`+0.90%`)
+    - `bench_pipeline_hodge_main`: `442.863 ms` (`+0.05%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `110.094 ms` (`+2.10%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `424.396 ms` (`-1.07%`)
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run; rejected before app timing and correctness gates
+- Test results:
+  - not run; rejected before correctness gating
+- Decision: `rejected`
+- Rejected because:
+  - The symmetric sort-rule change regressed the canonical scalar eigenbasis benchmark and the spectral pipeline without providing a compensating hodge or diffgeo win.
+- Notes:
+  - This is distinct from the earlier generic Arnoldi sort-rule work. The symmetric normalized-kernel path should stay on `LargestMagn`.
+
+## 2026-03-06 Symmetric Solver `sqrt(row-sums)` Seed
+- Timestamp: 2026-03-07T01:31:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_eigenbasis/2000/16`
+  - `bench_pipeline_spectral_main`
+  - `bench_hodge_phase_eigenbasis`
+- Hypothesis:
+  - Seed the normalized symmetric-kernel eigensolve with `sqrt(row_sums)` so Arnoldi starts near the dominant diffusion mode instead of using Spectra's default random residual.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp` (reverted)
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_eigenbasis/2000/16' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_hodge_phase_eigenbasis|bench_pipeline_hodge_main' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Baseline current head after `dee719f`:
+    - `bench_eigenbasis/2000/16`: `11.401 ms` CPU
+    - `bench_pipeline_spectral_main`: `22.921 ms`
+    - `bench_hodge_phase_eigenbasis`: `118.252 ms`
+    - `bench_pipeline_hodge_main`: `442.637 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `107.828 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `428.990 ms`
+  - Current patch:
+    - `bench_eigenbasis/2000/16`: `10.899 ms` (`-4.40%`)
+    - `bench_pipeline_spectral_main`: `20.001 ms` (`-12.74%`)
+    - `bench_hodge_phase_eigenbasis`: `126.251 ms` (`+6.76%`)
+    - `bench_pipeline_hodge_main`: `446.302 ms` (`+0.83%`)
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `109.216 ms` (`+1.29%`)
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `422.493 ms` (`-1.51%`)
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run; rejected before app timing and correctness gates
+- Test results:
+  - not run; rejected before correctness gating
+- Decision: `rejected`
+- Rejected because:
+  - The seeded symmetric solve improved the small scalar spectral path but regressed the 64-mode hodge eigensolve too much to keep.
+- Notes:
+  - This is distinct from the earlier rejected generic `mu`-init Arnoldi attempt. The symmetric normalized-kernel path should keep Spectra's default residual initialization.
+
+## 2026-03-06 Large-Mesh Atomic DEC Face Incidence
+- Timestamp: 2026-03-07T01:34:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_geometry` large-grid structure build
+  - `bench_mesh_structure_build/400`
+- Hypothesis:
+  - Parallelize the DEC face-incidence CSR build for large meshes with atomic count/write heads so the million-face geometry benchmark stops paying for two fully serial face sweeps.
+- Files touched:
+  - `include/igneous/data/structures/discrete_exterior_calculus.hpp` (reverted)
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_geometry`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_mesh_structure_build/400|bench_curvature_kernel/400|bench_flow_kernel/400' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `ctest --test-dir build --output-on-failure -R 'test_structure_dec|test_ops_curvature_flow'`
+- Smoke results:
+  - `bench_geometry`:
+    - `Grid 1000x1000` structure: `15.662 ms` -> `12.577 ms` (`-19.70%`)
+    - `Grid 1000x1000` curvature: `5.427 ms` -> `6.076 ms` (`+11.96%`)
+    - `Grid 1000x1000` flow: `0.881 ms` -> `1.028 ms` (`+16.69%`)
+    - `Grid 500x500` structure: `4.074 ms` -> `3.248 ms` (`-20.28%`)
+    - `Grid 500x500` curvature: `1.324 ms` -> `1.447 ms` (`+9.29%`)
+  - `bench_dod`:
+    - `bench_mesh_structure_build/400`: `2.821 ms` -> `1.819 ms` (`-35.52%`)
+    - `bench_curvature_kernel/400`: `0.983 ms` -> `0.831 ms` (`-15.46%`)
+    - `bench_flow_kernel/400`: `0.166 ms` -> `0.195 ms` (`+17.46%`)
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run; rejected before app timing and correctness gates
+- Test results:
+  - `test_structure_dec`: pass
+  - `test_ops_curvature_flow`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The atomic face-incidence path improved the isolated structure build but changed downstream locality/order enough to slow the large-grid curvature and flow kernels, so the overall geometry benchmark got worse.
+- Notes:
+  - If the DEC structure builder is revisited, the next version should preserve or explicitly reorder face/neighborhood locality instead of only chasing build throughput.
+
+## 2026-03-06 Hodge Gamma Cache Reuse
+- Timestamp: 2026-03-07T01:39:00Z
+- Commit: pending
+- Primary target:
+  - `bench_hodge_phase_curl_energy`
+  - `bench_hodge_phase_weak_derivative`
+  - `bench_pipeline_hodge_main`
+- Hypothesis:
+  - Cache `Gamma(x, phi)`, `U .* mu`, and `Gamma(x, x)` inside `HodgeWorkspace`, reuse the mixed gamma directly inside curl-energy assembly, and reset the workspace only when the mesh/build/eigenbasis state changes.
+- Files touched:
+  - `include/igneous/ops/diffusion/hodge.hpp`
+  - `benches/bench_pipelines.cpp`
+- Benchmark commands:
+  - Exact baseline worktree at `dee719f`:
+    - `cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_MAKE_PROGRAM=/opt/homebrew/bin/ninja -DCMAKE_TOOLCHAIN_FILE=/Users/autoparallel/vcpkg/scripts/buildsystems/vcpkg.cmake`
+    - `cmake --build build --target bench_dod bench_pipelines test_ops_hodge -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_weak_derivative/2000/16|bench_curl_energy/2000/16|bench_hodge_solve/2000/16' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_dod_hodgecache_baseline_20260306-183810.json' --benchmark_out_format=json > '/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_dod_hodgecache_baseline_20260306-183810.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_weak_derivative|bench_hodge_phase_curl_energy|bench_hodge_phase_eigenbasis' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_pipelines_hodgecache_baseline_20260306-183810.json' --benchmark_out_format=json > '/Users/autoparallel/Code/igneous/notes/perf/20260306-wave3/results/bench_pipelines_hodgecache_baseline_20260306-183810.txt'`
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_hodge'`
+  - Candidate:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_weak_derivative/2000/16|bench_curl_energy/2000/16|bench_hodge_solve/2000/16' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_dod_hodgecache_candidate_20260306-183833.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_dod_hodgecache_candidate_20260306-183833.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_weak_derivative|bench_hodge_phase_curl_energy|bench_hodge_phase_eigenbasis' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true --benchmark_out='notes/perf/20260306-wave3/results/bench_pipelines_hodgecache_candidate_20260306-183833.json' --benchmark_out_format=json > 'notes/perf/20260306-wave3/results/bench_pipelines_hodgecache_candidate_20260306-183833.txt'`
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_hodge|test_hodge_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Current patch at `0.05s` min time:
+    - `bench_weak_derivative/2000/16`: `0.047 ms` CPU
+    - `bench_curl_energy/2000/16`: `1.087 ms`
+    - `bench_pipeline_hodge_main`: `431.851 ms`
+    - `bench_hodge_phase_weak_derivative`: `1.104 ms`
+    - `bench_hodge_phase_curl_energy`: `26.781 ms`
+    - `bench_hodge_phase_eigenbasis`: `115.948 ms`
+- Deep results:
+  - Exact A/B against `dee719f`:
+    - `bench_weak_derivative/2000/16`: `0.506 ms` -> `0.048 ms` (`-90.60%`)
+    - `bench_curl_energy/2000/16`: `2.189 ms` -> `1.166 ms` (`-46.73%`)
+    - `bench_hodge_solve/2000/16`: `0.132 ms` -> `0.121 ms` (`-7.66%`)
+    - `bench_pipeline_hodge_main`: `456.996 ms` -> `441.018 ms` (`-3.50%`)
+    - `bench_hodge_phase_weak_derivative`: `3.683 ms` -> `1.080 ms` (`-70.67%`)
+    - `bench_hodge_phase_curl_energy`: `28.824 ms` -> `25.882 ms` (`-10.21%`)
+    - `bench_hodge_phase_eigenbasis`: `119.500 ms` -> `118.666 ms` (`-0.70%`)
+- App/runtime checks:
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-hodge --output-dir <tempdir> --no-ply`: `real 0.10`, `user 0.15`, `sys 0.01`
+- Test results:
+  - `test_ops_hodge`: pass
+  - `test_hodge_cli_outputs`: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - `HodgeWorkspace::reset_cache()` is now required when the mesh/build/eigenbasis state changes and the same workspace instance is reused. Internal rebuild-heavy benchmark loops were updated accordingly.
+  - The main single-call win comes from reusing weak-derivative mixed gamma inside curl-energy assembly and from only computing the symmetric coordinate gamma tensor once per fixed workspace state.
+
+## 2026-03-06 Symmetric Trivial-Mode Deflation
+- Timestamp: 2026-03-07T02:00:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_eigenbasis/2000/16`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Deflate the known trivial normalized-kernel diffusion mode and let Spectra solve only for the remaining nontrivial modes, then prepend the constant mode explicitly.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp` (reverted)
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_eigenbasis/2000/16' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Candidate only:
+    - `bench_eigenbasis/2000/16`: `12.062 ms` CPU
+    - `bench_pipeline_spectral_main`: `23.712 ms`
+    - `bench_pipeline_hodge_main`: `427.435 ms`
+    - `bench_hodge_phase_eigenbasis`: `114.011 ms`
+    - `bench_hodge_phase_circular`: `271.619 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `419.282 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `101.487 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `276.308 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - The patch changed the symmetric-kernel solve behavior without producing a clear, repeatable improvement across both hodge and diffgeo pipelines, so it did not clear the keep bar.
+- Notes:
+  - This is distinct from the previously rejected sort-rule and init-vector changes, but it still belongs in the same family of spectral control rewrites that need stronger evidence before revisiting.
+
+## 2026-03-06 Accelerate Circular Generalized Solve
+- Timestamp: 2026-03-07T02:05:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_hodge_phase_circular`
+  - `bench_diffgeo_phase_circular/4000/64/32/32`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Replace Eigen's dense generalized eigensolver inside `compute_circular_coordinates()` with the macOS Accelerate/LAPACK path while keeping the same operator and mode-selection logic.
+- Files touched:
+  - `CMakeLists.txt` (reverted)
+  - `include/igneous/ops/diffusion/hodge.hpp` (reverted)
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Candidate only:
+    - `bench_pipeline_hodge_main`: `433.196 ms`
+    - `bench_hodge_phase_eigenbasis`: `119.385 ms`
+    - `bench_hodge_phase_circular`: `270.282 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `419.902 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `109.391 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `275.749 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - The circular phase barely moved, which showed that the remaining circular cost is not dominated by Eigen's dense generalized eigensolver implementation itself.
+- Notes:
+  - The experiment also introduced an Apple-only code path and deprecated CLAPACK entry points, so it needed a much stronger payoff than the measured noise-level movement.
+
+## 2026-03-06 Batched Paired Circular Assembly
+- Timestamp: 2026-03-07T02:09:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_hodge_phase_circular`
+  - `bench_diffgeo_phase_circular/4000/64/32/32`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Batch the two back-to-back circular-coordinate operator assemblies so the paired harmonic-mode solves share the dense `U`/`Gamma(x, phi)` work instead of rebuilding it twice.
+- Files touched:
+  - `include/igneous/ops/diffusion/hodge.hpp` (reverted)
+  - `src/main_hodge.cpp` (reverted)
+  - `src/main_diffusion_geometry.cpp` (reverted)
+  - `benches/bench_pipelines.cpp` (reverted)
+  - `benches/bench_diffgeo.cpp` (reverted)
+- Benchmark commands:
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - Candidate only:
+    - `bench_pipeline_hodge_main`: `432.693 ms`
+    - `bench_hodge_phase_eigenbasis`: `119.347 ms`
+    - `bench_hodge_phase_circular`: `272.862 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `420.794 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `109.734 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `275.774 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - Sharing the paired operator assembly did not materially reduce the circular phase, so the duplicated two-call setup was not the dominant remaining cost.
+- Notes:
+  - This removes another plausible circular hotspot from the search space; the next circular pass should start with finer measurement before attempting another algebraic rewrite.
+
+## 2026-03-06 Stable Parallel DEC Face-Incidence Build
+- Timestamp: 2026-03-07T02:12:00Z
+- Commit: `5d0f4db`
+- Primary target:
+  - `bench_geometry` large-grid structure phase
+  - `bench_mesh_structure_build/400`
+- Hypothesis:
+  - Parallelize DEC face-incidence CSR construction with a stable block-prefix build so large meshes stop paying the serial count/write cost while preserving the face order seen by downstream curvature and flow kernels.
+- Files touched:
+  - `include/igneous/data/structures/discrete_exterior_calculus.hpp`
+- Benchmark commands:
+  - Exact baseline on the reverted worktree:
+    - `cmake --build build --target bench_geometry bench_dod -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_mesh_structure_build/400|bench_curvature_kernel/400|bench_flow_kernel/400' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_dod_dec_face_incidence_baseline_20260306-190831.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_geometry > 'notes/perf/20260306-wave3/results/bench_geometry_dec_face_incidence_baseline_20260306-190831.txt'`
+  - Candidate:
+    - `cmake --build build --target bench_geometry bench_dod test_structure_dec test_ops_curvature_flow -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_mesh_structure_build/400|bench_curvature_kernel/400|bench_flow_kernel/400' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_dod_dec_face_incidence_candidate_20260306-190850.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_geometry > 'notes/perf/20260306-wave3/results/bench_geometry_dec_face_incidence_candidate_20260306-190850.txt'`
+    - `./build/test_structure_dec`
+    - `./build/test_ops_curvature_flow`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - `bench_geometry`:
+    - `Grid 1000x1000` structure: `18.600 ms` -> `16.034 ms` (`-13.80%`)
+    - `Grid 1000x1000` curvature: `6.951 ms` -> `5.956 ms` (`-14.32%`)
+    - `Grid 1000x1000` flow: `1.137 ms` -> `0.981 ms` (`-13.72%`)
+  - `bench_dod`:
+    - `bench_mesh_structure_build/400`: `2.904 ms` -> `2.502 ms` (`-13.84%`)
+    - `bench_curvature_kernel/400`: `1.008 ms` -> `0.830 ms` (`-17.66%`)
+    - `bench_flow_kernel/400`: `0.222 ms` -> `0.195 ms` (`-12.02%`)
+- Deep results:
+  - Exact A/B artifacts:
+    - `notes/perf/20260306-wave3/results/bench_dod_dec_face_incidence_baseline_20260306-190831.txt`
+    - `notes/perf/20260306-wave3/results/bench_dod_dec_face_incidence_candidate_20260306-190850.txt`
+    - `notes/perf/20260306-wave3/results/bench_geometry_dec_face_incidence_baseline_20260306-190831.txt`
+    - `notes/perf/20260306-wave3/results/bench_geometry_dec_face_incidence_candidate_20260306-190850.txt`
+  - `bench_mesh_structure_build/400`: `2.545 ms` -> `2.558 ms` CPU (`+0.50%`)
+  - `bench_curvature_kernel/400`: `0.839 ms` -> `0.833 ms` CPU (`-0.74%`)
+  - `bench_flow_kernel/400`: `0.197 ms` -> `0.196 ms` CPU (`-0.53%`)
+  - `bench_geometry`:
+    - `Grid 500x500` structure: `4.263 ms` -> `4.021 ms` (`-5.68%`)
+    - `Grid 500x500` curvature: `1.395 ms` -> `1.314 ms` (`-5.81%`)
+    - `Grid 500x500` flow: `0.312 ms` -> `0.308 ms` (`-1.28%`)
+    - `Grid 1000x1000` structure: `17.288 ms` -> `14.532 ms` (`-15.94%`)
+    - `Grid 1000x1000` curvature: `5.450 ms` -> `5.048 ms` (`-7.38%`)
+    - `Grid 1000x1000` flow: `0.904 ms` -> `0.879 ms` (`-2.77%`)
+- App/runtime checks:
+  - not run; geometry benchmark coverage was sufficient for this internal DEC build change
+- Test results:
+  - `test_structure_dec`: pass
+  - `test_ops_curvature_flow`: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The stable block-prefix construction keeps per-vertex face order intact, which avoided the locality regressions seen in the rejected atomic face-incidence rewrite.
+  - The 400-mesh DOD structure microbenchmark stayed effectively flat on exact A/B, but the larger geometry workloads cleared the keep bar and improved the downstream kernels at the same time.
+
+## 2026-03-06 Matrix Circular Workspace Preparation
+- Timestamp: 2026-03-07T02:24:00Z
+- Commit: `907b8ac`
+- Primary target:
+  - `bench_hodge_phase_circular`
+  - `bench_diffgeo_phase_circular/4000/64/32/32`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Replace the mean-centered circular workspace preparation path with exact matrix formulas for `Gamma(x, phi)` and the scalar weak Laplacian, so the phase stops paying thousands of scalar `carre_du_champ` sweeps before each paired circular solve.
+- Files touched:
+  - `include/igneous/ops/diffusion/hodge.hpp`
+- Benchmark commands:
+  - Exact baseline on the reverted `5d0f4db` worktree:
+    - `cmake --build build --target bench_pipelines bench_diffgeo igneous-hodge igneous-diffusion-geometry -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_circularprep_baseline_20260306-192236.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_diffgeo_circularprep_baseline_20260306-192236.txt'`
+  - Candidate:
+    - `cmake --build build --target bench_pipelines bench_diffgeo igneous-hodge igneous-diffusion-geometry test_ops_hodge test_ops_diffusion_forms test_ops_diffusion_wedge -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_circular|bench_hodge_phase_eigenbasis' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_circularprep_candidate_20260306-192320.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_diffgeo_circularprep_candidate_20260306-192320.txt'`
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_hodge|test_hodge_cli_outputs|test_ops_diffusion_forms|test_ops_diffusion_wedge|test_diffgeo_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+    - `env IGNEOUS_BENCH_MODE=1 /usr/bin/time -p ./build/igneous-hodge --output-dir <tempdir> --no-ply`
+    - `env IGNEOUS_BENCH_MODE=1 /usr/bin/time -p ./build/igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`
+- Smoke results:
+  - Current patch at `0.05s` min time:
+    - `bench_pipeline_hodge_main`: `165.823 ms`
+    - `bench_hodge_phase_circular`: `4.308 ms`
+    - `bench_hodge_phase_eigenbasis`: `119.710 ms`
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `159.715 ms`
+    - `bench_diffgeo_phase_circular/4000/64/32/32`: `4.004 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `109.604 ms`
+- Deep results:
+  - Exact A/B artifacts:
+    - `notes/perf/20260306-wave3/results/bench_pipelines_circularprep_baseline_20260306-192236.txt`
+    - `notes/perf/20260306-wave3/results/bench_pipelines_circularprep_candidate_20260306-192320.txt`
+    - `notes/perf/20260306-wave3/results/bench_diffgeo_circularprep_baseline_20260306-192236.txt`
+    - `notes/perf/20260306-wave3/results/bench_diffgeo_circularprep_candidate_20260306-192320.txt`
+  - `bench_pipeline_hodge_main`: `425.753 ms` -> `164.629 ms` CPU (`-61.33%`)
+  - `bench_hodge_phase_circular`: `267.803 ms` -> `4.074 ms` CPU (`-98.48%`)
+  - `bench_hodge_phase_eigenbasis`: `116.965 ms` -> `116.807 ms` CPU (`-0.14%`)
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `409.351 ms` -> `147.268 ms` CPU (`-64.02%`)
+  - `bench_diffgeo_phase_circular/4000/64/32/32`: `267.488 ms` -> `3.821 ms` CPU (`-98.57%`)
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `107.028 ms` -> `107.143 ms` CPU (`+0.11%`)
+- App/runtime checks:
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-hodge --output-dir <tempdir> --no-ply`: `real 0.05`, `user 0.10`, `sys 0.01`
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.07`, `user 0.19`, `sys 0.01`
+  - Recorded artifact: `notes/perf/20260306-wave3/results/app_timings_circularprep_20260306-192320.txt`
+- Test results:
+  - `test_ops_hodge`: pass
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_diffusion_wedge`: pass
+  - `test_hodge_cli_outputs`: pass
+  - `test_diffgeo_cli_outputs`: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The key identity on the mean-centered path is that `Gamma(x, phi)` is a row-wise covariance under the Markov kernel, and the scalar weak Laplacian is `U^T diag(P^T b) U - (P U)^T diag(b) (P U)` with `b = mu / (2 * local_bandwidth)`.
+  - This changed only the preparation path. The generalized eigensolve and circular mode-selection logic stayed unchanged, which is why the eigenbasis guard benchmarks remained flat while the circular phase collapsed.
+
+## 2026-03-06 Spectral Matvec Parallel Threshold
+- Timestamp: 2026-03-07T02:31:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_pipeline_hodge_main`
+  - `bench_pipeline_spectral_main`
+- Hypothesis:
+  - Lower the spectral CSR matvec parallel threshold from `8192` rows to `1024` so the 2503-point and 4000-point eigensolves stop running their `perform_op()` calls serially.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Benchmark commands:
+  - Smoke only:
+    - `cmake --build build --target bench_pipelines bench_diffgeo test_ops_spectral_geometry test_ops_hodge test_ops_diffusion_forms test_ops_diffusion_wedge test_structure_diffusion_geometry igneous-hodge igneous-diffusion-geometry -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular|bench_pipeline_diffusion_main' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - `bench_pipeline_diffusion_main/20`: `4.019 ms`
+  - `bench_pipeline_diffusion_main/100`: `5.685 ms`
+  - `bench_pipeline_spectral_main`: `36.193 ms`
+  - `bench_pipeline_hodge_main`: `184.869 ms`
+  - `bench_hodge_phase_eigenbasis`: `137.289 ms`
+  - `bench_hodge_phase_circular`: `4.154 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - The extra worker-pool overhead outweighed the saved scalar work at these matrix sizes, so the eigensolve-heavy paths regressed sharply instead of speeding up.
+- Notes:
+  - This confirms that the spectral bottleneck is not just “matvecs are accidentally serial”; the remaining cost is dominated by solver behavior and dense orthogonalization work, not by row-level CSR scheduling.
+
+## 2026-03-06 Global Spectral Tolerance Relaxation
+- Timestamp: 2026-03-07T02:34:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_pipeline_spectral_main`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+- Hypothesis:
+  - Relax the Spectra convergence tolerance globally so the eigensolver stops oversolving far beyond single-precision accuracy.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Benchmark commands:
+  - Smoke only:
+    - `cmake --build build --target bench_pipelines bench_diffgeo test_ops_spectral_geometry test_ops_hodge test_ops_diffusion_forms test_ops_diffusion_wedge test_structure_diffusion_geometry igneous-hodge igneous-diffusion-geometry -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular|bench_pipeline_diffusion_main' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - `tol = 1e-6`:
+    - `bench_pipeline_spectral_main`: `17.346 ms`
+    - `bench_pipeline_hodge_main`: `172.763 ms`
+    - `bench_hodge_phase_eigenbasis`: `125.463 ms`
+  - `tol = 1e-8`:
+    - `bench_pipeline_spectral_main`: `21.281 ms`
+    - `bench_pipeline_hodge_main`: `174.647 ms`
+    - `bench_hodge_phase_eigenbasis`: `126.335 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - A looser tolerance helps the 16-mode scalar spectral path, but it consistently regresses the 64-mode hodge eigensolve enough to fail the higher-priority guard.
+- Notes:
+  - The large-basis diffusion eigensolve needs the stricter convergence target, even though the smaller spectral path can tolerate a looser one.
+
+## 2026-03-06 Davidson Symmetric Eigensolver
+- Timestamp: 2026-03-07T02:36:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+- Hypothesis:
+  - Replace the large-basis symmetric normalized-kernel solve with Spectra's Davidson solver and a thin CSR adapter so the 64-mode eigensolve converges in fewer iterations than the current Lanczos path.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Benchmark commands:
+  - Smoke only:
+    - `cmake --build build --target bench_pipelines bench_diffgeo test_ops_spectral_geometry test_ops_hodge test_ops_diffusion_forms test_ops_diffusion_wedge test_structure_diffusion_geometry igneous-hodge igneous-diffusion-geometry -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular|bench_pipeline_diffusion_main' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - `bench_pipeline_diffusion_main/20`: `4.004 ms`
+  - `bench_pipeline_diffusion_main/100`: `5.669 ms`
+  - `bench_pipeline_spectral_main`: `23.791 ms`
+  - `bench_pipeline_hodge_main`: `607.561 ms`
+  - `bench_hodge_phase_eigenbasis`: `535.079 ms`
+  - `bench_hodge_phase_circular`: `4.170 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - Even with a direct CSR adapter for diagonal access and block products, the Davidson path was dramatically slower than the current symmetric Lanczos solver on the 64-mode target.
+- Notes:
+  - This closes off the obvious “new symmetric solver family” option available in the current Spectra package; the accepted path should stay on `SymEigsSolver`.
+
+## 2026-03-06 Small-Basis Spectral Tolerance Relaxation
+- Timestamp: 2026-03-07T02:39:50Z
+- Commit: `2a24aef`
+- Primary target:
+  - `bench_eigenbasis/2000/16`
+  - `bench_pipeline_spectral_main`
+- Hypothesis:
+  - Relax the Spectra convergence tolerance only for small-basis solves (`n_eigenvectors <= 16`), where the solver was oversolving relative to float precision, while keeping the existing strict tolerance on the 64-mode hodge/diffgeo guard path.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Benchmark commands:
+  - Exact baseline on the reverted `a297d22` worktree:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_eigenbasis/2000/16' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_dod_small_spectral_tol_baseline_20260307-023644.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_circular|bench_pipeline_diffusion_main' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_eigen_parallel_baseline_20260307-022901.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32|bench_diffgeo_phase_structure_build/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_diffgeo_eigen_parallel_baseline_20260307-022901.txt'`
+  - Candidate:
+    - `cmake --build build --target bench_dod bench_pipelines bench_diffgeo test_ops_spectral_geometry test_ops_hodge test_ops_diffusion_forms test_ops_diffusion_wedge test_structure_diffusion_geometry igneous-hodge igneous-diffusion-geometry -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_eigenbasis/2000/16' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_dod_small_spectral_tol_candidate_20260307-023622.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_spectral_main|bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_pipeline_diffusion_main' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_small_spectral_tol_candidate_20260307-023622.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_diffgeo_small_spectral_tol_candidate_20260307-023622.txt'`
+    - `ctest --test-dir build --output-on-failure -R 'test_structure_diffusion_geometry|test_ops_spectral_geometry|test_io_meshes|test_ops_hodge'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - Initial mixed smoke:
+    - `bench_eigenbasis/2000/16`: `8.794 ms`
+    - `bench_pipeline_spectral_main`: `18.093 ms`
+    - `bench_pipeline_hodge_main`: `167.124 ms`
+    - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.695 ms`
+  - Isolated hodge guard rerun before deep A/B:
+    - `bench_pipeline_hodge_main`: `166.755 ms`
+    - `bench_hodge_phase_eigenbasis`: `117.075 ms`
+- Deep results:
+  - Exact A/B artifacts:
+    - `notes/perf/20260306-wave3/results/bench_dod_small_spectral_tol_baseline_20260307-023644.txt`
+    - `notes/perf/20260306-wave3/results/bench_dod_small_spectral_tol_candidate_20260307-023622.txt`
+    - `notes/perf/20260306-wave3/results/bench_pipelines_eigen_parallel_baseline_20260307-022901.txt`
+    - `notes/perf/20260306-wave3/results/bench_pipelines_small_spectral_tol_candidate_20260307-023622.txt`
+    - `notes/perf/20260306-wave3/results/bench_diffgeo_eigen_parallel_baseline_20260307-022901.txt`
+    - `notes/perf/20260306-wave3/results/bench_diffgeo_small_spectral_tol_candidate_20260307-023622.txt`
+  - `bench_eigenbasis/2000/16`: `11.368 ms` -> `8.199 ms` CPU (`-27.88%`)
+  - `bench_pipeline_spectral_main`: `22.638 ms` -> `16.783 ms` CPU (`-25.86%`)
+  - `bench_pipeline_hodge_main`: `165.927 ms` -> `166.413 ms` CPU (`+0.29%`)
+  - `bench_hodge_phase_eigenbasis`: `117.285 ms` -> `117.205 ms` CPU (`-0.07%`)
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `147.701 ms` -> `147.002 ms` CPU (`-0.47%`)
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.773 ms` -> `106.752 ms` CPU (`-0.02%`)
+- App/runtime checks:
+  - not run; benchmark and test coverage was sufficient for this internal solver-policy change
+- Test results:
+  - `test_structure_diffusion_geometry`: pass
+  - `test_ops_spectral_geometry`: pass
+  - `test_io_meshes`: pass
+  - `test_ops_hodge`: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The previous rejected global-tolerance passes showed that the 64-mode hodge/diffgeo solve really does need the stricter convergence target. Restricting the looser tolerance to the 16-mode path preserves the high-priority guards while reclaiming the wasted work in the scalar spectral pipeline.
+
+## 2026-03-06 Matrix Hodge Curl-Energy Assembly
+- Timestamp: 2026-03-07T02:46:27Z
+- Commit: `5e07f81`
+- Primary target:
+  - `bench_hodge_phase_curl_energy`
+  - `bench_pipeline_hodge_main`
+- Hypothesis:
+  - Replace the mean-centered Hodge curl-energy assembly with exact matrix formulas over `P U`, `Gamma(x_a, phi)`, and `Gamma(x_a, x_b)` so the phase stops recomputing `Gamma(phi_k, phi_l)` with thousands of scalar `carre_du_champ` sweeps.
+- Files touched:
+  - `include/igneous/ops/diffusion/hodge.hpp`
+- Benchmark commands:
+  - Exact baseline on the reverted `2a24aef` worktree:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_gram|bench_hodge_phase_weak_derivative|bench_hodge_phase_curl_energy|bench_hodge_phase_circular' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_hodge_curl_matrix_baseline_20260307-024447.txt'`
+  - Candidate:
+    - `cmake --build build --target bench_pipelines test_ops_hodge igneous-hodge -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_gram|bench_hodge_phase_weak_derivative|bench_hodge_phase_curl_energy|bench_hodge_phase_circular' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_hodge_curl_matrix_candidate_20260307-024447.txt'`
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_hodge|test_hodge_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+    - `env IGNEOUS_BENCH_MODE=1 /usr/bin/time -p ./build/igneous-hodge --output-dir <tempdir> --no-ply`
+- Smoke results:
+  - `bench_pipeline_hodge_main`: `156.093 ms`
+  - `bench_hodge_phase_eigenbasis`: `118.986 ms`
+  - `bench_hodge_phase_gram`: `1.449 ms`
+  - `bench_hodge_phase_weak_derivative`: `1.101 ms`
+  - `bench_hodge_phase_curl_energy`: `10.651 ms`
+  - `bench_hodge_phase_circular`: `4.159 ms`
+- Deep results:
+  - Exact A/B artifacts:
+    - `notes/perf/20260306-wave3/results/bench_pipelines_hodge_curl_matrix_baseline_20260307-024447.txt`
+    - `notes/perf/20260306-wave3/results/bench_pipelines_hodge_curl_matrix_candidate_20260307-024447.txt`
+  - `bench_pipeline_hodge_main`: `165.269 ms` -> `150.060 ms` CPU (`-9.20%`)
+  - `bench_hodge_phase_eigenbasis`: `116.382 ms` -> `116.601 ms` CPU (`+0.19%`)
+  - `bench_hodge_phase_gram`: `1.410 ms` -> `1.418 ms` CPU (`+0.55%`)
+  - `bench_hodge_phase_weak_derivative`: `1.058 ms` -> `1.059 ms` CPU (`+0.13%`)
+  - `bench_hodge_phase_curl_energy`: `25.729 ms` -> `10.199 ms` CPU (`-60.36%`)
+  - `bench_hodge_phase_circular`: `4.028 ms` -> `4.034 ms` CPU (`+0.16%`)
+- App/runtime checks:
+  - `env IGNEOUS_BENCH_MODE=1 ./build/igneous-hodge --output-dir <tempdir> --no-ply`: `real 0.05`, `user 0.05`, `sys 0.01`
+  - Recorded artifact: `notes/perf/20260306-wave3/results/app_hodge_curl_matrix_20260307-024447.txt`
+- Test results:
+  - `test_ops_hodge`: pass
+  - `test_hodge_cli_outputs`: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - On the mean-centered path, the weighted `Gamma(phi_k, phi_l)` term can be written exactly as `U^T diag(P^T c) U - (P U)^T diag(c) (P U)` with `c = mu * Gamma(x_a, x_b) / denom`, which removes the expensive per-pair scalar sweeps.
+  - The cross term is exactly `Gamma(x_b, phi)^T diag(mu) Gamma(x_a, phi)`, so the whole curl block can be assembled from cached dense blocks and a final symmetrization step.
+
+## 2026-03-06 Matrix Diffgeo Gamma-Data Immersion
+- Timestamp: 2026-03-07T02:52:00Z
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Replace the `Gamma(data_coord, immersion_coord)` reconstruction precompute with exact mean-centered matrix formulas so the diffgeo pipeline stops paying nine scalar `carre_du_champ` sweeps before reconstructing ambient fields.
+- Files touched:
+  - `benches/bench_diffgeo.cpp`
+  - `src/main_diffusion_geometry.cpp`
+- Benchmark commands:
+  - Smoke only:
+    - `cmake --build build --target bench_diffgeo igneous-diffusion-geometry test_ops_diffusion_forms test_ops_diffusion_wedge -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `146.086 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.173 ms`
+  - `bench_diffgeo_phase_circular/4000/64/32/32`: `3.831 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - not run
+- Decision: `rejected`
+- Rejected because:
+  - The end-to-end diffgeo pipeline stayed effectively flat, so the reconstruction gamma-data precompute is not a worthwhile bottleneck on the current head.
+- Notes:
+  - This narrows the remaining diffgeo residual further toward dense form-spectrum work and reconstruction itself rather than the `Gamma(data, immersion)` precompute.
+
+## 2026-03-06 Dense Generalized Spectrum Cholesky Whitening
+- Timestamp: 2026-03-07T02:56:38Z
+- Commit: `49b3bd3`
+- Primary target:
+  - `bench_hodge_phase_solve`
+  - `bench_pipeline_hodge_main`
+- Hypothesis:
+  - For dense generalized spectra with SPD mass matrices, replace the mass-eigendecomposition whitening path with direct Cholesky whitening and fall back to the old regularized mass-eigenspace path only when the Cholesky factorization fails.
+- Files touched:
+  - `include/igneous/ops/diffusion/forms.hpp`
+  - `include/igneous/ops/diffusion/hodge.hpp`
+- Benchmark commands:
+  - Exact baseline on the reverted `5e07f81` worktree:
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_solve' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_form_spectrum_cholesky_baseline_20260307-025546.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_diffgeo_form_spectrum_cholesky_baseline_20260307-025546.txt'`
+  - Candidate:
+    - `cmake --build build --target bench_pipelines bench_diffgeo test_ops_hodge test_ops_diffusion_forms test_ops_diffusion_wedge test_ops_spectral_geometry igneous-hodge igneous-diffusion-geometry -j8`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines --benchmark_filter='bench_pipeline_hodge_main|bench_hodge_phase_eigenbasis|bench_hodge_phase_solve' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_pipelines_form_spectrum_cholesky_candidate_20260307-025546.txt'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true > 'notes/perf/20260306-wave3/results/bench_diffgeo_form_spectrum_cholesky_candidate_20260307-025546.txt'`
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_hodge|test_ops_diffusion_forms|test_ops_diffusion_wedge|test_ops_spectral_geometry|test_hodge_cli_outputs|test_diffgeo_cli_outputs'`
+    - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - `bench_pipeline_hodge_main`: `150.555 ms`
+  - `bench_hodge_phase_eigenbasis`: `117.141 ms`
+  - `bench_hodge_phase_solve`: `2.169 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `148.712 ms`
+  - isolated diffgeo rerun:
+    - `bench_diffgeo_pipeline/4000/64/32/32`: `146.660 ms`
+- Deep results:
+  - Exact A/B artifacts:
+    - `notes/perf/20260306-wave3/results/bench_pipelines_form_spectrum_cholesky_baseline_20260307-025546.txt`
+    - `notes/perf/20260306-wave3/results/bench_pipelines_form_spectrum_cholesky_candidate_20260307-025546.txt`
+    - `notes/perf/20260306-wave3/results/bench_diffgeo_form_spectrum_cholesky_baseline_20260307-025546.txt`
+    - `notes/perf/20260306-wave3/results/bench_diffgeo_form_spectrum_cholesky_candidate_20260307-025546.txt`
+  - `bench_pipeline_hodge_main`: `150.572 ms` -> `149.293 ms` CPU (`-0.85%`)
+  - `bench_hodge_phase_eigenbasis`: `116.737 ms` -> `115.785 ms` CPU (`-0.82%`)
+  - `bench_hodge_phase_solve`: `3.460 ms` -> `2.151 ms` CPU (`-37.82%`)
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `146.518 ms` -> `145.730 ms` CPU (`-0.54%`)
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.471 ms` -> `107.411 ms` CPU (`+0.88%`)
+- App/runtime checks:
+  - not run; benchmark and test coverage were sufficient for this dense internal solver change
+- Test results:
+  - `test_ops_hodge`: pass
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_diffusion_wedge`: pass
+  - `test_ops_spectral_geometry`: pass
+  - `test_hodge_cli_outputs`: pass
+  - `test_diffgeo_cli_outputs`: pass
+  - Full `ctest`: `14/14` pass
+- Decision: `kept`
+- Rejected because:
+  - n/a
+- Notes:
+  - The dense generalized form and Hodge solves use SPD mass matrices on the current workloads, so Cholesky whitening avoids the full mass-eigendecomposition cost while preserving the old path as a fallback for degenerate cases.
+
+## 2026-03-06 Matrix Diffgeo Mixed-Gamma Cache
+- Timestamp: 2026-03-07T03:00:44Z
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+  - `bench_diffgeo_phase_k1_up/4000/64/32/32`
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`
+- Hypothesis:
+  - Replace the mean-centered `DiffusionFormWorkspace` mixed-gamma cache fill with exact matrix formulas over `P U` and `P coords`, so the pipeline stops paying `3 * n_coeff` scalar `carre_du_champ` sweeps every time `forms_ws.reset_cache()` is hit.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+  - `include/igneous/ops/diffusion/forms.hpp`
+  - `include/igneous/ops/diffusion/hodge.hpp`
+- Benchmark commands:
+  - Smoke only:
+    - `cmake --build build --target bench_diffgeo bench_pipelines test_ops_diffusion_forms test_ops_diffusion_wedge test_ops_hodge igneous-diffusion-geometry igneous-hodge -j8`
+    - `ctest --test-dir build --output-on-failure -R 'test_ops_diffusion_forms|test_ops_diffusion_wedge|test_ops_hodge|test_diffgeo_cli_outputs|test_hodge_cli_outputs'`
+    - `IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo --benchmark_filter='bench_diffgeo_pipeline/4000/64/32/32|bench_diffgeo_phase_eigenbasis/4000/64/32/32|bench_diffgeo_phase_k1_gram/4000/64/32/32|bench_diffgeo_phase_k1_up/4000/64/32/32|bench_diffgeo_phase_k2_gram/4000/64/32/32|bench_diffgeo_phase_k2_up/4000/64/32/32|bench_diffgeo_phase_circular/4000/64/32/32' --benchmark_min_time=0.05s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true`
+- Smoke results:
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `149.011 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.862 ms`
+  - `bench_diffgeo_phase_k1_gram/4000/64/32/32`: `1.862 ms`
+  - `bench_diffgeo_phase_k1_up/4000/64/32/32`: `7.372 ms`
+  - `bench_diffgeo_phase_k2_gram/4000/64/32/32`: `1.846 ms`
+  - `bench_diffgeo_phase_k2_up/4000/64/32/32`: `8.821 ms`
+  - `bench_diffgeo_phase_circular/4000/64/32/32`: `3.877 ms`
+- Deep results:
+  - not run; rejected at smoke stage
+- App/runtime checks:
+  - not run
+- Test results:
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_diffusion_wedge`: pass
+  - `test_ops_hodge`: pass
+  - `test_diffgeo_cli_outputs`: pass
+  - `test_hodge_cli_outputs`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The end-to-end diffgeo pipeline regressed and the operator-phase movement was too noisy to justify keeping a larger cache/matrixization rewrite.
+- Notes:
+  - The accepted Hodge mixed-gamma matrixization did not carry over cleanly to the generic form workspace; the remaining diffgeo bottleneck is not simply the mixed-gamma cache fill.
+
+## Hypothesis: add diffgeo coverage to CI smoke and deep perf reports
+- Timestamp: 2026-03-07T15:30:00-07:00
+- Commit: `cf653c4`
+- Primary target:
+  - PR smoke/deep visibility for `bench_diffgeo`
+- Hypothesis:
+  - Add `bench_diffgeo` to the GitHub perf workflows and emit a fallback `not comparable` section when the baseline branch predates the target, so CI keeps showing the diffgeo torus timings instead of dropping them.
+- Files touched:
+  - `.github/workflows/perf-smoke.yml`
+  - `.github/workflows/perf-deep.yml`
+- Validation:
+  - Parsed both workflow YAML files locally.
+  - Simulated the baseline-missing-target path through `scripts/perf/compare_against_main.py` and verified that the generated markdown still emits current `bench_diffgeo` values.
+- Decision: `kept`
+- Notes:
+  - This is instrumentation only; it does not change runtime behavior.
+
+## Hypothesis: weighted coordinate seed for large-basis symmetric eigensolve
+- Timestamp: 2026-03-07T15:31:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+- Hypothesis:
+  - Keep the accepted symmetric Lanczos path but seed the large-basis solve with a row-sum-weighted coordinate vector to reduce restart time.
+- Files touched:
+  - `include/igneous/ops/diffusion/spectral.hpp`
+- Smoke results:
+  - `bench_eigenbasis/2000/16`: `8.422 ms` -> `8.455 ms`
+  - `bench_pipeline_spectral_main`: `17.435 ms` -> `17.815 ms`
+  - `bench_pipeline_hodge_main`: `148.242 ms` -> `164.046 ms`
+  - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `132.063 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `148.745 ms` -> `149.748 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `111.772 ms`
+- Decision: `rejected`
+- Rejected because:
+  - The actual 64-mode hodge/diffgeo guards regressed materially; the seed quality trade did not pay for the extra movement in restart/orthogonalization.
+
+## Hypothesis: avoid extra copies in CPU Markov multi-step path
+- Timestamp: 2026-03-07T15:32:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_markov_multi_step/20000/20`
+  - `bench_pipeline_diffusion_main/100`
+- Hypothesis:
+  - Remove unnecessary whole-vector copies from repeated CPU Markov steps when the input/output buffers are already distinct.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+- Smoke results:
+  - `bench_diffusion_build/2000`: `3.449 ms` -> `3.273 ms`
+  - `bench_markov_step/2000`: `16.169 us` -> `18.376 us`
+  - `bench_markov_multi_step/2000/20`: `329.497 us` -> `349.644 us`
+  - `bench_markov_multi_step/20000/20`: `4.356 ms` -> `4.142 ms`
+  - `bench_pipeline_diffusion_main/20`: `4.330 ms` -> `4.199 ms`
+  - `bench_pipeline_diffusion_main/100`: `5.344 ms` -> `5.793 ms`
+- Test results:
+  - `test_structure_diffusion_geometry`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The copy-avoidance path only helped one large microbenchmark and regressed the higher-priority `/100` diffusion pipeline guard.
+
+## Hypothesis: short-circuit serial `parallel_for_index` before env/backend lookup
+- Timestamp: 2026-03-07T15:33:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+- Hypothesis:
+  - Skip the backend/env/thread discovery in `parallel_for_index` whenever the loop is already below the serial threshold, because sample traces showed that overhead inside the sparse eigensolve matvec/restart path.
+- Files touched:
+  - `include/igneous/core/parallel.hpp`
+- Profiling notes:
+  - `sample` on `bench_diffgeo_phase_eigenbasis/4000/64/32/32` showed most time in Spectra restart/lanczos code and Eigen dense matvecs, with smaller but visible `parallel_for_index` env/thread lookup overhead.
+- Smoke results:
+  - `bench_eigenbasis/2000/16`: `8.422 ms` -> `8.574 ms`
+  - `bench_pipeline_spectral_main`: `17.435 ms` -> `17.004 ms`
+  - `bench_pipeline_hodge_main`: `148.242 ms` -> `149.110 ms`
+  - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `114.631 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `148.745 ms` -> `155.672 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `108.934 ms`
+- Test results:
+  - `test_ops_spectral_geometry`: pass
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_hodge`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The serial fast-path saved too little in the actual spectral hot loop and regressed the diffgeo eigensolve guard.
+
+## Hypothesis: DEC-specialized curvature kernel over direct SoA/CSR storage
+- Timestamp: 2026-03-07T15:34:00-07:00
+- Commit: `59336a7`
+- Primary target:
+  - `bench_curvature_kernel/400`
+  - `bench_geometry` `Grid 1000x1000`
+- Hypothesis:
+  - Specialize `compute_curvature_measures()` for `DiscreteExteriorCalculus` so it traverses `face_v*`, `vertex_face_*`, and `space.x/y/z` directly, removing repeated `get_vec3()` calls, generic face-corner dispatch, and temporary `Vec3` construction while preserving the same curvature formulas.
+- Files touched:
+  - `include/igneous/ops/dec/curvature.hpp`
+- Benchmark commands:
+  - `ctest --test-dir build --output-on-failure -R test_ops_curvature_flow`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_geometry | tee notes/perf/20260306-wave3/results/bench_geometry_dec_curvature_20260307-152340.txt`
+  - `IGNEOUS_BENCH_MODE=1 ./build/bench_dod --benchmark_filter='bench_curvature_kernel/400|bench_flow_kernel/400|bench_mesh_structure_build/400|bench_markov_step/2000|bench_markov_multi_step/2000/20|bench_markov_multi_step/20000/20' --benchmark_min_time=0.1s --benchmark_repetitions=5 --benchmark_report_aggregates_only=true | tee notes/perf/20260306-wave3/results/bench_dod_dec_curvature_20260307-152340.txt`
+  - `ctest --test-dir build --output-on-failure -j8`
+- Smoke results:
+  - `bench_curvature_kernel/400`: `888653 ns` -> `816836 ns` CPU (`-8.08%`)
+  - `bench_geometry` `Grid 1000x1000` curvature: `5.716 ms` -> `5.358 ms` (`-6.26%`)
+  - `bench_geometry` `Grid 1000x1000` structure: `14.516 ms` -> `13.661 ms` (`-5.89%`)
+  - `bench_flow_kernel/400`: `196297 ns` -> `195466 ns` CPU (`-0.42%`)
+  - `bench_markov_multi_step/20000/20`: `4.356 ms` -> `4.099 ms` (`-5.88%`, guard unchanged by unrelated variance)
+- Test results:
+  - `test_ops_curvature_flow`: pass
+  - full `ctest`: `14/14` pass
+- Decision: `kept`
+- Notes:
+  - The first single `bench_geometry` rerun was noisy on the 1e6-vertex case; follow-up repetitions settled in below the pre-change baseline, so the isolated DOD kernel and large-grid geometry guard both moved in the right direction.
+
+## Hypothesis: hoist CPU Markov-step dispatch and parallelize repeated row sweeps
+- Timestamp: 2026-03-07T15:43:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_markov_multi_step/20000/20`
+  - `bench_pipeline_diffusion_main/20`
+  - `bench_pipeline_diffusion_main/100`
+- Hypothesis:
+  - Move the CPU CSR row sweep into a shared helper, compute backend/worker selection once per repeated Markov call, and enable parallel row execution only when `row_step_work` is large enough to amortize the worker-pool overhead.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+- Smoke results:
+  - `bench_diffusion_build/2000`: `3.438 ms` -> `3.222 ms`
+  - `bench_markov_step/2000`: `17.938 us` -> `20.352 us`
+  - `bench_markov_multi_step/2000/20`: `350.765 us` -> `382.900 us`
+  - `bench_markov_multi_step/20000/20`: `3.881 ms` -> `2.091 ms`
+  - `bench_pipeline_diffusion_main/20`: `4.187 ms` -> `4.927 ms`
+  - `bench_pipeline_diffusion_main/100`: `5.711 ms` -> `7.972 ms`
+- Test results:
+  - `test_structure_diffusion_geometry`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The helper did improve the largest isolated Markov microbenchmark, but it regressed both end-to-end diffusion pipeline guards badly enough that the trade is unacceptable.
+
+## Hypothesis: parallelize `carre_du_champ` row sweeps directly
+- Timestamp: 2026-03-07T15:44:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_hodge_phase_curl_energy`
+  - `bench_pipeline_hodge_main`
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+- Hypothesis:
+  - Remove the redundant zero-fill in `carre_du_champ()` and parallelize both the mean-center pass and the final row accumulation above a conservative row threshold.
+- Files touched:
+  - `include/igneous/ops/diffusion/geometry.hpp`
+- Smoke results before diffgeo abort:
+  - `bench_pipeline_hodge_main`: `153.294 ms` -> `149.364 ms`
+  - `bench_hodge_phase_gram`: `1.514 ms` -> `1.409 ms`
+  - `bench_hodge_phase_weak_derivative`: `1.112 ms` -> `1.112 ms`
+  - `bench_hodge_phase_curl_energy`: `12.130 ms` -> `10.827 ms`
+- Diffgeo behavior:
+  - `bench_diffgeo` did not return under the modified kernel and had to be terminated.
+- Decision: `rejected`
+- Rejected because:
+  - The row-parallel gamma path likely nests into existing worker-pool parallelism in the generic form assembly, which makes it unsafe for the diffgeo pipeline despite the hodge-side movement.
+
+## Hypothesis: scalarize ambient form reconstruction in diffgeo/hodge apps and benches
+- Timestamp: 2026-03-07T15:45:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_pipeline/4000/64/32/32`
+  - `igneous-diffusion-geometry --n-points 1000`
+- Hypothesis:
+  - Replace the per-point `Eigen::Matrix3f` reconstruction path with direct scalar formulas and pointer-based column loads for both 1-form and dual-2-form ambient recovery.
+- Files touched:
+  - `benches/bench_diffgeo.cpp`
+  - `src/main_diffusion_geometry.cpp`
+  - `src/main_hodge.cpp`
+- Smoke results:
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `149.864 ms` -> `153.718 ms`
+  - app guard `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`: `real 0.07` -> `real 0.09`
+- Test results:
+  - `test_diffgeo_cli_outputs`: pass
+  - `test_hodge_cli_outputs`: pass
+- Decision: `rejected`
+- Rejected because:
+  - The scalar rewrite is mathematically equivalent, but on the current head it regressed both the diffgeo benchmark guard and the app-level wall-time guard.
+
+## Hypothesis: cache backend/thread env lookups for `parallel_for_index`
+- Timestamp: 2026-03-07T15:46:00-07:00
+- Commit: n/a
+- Primary target:
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`
+  - `bench_hodge_phase_eigenbasis`
+  - `bench_pipeline_hodge_main`
+- Hypothesis:
+  - Read `IGNEOUS_BACKEND` and `IGNEOUS_NUM_THREADS` once per process instead of reparsing them on every `parallel_for_index()` call, to remove repeated `getenv` and `hardware_concurrency()` overhead from the eigensolve-heavy paths.
+- Files touched:
+  - `include/igneous/core/parallel.hpp`
+- Smoke results:
+  - `bench_eigenbasis/2000/16`: `8.199 ms` -> `8.539 ms`
+  - `bench_pipeline_spectral_main`: `16.783 ms` -> `18.089 ms`
+  - `bench_pipeline_hodge_main`: `153.294 ms` -> `160.077 ms`
+  - `bench_hodge_phase_eigenbasis`: `116.248 ms` -> `123.248 ms`
+  - `bench_diffgeo_pipeline/4000/64/32/32`: `149.864 ms` -> `161.629 ms`
+  - `bench_diffgeo_phase_eigenbasis/4000/64/32/32`: `106.213 ms` -> `113.714 ms`
+- Test results:
+  - `test_ops_spectral_geometry`: pass
+  - `test_ops_diffusion_forms`: pass
+  - `test_ops_hodge`: pass
+- Decision: `rejected`
+- Rejected because:
+  - Even if the cached env lookup slightly helps tiny loops, it regresses the actual 64-mode guard path across spectral, hodge, and diffgeo.
+
+## Profiling: current diffgeo pipeline
+- Timestamp: 2026-03-07T15:47:00-07:00
+- Artifact:
+  - `notes/perf/20260306-wave3/profiles/20260307-154012-diffgeo-pipeline/sample.txt`
+- Main observations:
+  - The diffgeo pipeline is still dominated by the large sparse diffusion eigensolve.
+  - The same Spectra restart/lanczos path and Eigen dense orthogonalization matvecs remain hot on the current head.
+  - The remaining `parallel_for_index` env/thread lookup overhead is visible in samples, but it is clearly not large enough to justify the regression from the cached-env experiment.
diff --git a/notes/perf/20260306-wave3/prior-art.md b/notes/perf/20260306-wave3/prior-art.md
new file mode 100644
index 0000000..6ca7492
--- /dev/null
+++ b/notes/perf/20260306-wave3/prior-art.md
@@ -0,0 +1,56 @@
+# Prior Art
+
+This campaign should build on previous accepted optimizations and avoid repeating rejected paths.
+
+## Preserve
+- CSR-backed diffusion operators and direct Markov traversal.
+- Current threaded worker-pool backend and conservative GPU gating.
+- Symmetric-kernel eigensolver path and current compact-plus-fallback Arnoldi policy.
+- Corrected generic k-form semantics from the diffusion-geometry parity work.
+- Shared circular-coordinate setup reuse within a single pipeline call; keep the existing mode ordering and selection semantics unchanged.
+- Parallel upper-triangle assembly for generic form up-Laplacians; it preserved outputs and removed the serial bottleneck in the basis-pair sweep.
+- Parallelized/fused post-kNN diffusion-build passes while keeping the existing Eigen sparse symmetrization path; this is the accepted build optimization over the rejected direct CSR rewrite.
+- Hodge workspace gamma-cache reuse within a fixed mesh/eigenbasis state; if the workspace is reused across rebuilds, reset it explicitly first.
+- Stable block-prefix DEC face-incidence construction for large meshes; it preserves per-vertex face order while removing the large-mesh serial bottleneck.
+- Matrix-form circular workspace preparation on the mean-centered diffusion path; keep the exact covariance formulas for `Gamma(x, phi)` and the scalar weak Laplacian instead of reverting to pairwise scalar `carre_du_champ` sweeps.
+- Relaxed Spectra tolerance only for small-basis solves (`n_eigenvectors <= 16`); the 16-mode scalar spectral path benefits, but the 64-mode hodge/diffgeo guards should stay on the stricter convergence target.
+- Matrix-form Hodge curl-energy assembly on the mean-centered path; keep the exact `U^T diag(P^T c) U - (P U)^T diag(c) (P U)` formulation plus the weighted mixed-gamma cross term instead of the old basis-pair `carre_du_champ` sweeps.
+- Cholesky whitening for dense generalized form/Hodge spectra with fallback to the old mass-eigenspace path; this is the accepted fast path for the current SPD mass matrices.
+
+## Do Not Repeat
+- Tightening Arnoldi subspace below the current accepted compact fallback.
+- Circular-coordinate solver/order changes that alter mode selection.
+- KD-tree leaf-size experiments beyond the accepted setting.
+- Scratch-reuse-only attempts in diffusion build.
+- Hybrid brute-force kNN variants.
+- Naive circular-coordinate cache reuse.
+- The rejected 1-form block-GEMM rewrite as-is.
+- Low-value curl-energy micro-tweaks that only move isolated kernels without pipeline wins.
+- Precomputing normalized symmetric-kernel CSR values during diffusion build; it improved isolated eigenbasis solves but failed the build-vs-pipeline trade.
+- Replacing the circular generalized solve with a cached mass-space standard solve; it only moved the phase at noise level and regressed the diffgeo pipeline.
+- Deflating the trivial normalized-kernel diffusion mode inside the symmetric Spectra path; it did not produce a clear, repeatable pipeline win.
+- Replacing the circular generalized eigensolver with the macOS Accelerate/CLAPACK path; it only moved the circular phase at noise level and added platform-specific complexity.
+- Batching the paired circular-operator assembly; it did not materially reduce the remaining circular phase.
+- The direct CSR symmetrization rewrite tested in Wave 3 improved some structure phases but regressed downstream spectral/pipeline behavior; do not reuse that exact formulation.
+- Treat the `4000/64` diffgeo torus eigensolve as a guard when changing diffusion build internals; the accepted post-kNN fusion pass already spends some of that headroom.
+- Lowering the spectral CSR matvec parallel threshold to force parallel row sweeps at `n ~= 2k-4k`; the worker-pool overhead regressed the eigensolve-heavy paths.
+- Relaxing the Spectra tolerance globally on large-basis solves; it helps the 16-mode scalar spectral path but regresses the 64-mode hodge guard.
+- Replacing the symmetric normalized-kernel solver with Spectra's Davidson path; it remained much slower than the accepted Lanczos solver even with a direct CSR adapter.
+- Matrixizing `Gamma(data_coord, immersion_coord)` for diffgeo reconstruction; the diffgeo pipeline stayed effectively flat, so that precompute is not worth revisiting on the current head.
+- Matrixizing the mean-centered `DiffusionFormWorkspace` mixed-gamma cache fill; the diffgeo pipeline regressed and the operator-phase movement was too noisy to justify the larger cache rewrite.
+- Weighted coordinate seeds for the large-basis symmetric diffusion eigensolve; they regressed the 64-mode hodge/diffgeo guards.
+- CPU Markov multi-step copy-avoidance rewrites that change ping/pong ownership; they only help isolated large microcases and regress the higher-priority pipeline guard.
+- Early serial short-circuiting inside `parallel_for_index`; the env/backend lookup overhead is not the dominant sparse-eigensolve cost on the current head.
+- Hoisting CPU Markov repeated-step dispatch into a parallel row helper; it helps one large microbenchmark but regresses the diffusion pipeline guards.
+- Parallel row-sweep versions of `carre_du_champ`; they interact badly with nested worker-pool usage in the generic form path.
+- Scalarized ambient reconstruction for diffgeo/hodge export paths; it regressed the diffgeo pipeline and app guard on the current head.
+- Caching `IGNEOUS_BACKEND` / `IGNEOUS_NUM_THREADS` once per process; it regressed the 64-mode spectral/hodge/diffgeo guard path.
+
+## New Focus
+- Benchmark and optimize the generic k-form pipeline, which was added after most of the earlier perf work.
+- Reuse already-computed 1-form operators inside the 2-form path.
+- Remove exact recomputation before trying lower-level loop tuning.
+- Preserve the lazy normalized symmetric-kernel cache inside eigensolve; it is the accepted spectral improvement over the rejected build-time precompute.
+- Revisit diffusion build, circular solve internals, or non-diffusion geometry kernels only if they can beat the current head without shifting cost into downstream spectral behavior.
+- Treat the 16-mode scalar spectral path separately from the 64-mode hodge/diffgeo guard path when changing solver tolerances or policy.
+- Treat dense generalized spectra separately from the large sparse diffusion eigensolve; the dense form/Hodge solve now has an accepted Cholesky fast path and should not be conflated with the sparse basis extraction problem.
diff --git a/notes/perf/20260306-wave3/profiles/.gitkeep b/notes/perf/20260306-wave3/profiles/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/notes/perf/20260306-wave3/profiles/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/notes/perf/20260306-wave3/report.md b/notes/perf/20260306-wave3/report.md
new file mode 100644
index 0000000..d6be220
--- /dev/null
+++ b/notes/perf/20260306-wave3/report.md
@@ -0,0 +1,163 @@
+# Wave 3 Report
+
+## Outcome
+- Status: active.
+- Current focus: large-basis diffusion eigenbasis time first, then remaining diffgeo spectrum/reconstruction overhead.
+- Current branch head: `49b3bd3`.
+
+## Kept Improvements
+1. `2c56f6d` `perf(diffgeo): add wave3 harness and reuse form operators`
+   - Added `bench_diffgeo` and the Wave 3 perf note scaffolding.
+   - Reused 1-form operators inside the 2-form path and replaced the inner `2x2` solve with closed-form adjugate arithmetic.
+2. `6ca4ba7` `perf(diffgeo): densify weak derivative assembly`
+   - Rewrote the generic k-form weak derivative around dense block couplings.
+3. `2810304` `perf(spectral): reuse circular coordinate setup`
+   - Shared circular-coordinate setup across the paired solves and densified `X_op` assembly.
+4. `2a6213d` `perf(diffgeo): parallelize form up-laplacian assembly`
+   - Parallelized exact upper-triangle assembly for the generic up-Laplacian sweep.
+5. `1425f1f` `perf(spectral): cache normalized kernel operator`
+   - Cached the normalized symmetric-kernel CSR lazily on first eigensolve so the solver stops reapplying row normalization inside every matvec.
+6. `dee719f` `perf(diffusion): fuse post-knn build passes`
+   - Parallelized and fused the post-kNN diffusion-build passes while keeping the accepted sparse symmetrization path intact.
+7. `89e82be` `perf(hodge): cache gamma assembly in workspace`
+   - Reused mixed gamma and coordinate gamma inside `HodgeWorkspace`, then reset the workspace explicitly in rebuild-heavy loops.
+8. `5d0f4db` `perf(dec): parallelize stable face-incidence build`
+   - Replaced the large-mesh serial DEC face-incidence count/write passes with a stable block-prefix build that preserves per-vertex face order.
+9. `907b8ac` `perf(circular): matrixize workspace preparation`
+   - Replaced the mean-centered circular workspace preparation sweeps with exact matrix formulas for `Gamma(x, phi)` and the scalar weak Laplacian.
+10. `2a24aef` `perf(spectral): loosen tolerance for small basis solves`
+   - Relaxed the Spectra convergence tolerance only for `n_eigenvectors <= 16`, cutting the scalar spectral path without moving the 64-mode hodge/diffgeo guards.
+11. `5e07f81` `perf(hodge): matrixize curl energy assembly`
+   - Replaced the mean-centered Hodge curl-energy basis-pair sweeps with exact dense block contractions over cached Markov and gamma data.
+12. `49b3bd3` `perf(spectral): use cholesky whitening for dense solves`
+   - Added a direct Cholesky whitening fast path for dense generalized form/Hodge spectra with exact fallback to the older mass-eigenspace regularization path.
+
+## Rejected Fronts
+- Normalized symmetric-kernel precompute during diffusion build.
+  - Improved isolated eigenbasis kernels.
+  - Failed the build-vs-pipeline trade and regressed `bench_diffusion_build/2000`.
+- Circular mass-space standard solve.
+  - Changed solver strategy without producing a meaningful circular or pipeline win.
+- Symmetric normalized-kernel sort/init variants.
+  - `LargestAlge` and `sqrt(row_sums)` seeding improved some scalar spectral cases but regressed higher-rank hodge/diffgeo eigensolve targets.
+- Large-mesh atomic DEC face incidence.
+  - Improved isolated structure build but hurt large-grid curvature/flow locality enough to lose the overall geometry benchmark.
+- Symmetric trivial-mode deflation in the normalized-kernel eigensolver.
+  - Did not produce a clear repeatable win across the spectral, hodge, and diffgeo pipeline guards.
+- Accelerate-backed circular generalized eigensolve.
+  - Barely moved the circular phase and added an Apple-only code path.
+- Batched paired circular assembly.
+  - Removed duplicated operator setup work on paper, but did not materially reduce the actual circular phase.
+- Lower spectral matvec parallel threshold.
+  - Forcing worker-pool parallelism inside the CSR matvec regressed the eigensolve-heavy paths.
+- Global spectral tolerance relaxation.
+  - Helped the 16-mode scalar spectral path but regressed the 64-mode hodge eigensolve.
+- Davidson symmetric eigensolver.
+  - The JD solver family was dramatically slower than the accepted symmetric Lanczos path on the 64-mode target.
+- Matrix diffgeo gamma-data immersion.
+  - The exact covariance rewrite for `Gamma(data_coord, immersion_coord)` did not move the end-to-end diffgeo pipeline enough to keep.
+- Matrix diffgeo mixed-gamma cache.
+  - The mean-centered matrix rewrite for the generic form mixed-gamma cache regressed the end-to-end diffgeo pipeline and was rejected at smoke.
+
+## Final Headline Numbers
+Using the Wave 3 artifacts in this directory:
+- Diffgeo pipeline:
+  - Baseline after Wave 3 harness landed: `bench_diffgeo_pipeline/4000/64/32/32 = 4169.532 ms` in `baseline.md`
+  - Current head: `145.730 ms` CPU in `bench_diffgeo_form_spectrum_cholesky_candidate_20260307-025546.txt`
+- Diffgeo circular phase:
+  - Baseline: `2416.567 ms` in `baseline.md`
+  - Current head: `3.821 ms` CPU in `bench_diffgeo_circularprep_candidate_20260306-192320.txt`
+- Diffgeo eigenbasis phase:
+  - Baseline after the harness landed: `893.912 ms` in `baseline.md`
+  - Current head: `107.411 ms` CPU in `bench_diffgeo_form_spectrum_cholesky_candidate_20260307-025546.txt`
+- Hodge pipeline:
+  - Current head: `149.293 ms` CPU in `bench_pipelines_form_spectrum_cholesky_candidate_20260307-025546.txt`
+- Hodge circular phase:
+  - Current head: `4.034 ms` CPU in `bench_pipelines_hodge_curl_matrix_candidate_20260307-024447.txt`
+- Hodge eigenbasis phase:
+  - Current head: `115.785 ms` CPU in `bench_pipelines_form_spectrum_cholesky_candidate_20260307-025546.txt`
+- Hodge curl-energy phase:
+  - Current head: `10.199 ms` CPU in `bench_pipelines_hodge_curl_matrix_candidate_20260307-024447.txt`
+- Hodge dense solve phase:
+  - Current head: `2.151 ms` CPU in `bench_pipelines_form_spectrum_cholesky_candidate_20260307-025546.txt`
+- Spectral pipeline:
+  - Current head: `16.783 ms` CPU in `bench_pipelines_small_spectral_tol_candidate_20260307-023622.txt`
+- Small spectral eigensolve:
+  - `bench_eigenbasis/2000/16`: `11.368 ms` -> `8.199 ms` CPU
+- App guard:
+  - `igneous-diffusion-geometry --n-points 1000 --output-dir <tempdir>`
+  - Early Wave 3 head: `real 1.40`
+  - Current head: `real 0.07`
+  - `igneous-hodge --no-ply`: current head `real 0.05`
+- Geometry large-grid guard:
+  - `bench_geometry` `Grid 1000x1000`
+  - structure: `17.288 ms -> 14.532 ms`
+  - curvature: `5.450 ms -> 5.048 ms`
+  - flow: `0.904 ms -> 0.879 ms`
+
+## Current Hotspot Order
+### Diffgeo
+- `eigenbasis`
+- remaining spectrum/reconstruction overhead
+- remaining misc pipeline overhead
+
+### Hodge
+- `eigenbasis`
+- remaining spectrum overhead
+
+## Assessment
+- The high-confidence, behavior-preserving wins in diffusion build, generic form assembly, and Hodge gamma reuse have been taken.
+- Build-heavy workloads are no longer the main problem.
+- The matrix circular-prep rewrite removed the dominant remaining circular hotspot without changing solver behavior, which reset the hotspot ranking.
+- The small-basis spectral path still had oversolve headroom, and relaxing its convergence target reclaimed that without moving the higher-priority 64-mode guards.
+- The exact matrix curl-energy rewrite removed most of the remaining non-eigenbasis Hodge cost without moving the eigensolve or circular guards.
+- The dense generalized-solve fast path now removes a large fraction of the remaining Hodge solve overhead while keeping the diffgeo pipeline within guard.
+- The main remaining shared cost is still the large-basis diffusion eigenbasis solve; after that, the pipelines are mostly diffgeo spectrum/reconstruction overhead rather than Hodge or circular assembly.
+- The new DEC face-incidence rewrite and the circular-prep rewrite both came from replacing large scalar sweeps with exact matrix formulations that preserve downstream behavior.
+
+## Recommended Next Wave
+- Only continue if one of these is acceptable:
+  - a new eigensolver trade that improves diffusion eigenbasis without shifting too much cost back into diffusion build,
+  - a mathematically exact reduction of the remaining scalar spectrum/reconstruction overhead,
+  - or another order-preserving data-oriented rewrite outside diffusion geometry like the accepted DEC face-incidence pass.
+
+## Post-PR Continuation
+
+### Kept Since The PR Was Opened
+- `cf653c4` `ci(perf): add diffgeo coverage to smoke and deep reports`
+- `59336a7` `perf(dec): specialize curvature traversal for DEC storage`
+
+### Current Additional Gains
+- DEC curvature:
+  - `bench_curvature_kernel/400`: `888653 ns -> 816836 ns` CPU (`-8.08%`)
+  - `bench_geometry` `Grid 1000x1000` curvature: `5.716 ms -> 5.358 ms` (`-6.26%`)
+  - `bench_geometry` `Grid 1000x1000` structure: `14.516 ms -> 13.661 ms` (`-5.89%`)
+
+### New Rejections
+- Weighted coordinate seed for the large-basis symmetric eigensolve.
+- CPU Markov multi-step copy-avoidance rewrite.
+- Early serial short-circuit in `parallel_for_index`.
+
+### Current Read Of The Remaining Work
+- The large sparse diffusion eigensolve is still the biggest shared runtime cost.
+- The geometry/DEC path still has room for order-preserving data-oriented rewrites, but the wins are now single-digit rather than order-of-magnitude.
+- GitHub perf smoke/deep will now surface `bench_diffgeo` even when the baseline branch predates that benchmark target.
+
+## Continuation Outcome
+
+### Additional Rejections After The DEC Curvature Pass
+- Hoisted/parallel CPU Markov repeated-step helper: improved one large microbenchmark but regressed both diffusion pipeline guards.
+- Parallel `carre_du_champ` row sweeps: helped some hodge-side phases but appears unsafe under diffgeo due to nested worker-pool behavior.
+- Scalarized ambient reconstruction in diffgeo/hodge apps and benches: regressed both the diffgeo benchmark guard and the diffgeo app wall-time guard.
+- Cached backend/thread env lookups in `parallel_for_index`: regressed the 64-mode spectral/hodge/diffgeo guard path.
+
+### New Profile Artifact
+- Current diffgeo pipeline sample:
+  - `notes/perf/20260306-wave3/profiles/20260307-154012-diffgeo-pipeline/sample.txt`
+- Readout:
+  - The large sparse diffusion eigensolve is still the dominant remaining shared cost.
+  - The next most likely wins require a deeper solver/orthogonalization change rather than another local cache or loop rewrite.
+
+### Stop Condition
+- This continuation hit multiple consecutive rejected fronts after the accepted DEC curvature pass.
+- The remaining plausible wins are now mostly in the large-basis eigensolver path, where the recent profiler-guided experiments did not clear the guard thresholds.
diff --git a/notes/perf/20260306-wave3/results/.gitkeep b/notes/perf/20260306-wave3/results/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/notes/perf/20260306-wave3/results/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/scripts/perf/profile_counters.sh b/scripts/perf/profile_counters.sh
index 43e6c73..689162e 100755
--- a/scripts/perf/profile_counters.sh
+++ b/scripts/perf/profile_counters.sh
@@ -9,8 +9,9 @@ fi
 ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
 cd "$ROOT_DIR"
 
+NOTES_DIR="${IGNEOUS_PERF_NOTES_DIR:-notes/perf}"
 TS="$(date +%Y%m%d-%H%M%S)"
-OUT_DIR="notes/perf/profiles/$TS"
+OUT_DIR="${NOTES_DIR}/profiles/${TS}"
 mkdir -p "$OUT_DIR"
 
 xctrace record \
diff --git a/scripts/perf/profile_time.sh b/scripts/perf/profile_time.sh
index 647d121..73a1e5b 100755
--- a/scripts/perf/profile_time.sh
+++ b/scripts/perf/profile_time.sh
@@ -9,8 +9,9 @@ fi
 ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
 cd "$ROOT_DIR"
 
+NOTES_DIR="${IGNEOUS_PERF_NOTES_DIR:-notes/perf}"
 TS="$(date +%Y%m%d-%H%M%S)"
-OUT_DIR="notes/perf/profiles/$TS"
+OUT_DIR="${NOTES_DIR}/profiles/${TS}"
 mkdir -p "$OUT_DIR"
 
 xctrace record \
diff --git a/scripts/perf/run_deep_bench.sh b/scripts/perf/run_deep_bench.sh
index b85f0db..5c6a0ec 100755
--- a/scripts/perf/run_deep_bench.sh
+++ b/scripts/perf/run_deep_bench.sh
@@ -4,13 +4,18 @@ set -euo pipefail
 ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
 cd "$ROOT_DIR"
 
-mkdir -p notes/perf/results
+NOTES_DIR="${IGNEOUS_PERF_NOTES_DIR:-notes/perf}"
+RESULTS_DIR="${NOTES_DIR}/results"
+
+mkdir -p "$RESULTS_DIR"
 
 TS="$(date +%Y%m%d-%H%M%S)"
-OUT_DOD_JSON="notes/perf/results/bench_dod_${TS}.json"
-OUT_DOD_TXT="notes/perf/results/bench_dod_${TS}.txt"
-OUT_PIPE_JSON="notes/perf/results/bench_pipelines_${TS}.json"
-OUT_PIPE_TXT="notes/perf/results/bench_pipelines_${TS}.txt"
+OUT_DOD_JSON="${RESULTS_DIR}/bench_dod_${TS}.json"
+OUT_DOD_TXT="${RESULTS_DIR}/bench_dod_${TS}.txt"
+OUT_PIPE_JSON="${RESULTS_DIR}/bench_pipelines_${TS}.json"
+OUT_PIPE_TXT="${RESULTS_DIR}/bench_pipelines_${TS}.txt"
+OUT_DIFFGEO_JSON="${RESULTS_DIR}/bench_diffgeo_${TS}.json"
+OUT_DIFFGEO_TXT="${RESULTS_DIR}/bench_diffgeo_${TS}.txt"
 
 IGNEOUS_BENCH_MODE=1 ./build/bench_dod \
   --benchmark_min_time=0.2s \
@@ -28,7 +33,17 @@ IGNEOUS_BENCH_MODE=1 ./build/bench_pipelines \
   --benchmark_out_format=json \
   | tee "$OUT_PIPE_TXT"
 
+IGNEOUS_BENCH_MODE=1 ./build/bench_diffgeo \
+  --benchmark_min_time=0.2s \
+  --benchmark_repetitions=10 \
+  --benchmark_report_aggregates_only=true \
+  --benchmark_out="$OUT_DIFFGEO_JSON" \
+  --benchmark_out_format=json \
+  | tee "$OUT_DIFFGEO_TXT"
+
 echo "Wrote: $OUT_DOD_JSON"
 echo "Wrote: $OUT_DOD_TXT"
 echo "Wrote: $OUT_PIPE_JSON"
 echo "Wrote: $OUT_PIPE_TXT"
+echo "Wrote: $OUT_DIFFGEO_JSON"
+echo "Wrote: $OUT_DIFFGEO_TXT"
diff --git a/src/main_diffusion_geometry.cpp b/src/main_diffusion_geometry.cpp
index 65f9973..87369af 100644
--- a/src/main_diffusion_geometry.cpp
+++ b/src/main_diffusion_geometry.cpp
@@ -389,6 +389,10 @@ int main(int argc, char** argv) {
 
   // 1-forms
   const Eigen::MatrixXf G1 = ops::diffusion::compute_kform_gram_matrix(mesh, 1, n_coeff, forms_ws);
+  const Eigen::MatrixXf G1_inv =
+      ops::diffusion::compute_kform_gram_inverse(mesh, 1, n_coeff, forms_ws);
+  const Eigen::MatrixXf D1 =
+      ops::diffusion::compute_weak_exterior_derivative(mesh, 1, n_coeff, forms_ws);
   const Eigen::MatrixXf down1 =
       ops::diffusion::compute_down_laplacian_matrix(mesh, 1, n_coeff, forms_ws);
   const Eigen::MatrixXf up1 =
@@ -419,17 +423,18 @@ int main(int argc, char** argv) {
 
   const int idx0 = harmonic1_idx.empty() ? 0 : harmonic1_idx[0];
   const int idx1 = harmonic1_idx.size() > 1 ? harmonic1_idx[1] : idx0;
+  ops::diffusion::CircularCoordinateWorkspace<DiffusionMesh> circular_ws;
   const Eigen::VectorXf theta_0 = ops::diffusion::compute_circular_coordinates(
       mesh, pad_1form_coeffs(evecs1.col(idx0), mesh.structure.eigen_basis.cols(), n_coeff), 0.0f,
-      cfg.circular_lambda, cfg.circular_mode_0, nullptr);
+      cfg.circular_lambda, cfg.circular_mode_0, nullptr, circular_ws);
   const Eigen::VectorXf theta_1 = ops::diffusion::compute_circular_coordinates(
       mesh, pad_1form_coeffs(evecs1.col(idx1), mesh.structure.eigen_basis.cols(), n_coeff), 0.0f,
-      cfg.circular_lambda, cfg.circular_mode_1, nullptr);
+      cfg.circular_lambda, cfg.circular_mode_1, nullptr, circular_ws);
 
   // 2-forms
   const Eigen::MatrixXf G2 = ops::diffusion::compute_kform_gram_matrix(mesh, 2, n_coeff, forms_ws);
   const Eigen::MatrixXf down2 =
-      ops::diffusion::compute_down_laplacian_matrix(mesh, 2, n_coeff, forms_ws);
+      ops::diffusion::assemble_down_laplacian_matrix_from_previous(D1, G1_inv);
   const Eigen::MatrixXf up2 =
       ops::diffusion::compute_up_laplacian_matrix(mesh, 2, n_coeff, forms_ws);
   const Eigen::MatrixXf L2 = ops::diffusion::assemble_hodge_laplacian_matrix(up2, down2);
diff --git a/src/main_hodge.cpp b/src/main_hodge.cpp
index 088b29b..4621ff7 100644
--- a/src/main_hodge.cpp
+++ b/src/main_hodge.cpp
@@ -424,10 +424,13 @@ int main(int argc, char** argv) {
 
   std::complex<float> selected_eval_0(0.0f, 0.0f);
   std::complex<float> selected_eval_1(0.0f, 0.0f);
+  ops::diffusion::CircularCoordinateWorkspace<DiffusionMesh> circular_ws;
   const auto theta_0 = ops::diffusion::compute_circular_coordinates(
-      mesh, evecs.col(0), 0.0f, cfg.circular_lambda, cfg.circular_mode_0, &selected_eval_0);
+      mesh, evecs.col(0), 0.0f, cfg.circular_lambda, cfg.circular_mode_0, &selected_eval_0,
+      circular_ws);
   const auto theta_1 = ops::diffusion::compute_circular_coordinates(
-      mesh, evecs.col(1), 0.0f, cfg.circular_lambda, cfg.circular_mode_1, &selected_eval_1);
+      mesh, evecs.col(1), 0.0f, cfg.circular_lambda, cfg.circular_mode_1, &selected_eval_1,
+      circular_ws);
 
   std::cout << "HODGE SPECTRUM (first 12 modes)\n";
   for (int i = 0; i < 12 && i < evals.size(); ++i) {