AztecProtocol · iakovenkos · Jun 4, 2026 · Jun 4, 2026
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.fuzzer.cpp
@@ -147,7 +147,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
     }
 
     // Check 2: SIMD x4 path agrees with scalar path lane-by-lane.
-    std::array<uint32_t, 4> simd_out{};
+    alignas(16) std::array<uint32_t, 4> simd_out{};
     production_simd(scalars, bit_offset, window_bits, simd_out);
     for (size_t lane = 0; lane < 4; ++lane) {
         const uint32_t want = production_scalar(scalars[lane].data(), bit_offset, window_bits);

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp
@@ -211,11 +211,9 @@ struct ConstantineSliceParamsU32 {
 }
 
 // Store a `SimdU32x4` to a 4-lane uint32 destination as a single 128-bit op.
-// On WASM the explicit `wasm_v128_store` is used because earlier codegen for
-// the equivalent struct-wrapper assignment was observed to round-trip the
-// vector through 4 scalar memory slots; the intrinsic guarantees the
-// `i32x4.store` opcode. On native the `vector_size` store lowers directly to
-// SSE2 `movdqu` / NEON `st1`.
+// Precondition: `dst` is 16-byte aligned.
+// On WASM the explicit `wasm_v128_store` intrinsic guarantees a `v128.store`; on
+// native the typed vector store lets the compiler use aligned SIMD stores (e.g. x86 movaps/movdqa).
 [[gnu::always_inline]] inline void simd_u32x4_store(uint32_t* dst, SimdU32x4 v) noexcept
 {
 #ifdef __wasm_simd128__

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.test.cpp
@@ -207,7 +207,7 @@ TEST(PippengerConstantine, SimdX4MatchesScalarPathLanewise)
                 std::array<std::array<uint64_t, NUM_LIMBS_U64>, 4> scalars{
                     random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs(), random_scalar_limbs()
                 };
-                std::array<uint32_t, 4> got_simd{};
+                alignas(16) std::array<uint32_t, 4> got_simd{};
                 production_simd_path(scalars.data(), bit_offset, window_bits, got_simd.data());
                 for (size_t lane = 0; lane < 4; ++lane) {
                     const uint32_t want = production_scalar_path(scalars[lane].data(), bit_offset, window_bits);