diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
index 70b4799ab..4ea52185a 100644
--- a/BIBLIOGRAPHY.md
+++ b/BIBLIOGRAPHY.md
@@ -91,7 +91,8 @@ source code and documentation.
   - [mldsa/src/fips202/fips202x4.c](mldsa/src/fips202/fips202x4.c)
   - [mldsa/src/poly.c](mldsa/src/poly.c)
   - [mldsa/src/poly_kl.c](mldsa/src/poly_kl.c)
-  - [mldsa/src/polyvec.c](mldsa/src/polyvec.c)
+  - [mldsa/src/polyvec_lazy.c](mldsa/src/polyvec_lazy.c)
+  - [mldsa/src/polyvec_lazy.h](mldsa/src/polyvec_lazy.h)
   - [mldsa/src/rounding.h](mldsa/src/rounding.h)
   - [mldsa/src/sign.c](mldsa/src/sign.c)
   - [mldsa/src/sign.h](mldsa/src/sign.h)
diff --git a/examples/bring_your_own_fips202/mldsa_native/src/polyvec_lazy.c b/examples/bring_your_own_fips202/mldsa_native/src/polyvec_lazy.c
new file mode 120000
index 000000000..0d268d68e
--- /dev/null
+++ b/examples/bring_your_own_fips202/mldsa_native/src/polyvec_lazy.c
@@ -0,0 +1 @@
+../../../../mldsa/src/polyvec_lazy.c
\ No newline at end of file
diff --git a/examples/bring_your_own_fips202_static/mldsa_native/src/polyvec_lazy.c b/examples/bring_your_own_fips202_static/mldsa_native/src/polyvec_lazy.c
new file mode 120000
index 000000000..0d268d68e
--- /dev/null
+++ b/examples/bring_your_own_fips202_static/mldsa_native/src/polyvec_lazy.c
@@ -0,0 +1 @@
+../../../../mldsa/src/polyvec_lazy.c
\ No newline at end of file
diff --git a/examples/custom_backend/mldsa_native/src/polyvec_lazy.c b/examples/custom_backend/mldsa_native/src/polyvec_lazy.c
new file mode 120000
index 000000000..0d268d68e
--- /dev/null
+++ b/examples/custom_backend/mldsa_native/src/polyvec_lazy.c
@@ -0,0 +1 @@
+../../../../mldsa/src/polyvec_lazy.c
\ No newline at end of file
diff --git a/integration/liboqs/ML-DSA-44_META.yml b/integration/liboqs/ML-DSA-44_META.yml
index 3c12c68e3..31f950773 100644
--- a/integration/liboqs/ML-DSA-44_META.yml
+++ b/integration/liboqs/ML-DSA-44_META.yml
@@ -35,8 +35,9 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/packing.c mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c
     mldsa/src/poly.h mldsa/src/poly_kl.c mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h
-    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
-    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/polyvec_lazy.c mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h
+    mldsa/src/rounding.h mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h
+    mldsa/src/zetas.inc
 - name: x86_64
   version: FIPS204
   folder_name: .
@@ -51,9 +52,10 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/native/api.h mldsa/src/native/meta.h mldsa/src/packing.c
     mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c mldsa/src/poly.h mldsa/src/poly_kl.c
-    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.h
-    mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h mldsa/src/sign.c
-    mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc mldsa/src/native/x86_64
+    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.c
+    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
+    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/native/x86_64
   supported_platforms:
   - architecture: x86_64
     operating_systems:
@@ -77,9 +79,10 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/native/api.h mldsa/src/native/meta.h mldsa/src/packing.c
     mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c mldsa/src/poly.h mldsa/src/poly_kl.c
-    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.h
-    mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h mldsa/src/sign.c
-    mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc mldsa/src/native/aarch64
+    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.c
+    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
+    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/native/aarch64
   supported_platforms:
   - architecture: arm_8
     operating_systems:
diff --git a/integration/liboqs/ML-DSA-65_META.yml b/integration/liboqs/ML-DSA-65_META.yml
index 79e762921..5c58b47cf 100644
--- a/integration/liboqs/ML-DSA-65_META.yml
+++ b/integration/liboqs/ML-DSA-65_META.yml
@@ -35,8 +35,9 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/packing.c mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c
     mldsa/src/poly.h mldsa/src/poly_kl.c mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h
-    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
-    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/polyvec_lazy.c mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h
+    mldsa/src/rounding.h mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h
+    mldsa/src/zetas.inc
 - name: x86_64
   version: FIPS204
   folder_name: .
@@ -51,9 +52,10 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/native/api.h mldsa/src/native/meta.h mldsa/src/packing.c
     mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c mldsa/src/poly.h mldsa/src/poly_kl.c
-    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.h
-    mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h mldsa/src/sign.c
-    mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc mldsa/src/native/x86_64
+    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.c
+    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
+    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/native/x86_64
   supported_platforms:
   - architecture: x86_64
     operating_systems:
@@ -77,9 +79,10 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/native/api.h mldsa/src/native/meta.h mldsa/src/packing.c
     mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c mldsa/src/poly.h mldsa/src/poly_kl.c
-    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.h
-    mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h mldsa/src/sign.c
-    mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc mldsa/src/native/aarch64
+    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.c
+    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
+    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/native/aarch64
   supported_platforms:
   - architecture: arm_8
     operating_systems:
diff --git a/integration/liboqs/ML-DSA-87_META.yml b/integration/liboqs/ML-DSA-87_META.yml
index db3c4383e..e67633095 100644
--- a/integration/liboqs/ML-DSA-87_META.yml
+++ b/integration/liboqs/ML-DSA-87_META.yml
@@ -35,8 +35,9 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/packing.c mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c
     mldsa/src/poly.h mldsa/src/poly_kl.c mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h
-    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
-    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/polyvec_lazy.c mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h
+    mldsa/src/rounding.h mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h
+    mldsa/src/zetas.inc
 - name: x86_64
   version: FIPS204
   folder_name: .
@@ -51,9 +52,10 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/native/api.h mldsa/src/native/meta.h mldsa/src/packing.c
     mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c mldsa/src/poly.h mldsa/src/poly_kl.c
-    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.h
-    mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h mldsa/src/sign.c
-    mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc mldsa/src/native/x86_64
+    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.c
+    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
+    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/native/x86_64
   supported_platforms:
   - architecture: x86_64
     operating_systems:
@@ -77,9 +79,10 @@ implementations:
     mldsa/src/cbmc.h mldsa/src/common.h mldsa/src/ct.c mldsa/src/ct.h mldsa/src/debug.c
     mldsa/src/debug.h mldsa/src/native/api.h mldsa/src/native/meta.h mldsa/src/packing.c
     mldsa/src/packing.h mldsa/src/params.h mldsa/src/poly.c mldsa/src/poly.h mldsa/src/poly_kl.c
-    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.h
-    mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h mldsa/src/sign.c
-    mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc mldsa/src/native/aarch64
+    mldsa/src/poly_kl.h mldsa/src/polyvec.c mldsa/src/polyvec.h mldsa/src/polyvec_lazy.c
+    mldsa/src/polyvec_lazy.h mldsa/src/randombytes.h mldsa/src/reduce.h mldsa/src/rounding.h
+    mldsa/src/sign.c mldsa/src/sign.h mldsa/src/symmetric.h mldsa/src/sys.h mldsa/src/zetas.inc
+    mldsa/src/native/aarch64
   supported_platforms:
   - architecture: arm_8
     operating_systems:
diff --git a/integration/opentitan/add_polyvec_lazy.patch b/integration/opentitan/add_polyvec_lazy.patch
index 5f79de60b..d3898adf0 100644
--- a/integration/opentitan/add_polyvec_lazy.patch
+++ b/integration/opentitan/add_polyvec_lazy.patch
@@ -4,10 +4,11 @@ diff --git a/third_party/mldsa_native/BUILD.mldsa_native.bazel b/third_party/mld
 index 8a63d09..1c576c2 100644
 --- a/third_party/mldsa_native/BUILD.mldsa_native.bazel
 +++ b/third_party/mldsa_native/BUILD.mldsa_native.bazel
-@@ -26,6 +26,7 @@ cc_library(
+@@ -26,6 +26,8 @@ cc_library(
          "mldsa/src/poly_kl.h",
          "mldsa/src/polyvec.c",
          "mldsa/src/polyvec.h",
++        "mldsa/src/polyvec_lazy.c",
 +        "mldsa/src/polyvec_lazy.h",
          "mldsa/src/reduce.h",
          "mldsa/src/rounding.h",
diff --git a/integration/opentitan/reduce_alloc.patch b/integration/opentitan/reduce_alloc.patch
index 1dae278af..f346edaa7 100644
--- a/integration/opentitan/reduce_alloc.patch
+++ b/integration/opentitan/reduce_alloc.patch
@@ -1,7 +1,6 @@
 # Copyright (c) The mldsa-native project authors
 # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 diff --git a/sw/device/lib/crypto/include/mldsa.h b/sw/device/lib/crypto/include/mldsa.h
-index be11f20..26351ee 100644
 --- a/sw/device/lib/crypto/include/mldsa.h
 +++ b/sw/device/lib/crypto/include/mldsa.h
 @@ -41,16 +41,16 @@ enum {
@@ -10,20 +9,23 @@ index be11f20..26351ee 100644
    // Work buffer sizes in 32-bit words
 -  kOtcryptoMldsa44WorkBufferKeypairWords = 32992 / sizeof(uint32_t),
 -  kOtcryptoMldsa44WorkBufferSignWords = 32448 / sizeof(uint32_t),
-+  kOtcryptoMldsa44WorkBufferKeypairWords = 28960 / sizeof(uint32_t),
-+  kOtcryptoMldsa44WorkBufferSignWords = 20256 / sizeof(uint32_t),
-   kOtcryptoMldsa44WorkBufferVerifyWords = 22464 / sizeof(uint32_t),
+-  kOtcryptoMldsa44WorkBufferVerifyWords = 22464 / sizeof(uint32_t),
++  kOtcryptoMldsa44WorkBufferKeypairWords = 12576 / sizeof(uint32_t),
++  kOtcryptoMldsa44WorkBufferSignWords = 18208 / sizeof(uint32_t),
++  kOtcryptoMldsa44WorkBufferVerifyWords = 20416 / sizeof(uint32_t),
  
 -  kOtcryptoMldsa65WorkBufferKeypairWords = 46304 / sizeof(uint32_t),
 -  kOtcryptoMldsa65WorkBufferSignWords = 44768 / sizeof(uint32_t),
-+  kOtcryptoMldsa65WorkBufferKeypairWords = 40224 / sizeof(uint32_t),
-+  kOtcryptoMldsa65WorkBufferSignWords = 26432 / sizeof(uint32_t),
-   kOtcryptoMldsa65WorkBufferVerifyWords = 30720 / sizeof(uint32_t),
+-  kOtcryptoMldsa65WorkBufferVerifyWords = 30720 / sizeof(uint32_t),
++  kOtcryptoMldsa65WorkBufferKeypairWords = 15648 / sizeof(uint32_t),
++  kOtcryptoMldsa65WorkBufferSignWords = 23360 / sizeof(uint32_t),
++  kOtcryptoMldsa65WorkBufferVerifyWords = 27648 / sizeof(uint32_t),
  
 -  kOtcryptoMldsa87WorkBufferKeypairWords = 62688 / sizeof(uint32_t),
 -  kOtcryptoMldsa87WorkBufferSignWords = 59104 / sizeof(uint32_t),
-+  kOtcryptoMldsa87WorkBufferKeypairWords = 54560 / sizeof(uint32_t),
-+  kOtcryptoMldsa87WorkBufferSignWords = 34624 / sizeof(uint32_t),
-   kOtcryptoMldsa87WorkBufferVerifyWords = 41216 / sizeof(uint32_t),
+-  kOtcryptoMldsa87WorkBufferVerifyWords = 41216 / sizeof(uint32_t),
++  kOtcryptoMldsa87WorkBufferKeypairWords = 19744 / sizeof(uint32_t),
++  kOtcryptoMldsa87WorkBufferSignWords = 29504 / sizeof(uint32_t),
++  kOtcryptoMldsa87WorkBufferVerifyWords = 36096 / sizeof(uint32_t),
  };
  
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index 0279b10a3..7c959967d 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -65,6 +65,7 @@
 #include "src/poly.c"
 #include "src/poly_kl.c"
 #include "src/polyvec.c"
+#include "src/polyvec_lazy.c"
 #include "src/sign.c"
 
 #if !defined(MLD_CONFIG_FIPS202_CUSTOM_HEADER)
@@ -256,11 +257,13 @@
 #undef mld_memset
 /* mldsa/src/packing.h */
 #undef MLD_PACKING_H
-#undef mld_pack_pk
+#undef mld_pack_pk_t1
 #undef mld_pack_sig_c
 #undef mld_pack_sig_h_poly
 #undef mld_pack_sig_z
-#undef mld_pack_sk
+#undef mld_pack_sk_rho_key_tr_s2
+#undef mld_pack_sk_s1
+#undef mld_pack_sk_t0
 #undef mld_unpack_pk
 #undef mld_unpack_sig
 #undef mld_unpack_sk
@@ -310,22 +313,15 @@
 #undef mld_polyz_unpack
 /* mldsa/src/polyvec.h */
 #undef MLD_POLYVEC_H
-#undef mld_polymat
-#undef mld_polymat_get_row
-#undef mld_polyvec_matrix_expand
-#undef mld_polyvec_matrix_pointwise_montgomery
 #undef mld_polyveck
-#undef mld_polyveck_add
 #undef mld_polyveck_caddq
 #undef mld_polyveck_chknorm
 #undef mld_polyveck_decompose
 #undef mld_polyveck_invntt_tomont
 #undef mld_polyveck_ntt
 #undef mld_polyveck_pack_eta
-#undef mld_polyveck_pack_t0
 #undef mld_polyveck_pack_w1
 #undef mld_polyveck_pointwise_poly_montgomery
-#undef mld_polyveck_power2round
 #undef mld_polyveck_reduce
 #undef mld_polyveck_shiftl
 #undef mld_polyveck_sub
@@ -336,12 +332,26 @@
 #undef mld_polyvecl_chknorm
 #undef mld_polyvecl_ntt
 #undef mld_polyvecl_pack_eta
-#undef mld_polyvecl_pointwise_acc_montgomery
 #undef mld_polyvecl_uniform_gamma1
 #undef mld_polyvecl_unpack_eta
 #undef mld_polyvecl_unpack_z
 /* mldsa/src/polyvec_lazy.h */
 #undef MLD_POLYVEC_LAZY_H
+#undef mld_poly_permute_bitrev_to_custom_optional
+#undef mld_polymat
+#undef mld_polymat_eager
+#undef mld_polymat_get_poly_lazy
+#undef mld_polymat_get_row_eager
+#undef mld_polymat_lazy
+#undef mld_polyvec_matrix_expand
+#undef mld_polyvec_matrix_expand_eager
+#undef mld_polyvec_matrix_expand_lazy
+#undef mld_polyvec_matrix_pointwise_montgomery
+#undef mld_polyvec_matrix_pointwise_montgomery_eager
+#undef mld_polyvec_matrix_pointwise_montgomery_lazy
+#undef mld_polyvecl_pointwise_acc_montgomery
+#undef mld_polyvecl_pointwise_acc_montgomery_eager
+#undef mld_polyvecl_pointwise_acc_montgomery_lazy
 #undef mld_sk_s1hat
 #undef mld_sk_s1hat_eager
 #undef mld_sk_s1hat_get_poly
diff --git a/mldsa/mldsa_native.h b/mldsa/mldsa_native.h
index aa19e39b2..c1612ec09 100644
--- a/mldsa/mldsa_native.h
+++ b/mldsa/mldsa_native.h
@@ -912,37 +912,37 @@ int MLD_API_NAMESPACE(pk_from_sk)(
  */
 /* check-magic: off */
 #if defined(MLD_API_LEGACY_CONFIG) || !defined(MLD_CONFIG_REDUCE_RAM)
-#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 41216
+#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 26880
 #define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 52544
-#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 45248
+#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 27232
 #define MLD_TOTAL_ALLOC_44_SIGN 48800
 #define MLD_TOTAL_ALLOC_44_VERIFY 38816
-#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 65792
+#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 44288
 #define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 79712
-#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 71872
+#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 44640
 #define MLD_TOTAL_ALLOC_65_SIGN 74432
 #define MLD_TOTAL_ALLOC_65_VERIFY 62432
-#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 104704
+#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 75008
 #define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 122624
-#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 112832
+#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 75360
 #define MLD_TOTAL_ALLOC_87_SIGN 115392
 #define MLD_TOTAL_ALLOC_87_VERIFY 99552
 #else /* MLD_API_LEGACY_CONFIG || !MLD_CONFIG_REDUCE_RAM */
-#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 28960
-#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 28960
-#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 32992
-#define MLD_TOTAL_ALLOC_44_SIGN 20256
-#define MLD_TOTAL_ALLOC_44_VERIFY 22464
-#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 40224
-#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 40224
-#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 46304
-#define MLD_TOTAL_ALLOC_65_SIGN 26432
-#define MLD_TOTAL_ALLOC_65_VERIFY 30720
-#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 54560
-#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 54560
-#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 62688
-#define MLD_TOTAL_ALLOC_87_SIGN 34624
-#define MLD_TOTAL_ALLOC_87_VERIFY 41216
+#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 12576
+#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 24160
+#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 12928
+#define MLD_TOTAL_ALLOC_44_SIGN 18208
+#define MLD_TOTAL_ALLOC_44_VERIFY 20416
+#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 15648
+#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 32928
+#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 16000
+#define MLD_TOTAL_ALLOC_65_SIGN 23360
+#define MLD_TOTAL_ALLOC_65_VERIFY 27648
+#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 19744
+#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 43328
+#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 20096
+#define MLD_TOTAL_ALLOC_87_SIGN 29504
+#define MLD_TOTAL_ALLOC_87_VERIFY 36096
 #endif /* !(MLD_API_LEGACY_CONFIG || !MLD_CONFIG_REDUCE_RAM) */
 /* check-magic: on */
 
diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S
index 0558fa621..5498eb3f6 100644
--- a/mldsa/mldsa_native_asm.S
+++ b/mldsa/mldsa_native_asm.S
@@ -261,11 +261,13 @@
 #undef mld_memset
 /* mldsa/src/packing.h */
 #undef MLD_PACKING_H
-#undef mld_pack_pk
+#undef mld_pack_pk_t1
 #undef mld_pack_sig_c
 #undef mld_pack_sig_h_poly
 #undef mld_pack_sig_z
-#undef mld_pack_sk
+#undef mld_pack_sk_rho_key_tr_s2
+#undef mld_pack_sk_s1
+#undef mld_pack_sk_t0
 #undef mld_unpack_pk
 #undef mld_unpack_sig
 #undef mld_unpack_sk
@@ -315,22 +317,15 @@
 #undef mld_polyz_unpack
 /* mldsa/src/polyvec.h */
 #undef MLD_POLYVEC_H
-#undef mld_polymat
-#undef mld_polymat_get_row
-#undef mld_polyvec_matrix_expand
-#undef mld_polyvec_matrix_pointwise_montgomery
 #undef mld_polyveck
-#undef mld_polyveck_add
 #undef mld_polyveck_caddq
 #undef mld_polyveck_chknorm
 #undef mld_polyveck_decompose
 #undef mld_polyveck_invntt_tomont
 #undef mld_polyveck_ntt
 #undef mld_polyveck_pack_eta
-#undef mld_polyveck_pack_t0
 #undef mld_polyveck_pack_w1
 #undef mld_polyveck_pointwise_poly_montgomery
-#undef mld_polyveck_power2round
 #undef mld_polyveck_reduce
 #undef mld_polyveck_shiftl
 #undef mld_polyveck_sub
@@ -341,12 +336,26 @@
 #undef mld_polyvecl_chknorm
 #undef mld_polyvecl_ntt
 #undef mld_polyvecl_pack_eta
-#undef mld_polyvecl_pointwise_acc_montgomery
 #undef mld_polyvecl_uniform_gamma1
 #undef mld_polyvecl_unpack_eta
 #undef mld_polyvecl_unpack_z
 /* mldsa/src/polyvec_lazy.h */
 #undef MLD_POLYVEC_LAZY_H
+#undef mld_poly_permute_bitrev_to_custom_optional
+#undef mld_polymat
+#undef mld_polymat_eager
+#undef mld_polymat_get_poly_lazy
+#undef mld_polymat_get_row_eager
+#undef mld_polymat_lazy
+#undef mld_polyvec_matrix_expand
+#undef mld_polyvec_matrix_expand_eager
+#undef mld_polyvec_matrix_expand_lazy
+#undef mld_polyvec_matrix_pointwise_montgomery
+#undef mld_polyvec_matrix_pointwise_montgomery_eager
+#undef mld_polyvec_matrix_pointwise_montgomery_lazy
+#undef mld_polyvecl_pointwise_acc_montgomery
+#undef mld_polyvecl_pointwise_acc_montgomery_eager
+#undef mld_polyvecl_pointwise_acc_montgomery_lazy
 #undef mld_sk_s1hat
 #undef mld_sk_s1hat_eager
 #undef mld_sk_s1hat_get_poly
diff --git a/mldsa/src/packing.c b/mldsa/src/packing.c
index f9cf275a3..2cbc656d3 100644
--- a/mldsa/src/packing.c
+++ b/mldsa/src/packing.c
@@ -16,25 +16,6 @@
 #define mld_unpack_hints MLD_ADD_PARAM_SET(mld_unpack_hints)
 /* End of parameter set namespacing */
 
-MLD_INTERNAL_API
-void mld_pack_pk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
-                 const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyveck *t1)
-{
-  unsigned int i;
-
-  mld_memcpy(pk, rho, MLDSA_SEEDBYTES);
-  for (i = 0; i < MLDSA_K; ++i)
-  __loop__(
-    assigns(i, memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
-    invariant(i <= MLDSA_K)
-    decreases(MLDSA_K - i)
-  )
-  {
-    mld_polyt1_pack(pk + MLDSA_SEEDBYTES + i * MLDSA_POLYT1_PACKEDBYTES,
-                    &t1->vec[i]);
-  }
-}
-
 MLD_INTERNAL_API
 void mld_unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], mld_polyveck *t1,
                    const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES])
@@ -51,11 +32,18 @@ void mld_unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], mld_polyveck *t1,
 }
 
 MLD_INTERNAL_API
-void mld_pack_sk(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
-                 const uint8_t rho[MLDSA_SEEDBYTES],
-                 const uint8_t tr[MLDSA_TRBYTES],
-                 const uint8_t key[MLDSA_SEEDBYTES], const mld_polyveck *t0,
-                 const mld_polyvecl *s1, const mld_polyveck *s2)
+void mld_pack_sk_s1(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
+                    const mld_polyvecl *s1)
+{
+  mld_polyvecl_pack_eta(sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES, s1);
+}
+
+MLD_INTERNAL_API
+void mld_pack_sk_rho_key_tr_s2(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
+                               const uint8_t rho[MLDSA_SEEDBYTES],
+                               const uint8_t tr[MLDSA_TRBYTES],
+                               const uint8_t key[MLDSA_SEEDBYTES],
+                               const mld_polyveck *s2)
 {
   mld_memcpy(sk, rho, MLDSA_SEEDBYTES);
   sk += MLDSA_SEEDBYTES;
@@ -66,13 +54,30 @@ void mld_pack_sk(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
   mld_memcpy(sk, tr, MLDSA_TRBYTES);
   sk += MLDSA_TRBYTES;
 
-  mld_polyvecl_pack_eta(sk, s1);
+  /* s1 already packed via mld_pack_sk_s1 */
   sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES;
 
   mld_polyveck_pack_eta(sk, s2);
-  sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES;
 
-  mld_polyveck_pack_t0(sk, t0);
+  /* t0 packed per row by the caller via mld_pack_sk_t0 */
+}
+
+MLD_INTERNAL_API
+void mld_pack_sk_t0(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], unsigned int k,
+                    const mld_poly *t0k)
+{
+  mld_polyt0_pack(sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES +
+                      MLDSA_L * MLDSA_POLYETA_PACKEDBYTES +
+                      MLDSA_K * MLDSA_POLYETA_PACKEDBYTES +
+                      k * MLDSA_POLYT0_PACKEDBYTES,
+                  t0k);
+}
+
+MLD_INTERNAL_API
+void mld_pack_pk_t1(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], unsigned int k,
+                    const mld_poly *t1k)
+{
+  mld_polyt1_pack(pk + MLDSA_SEEDBYTES + k * MLDSA_POLYT1_PACKEDBYTES, t1k);
 }
 
 MLD_INTERNAL_API
diff --git a/mldsa/src/packing.h b/mldsa/src/packing.h
index 0479c09a9..2afcd38ea 100644
--- a/mldsa/src/packing.h
+++ b/mldsa/src/packing.h
@@ -8,66 +8,103 @@
 #include "polyvec.h"
 #include "polyvec_lazy.h"
 
-#define mld_pack_pk MLD_NAMESPACE_KL(pack_pk)
+
+#define mld_pack_sk_s1 MLD_NAMESPACE_KL(pack_sk_s1)
 /*************************************************
- * Name:        mld_pack_pk
+ * Name:        mld_pack_sk_s1
  *
- * Description: Bit-pack public key pk = (rho, t1).
+ * Description: Bit-pack the s1 component into the secret key.
  *
- * Arguments:   - uint8_t pk[]: output byte array
- *              - const uint8_t rho[]: byte array containing rho
- *              - const mld_polyveck *t1: pointer to vector t1
+ * Arguments:   - uint8_t sk[]: output byte array
+ *              - const mld_polyvecl *s1: pointer to vector s1
  **************************************************/
 MLD_INTERNAL_API
-void mld_pack_pk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
-                 const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyveck *t1)
+void mld_pack_sk_s1(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
+                    const mld_polyvecl *s1)
 __contract__(
-  requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
-  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
-  requires(memory_no_alias(t1, sizeof(mld_polyveck)))
-  requires(forall(k0, 0, MLDSA_K,
-    array_bound(t1->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10)))
-  assigns(memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES))
+  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
+  requires(forall(k1, 0, MLDSA_L,
+    array_abs_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+  assigns(memory_slice(sk, MLDSA_CRYPTO_SECRETKEYBYTES))
 );
 
-
-#define mld_pack_sk MLD_NAMESPACE_KL(pack_sk)
+#define mld_pack_sk_rho_key_tr_s2 MLD_NAMESPACE_KL(pack_sk_rho_key_tr_s2)
 /*************************************************
- * Name:        mld_pack_sk
+ * Name:        mld_pack_sk_rho_key_tr_s2
  *
- * Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2).
+ * Description: Bit-pack rho, key, tr, s2 into the secret key.
+ *              s1 must already be packed via mld_pack_sk_s1.
+ *              t0 is packed per row by the caller (see
+ *              mld_pack_sk_t0).
  *
  * Arguments:   - uint8_t sk[]: output byte array
  *              - const uint8_t rho[]: byte array containing rho
  *              - const uint8_t tr[]: byte array containing tr
  *              - const uint8_t key[]: byte array containing key
- *              - const mld_polyveck *t0: pointer to vector t0
- *              - const mld_polyvecl *s1: pointer to vector s1
  *              - const mld_polyveck *s2: pointer to vector s2
  **************************************************/
 MLD_INTERNAL_API
-void mld_pack_sk(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
-                 const uint8_t rho[MLDSA_SEEDBYTES],
-                 const uint8_t tr[MLDSA_TRBYTES],
-                 const uint8_t key[MLDSA_SEEDBYTES], const mld_polyveck *t0,
-                 const mld_polyvecl *s1, const mld_polyveck *s2)
+void mld_pack_sk_rho_key_tr_s2(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
+                               const uint8_t rho[MLDSA_SEEDBYTES],
+                               const uint8_t tr[MLDSA_TRBYTES],
+                               const uint8_t key[MLDSA_SEEDBYTES],
+                               const mld_polyveck *s2)
 __contract__(
   requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES))
   requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
   requires(memory_no_alias(tr, MLDSA_TRBYTES))
   requires(memory_no_alias(key, MLDSA_SEEDBYTES))
-  requires(memory_no_alias(t0, sizeof(mld_polyveck)))
-  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
   requires(memory_no_alias(s2, sizeof(mld_polyveck)))
-  requires(forall(k0, 0, MLDSA_K,
-    array_bound(t0->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
-  requires(forall(k1, 0, MLDSA_L,
-    array_abs_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
   requires(forall(k2, 0, MLDSA_K,
     array_abs_bound(s2->vec[k2].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
   assigns(memory_slice(sk, MLDSA_CRYPTO_SECRETKEYBYTES))
 );
 
+#define mld_pack_sk_t0 MLD_NAMESPACE_KL(pack_sk_t0)
+/*************************************************
+ * Name:        mld_pack_sk_t0
+ *
+ * Description: Bit-pack a single t0 polynomial t0[k] at the
+ *              corresponding offset in the secret key.
+ *
+ * Arguments:   - uint8_t sk[]: output byte array (full secret key)
+ *              - unsigned int k: row index, must be < MLDSA_K
+ *              - const mld_poly *t0k: pointer to t0[k]
+ **************************************************/
+MLD_INTERNAL_API
+void mld_pack_sk_t0(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], unsigned int k,
+                    const mld_poly *t0k)
+__contract__(
+  requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES))
+  requires(memory_no_alias(t0k, sizeof(mld_poly)))
+  requires(k < MLDSA_K)
+  requires(array_bound(t0k->coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))
+  assigns(memory_slice(sk, MLDSA_CRYPTO_SECRETKEYBYTES))
+);
+
+#define mld_pack_pk_t1 MLD_NAMESPACE_KL(pack_pk_t1)
+/*************************************************
+ * Name:        mld_pack_pk_t1
+ *
+ * Description: Bit-pack a single t1 polynomial t1[k] at the
+ *              corresponding offset in the public key.
+ *
+ * Arguments:   - uint8_t pk[]: output byte array (full public key)
+ *              - unsigned int k: row index, must be < MLDSA_K
+ *              - const mld_poly *t1k: pointer to t1[k]
+ **************************************************/
+MLD_INTERNAL_API
+void mld_pack_pk_t1(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], unsigned int k,
+                    const mld_poly *t1k)
+__contract__(
+  requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(t1k, sizeof(mld_poly)))
+  requires(k < MLDSA_K)
+  requires(array_bound(t1k->coeffs, 0, MLDSA_N, 0, 1 << 10))
+  assigns(memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
+);
+
 
 #define mld_pack_sig_c MLD_NAMESPACE_KL(pack_sig_c)
 /*************************************************
diff --git a/mldsa/src/poly.c b/mldsa/src/poly.c
index b7609fc37..0b71f7455 100644
--- a/mldsa/src/poly.c
+++ b/mldsa/src/poly.c
@@ -508,6 +508,7 @@ void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a)
   __loop__(
     assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly)))
     invariant(i <= MLDSA_N)
+    invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0]))
     invariant(array_bound(a0->coeffs, 0, i, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1))
     invariant(array_bound(a1->coeffs, 0, i, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1))
     decreases(MLDSA_N - i)
diff --git a/mldsa/src/poly.h b/mldsa/src/poly.h
index 17f54f980..fb34103ad 100644
--- a/mldsa/src/poly.h
+++ b/mldsa/src/poly.h
@@ -213,7 +213,7 @@ void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a)
 __contract__(
   requires(memory_no_alias(a0, sizeof(mld_poly)))
   requires(memory_no_alias(a1, sizeof(mld_poly)))
-  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(a0 == a)
   requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
   assigns(memory_slice(a1, sizeof(mld_poly)))
   assigns(memory_slice(a0, sizeof(mld_poly)))
diff --git a/mldsa/src/polyvec.c b/mldsa/src/polyvec.c
index 652ca2e38..a2e8bf08d 100644
--- a/mldsa/src/polyvec.c
+++ b/mldsa/src/polyvec.c
@@ -15,236 +15,7 @@
 #include "polyvec.h"
 
 #include "debug.h"
-
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mld_polymat_permute_bitrev_to_custom \
-  MLD_ADD_PARAM_SET(mld_polymat_permute_bitrev_to_custom)
-#define mld_polyvecl_permute_bitrev_to_custom \
-  MLD_ADD_PARAM_SET(mld_polyvecl_permute_bitrev_to_custom)
-#define mld_polyvecl_pointwise_acc_montgomery_c \
-  MLD_ADD_PARAM_SET(mld_polyvecl_pointwise_acc_montgomery_c)
-
-#if !defined(MLD_CONFIG_REDUCE_RAM)
-/* Helper function to ensure that the polynomial entries in the output
- * of mld_polyvec_matrix_expand use the standard (bitreversed) ordering
- * of coefficients.
- * No-op unless a native backend with a custom ordering is used.
- */
-
-static void mld_polyvecl_permute_bitrev_to_custom(mld_polyvecl *v)
-__contract__(
-  /* We don't specify that this should be a permutation, but only
-   * that it does not change the bound established at the end of
-   * mld_polyvec_matrix_expand. 
-   */
-  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
-  requires(forall(x, 0, MLDSA_L,
-    array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
-  assigns(memory_slice(v, sizeof(mld_polyvecl)))
-  ensures(forall(x, 0, MLDSA_L,
-    array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-{
-#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
-  unsigned i;
-  for (i = 0; i < MLDSA_L; i++)
-  __loop__(
-     assigns(i, memory_slice(v, sizeof(mld_polyvecl)))
-     invariant(i <= MLDSA_L)
-     invariant(forall(x, 0, MLDSA_L,
-       array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
-     decreases(MLDSA_L - i))
-  {
-    mld_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
-  }
-#else  /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
-  /* Nothing to do */
-  (void)v;
-#endif /* !MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
-}
-
-static void mld_polymat_permute_bitrev_to_custom(mld_polymat *mat)
-__contract__(
-  /* We don't specify that this should be a permutation, but only
-   * that it does not change the bound established at the end of
-   * mld_polyvec_matrix_expand.
-   */
-  requires(memory_no_alias(mat, sizeof(mld_polymat)))
-  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
-    array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-  assigns(memory_slice(mat, sizeof(mld_polymat)))
-  ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
-    array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-)
-{
-  unsigned int i;
-  for (i = 0; i < MLDSA_K; i++)
-  __loop__(
-    assigns(i, memory_slice(mat, sizeof(mld_polymat)))
-    invariant(i <= MLDSA_K)
-    invariant(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
-      array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-    decreases(MLDSA_K - i))
-  {
-    mld_polyvecl_permute_bitrev_to_custom(&mat->vec[i]);
-  }
-}
-#endif /* !MLD_CONFIG_REDUCE_RAM */
-
-MLD_INTERNAL_API
-const mld_polyvecl *mld_polymat_get_row(mld_polymat *mat, unsigned int row)
-{
-#if defined(MLD_CONFIG_REDUCE_RAM)
-  unsigned int i;
-  MLD_ALIGN uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
-
-  mld_memcpy(seed_ext, mat->rho, MLDSA_SEEDBYTES);
-
-  /* Generate row on-demand */
-  for (i = 0; i < MLDSA_L; i++)
-  {
-    uint8_t x = (uint8_t)row;
-    uint8_t y = (uint8_t)i;
-
-    seed_ext[MLDSA_SEEDBYTES + 0] = y;
-    seed_ext[MLDSA_SEEDBYTES + 1] = x;
-
-    mld_poly_uniform(&mat->row_buffer.vec[i], seed_ext);
-
-#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
-    mld_poly_permute_bitrev_to_custom(mat->row_buffer.vec[i].coeffs);
-#endif
-  }
-
-  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
-  mld_zeroize(seed_ext, sizeof(seed_ext));
-
-  return &mat->row_buffer;
-#else  /* MLD_CONFIG_REDUCE_RAM */
-  return &mat->vec[row];
-#endif /* !MLD_CONFIG_REDUCE_RAM */
-}
-
-MLD_INTERNAL_API
-void mld_polyvec_matrix_expand(mld_polymat *mat,
-                               const uint8_t rho[MLDSA_SEEDBYTES])
-{
-#if defined(MLD_CONFIG_REDUCE_RAM)
-  /* In REDUCE_RAM mode, just copy the seed for later on-demand generation */
-  mld_memcpy(mat->rho, rho, MLDSA_SEEDBYTES);
-#else
-  unsigned int i, j;
-  /*
-   * We generate four separate seed arrays rather than a single one to work
-   * around limitations in CBMC function contracts dealing with disjoint slices
-   * of the same parent object.
-   */
-
-  MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
-
-  for (j = 0; j < 4; j++)
-  __loop__(
-    assigns(j, object_whole(seed_ext))
-    invariant(j <= 4)
-    decreases(4 - j)
-  )
-  {
-    mld_memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES);
-  }
-
-#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
-  /* Sample 4 matrix entries a time. */
-  for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4)
-  __loop__(
-    assigns(i, j, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat)))
-    invariant(i <= (MLDSA_K * MLDSA_L / 4) * 4 && i % 4 == 0)
-    /* vectors 0 .. i / MLDSA_L are completely sampled */
-    invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
-      array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-    /* last vector is sampled up to i % MLDSA_L */
-    invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L,
-      array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-    decreases((MLDSA_K * MLDSA_L / 4) * 4 - i)
-  )
-  {
-    for (j = 0; j < 4; j++)
-    __loop__(
-      assigns(j, object_whole(seed_ext))
-      invariant(j <= 4)
-      decreases(4 - j)
-    )
-    {
-      uint8_t x = (uint8_t)((i + j) / MLDSA_L);
-      uint8_t y = (uint8_t)((i + j) % MLDSA_L);
-
-      seed_ext[j][MLDSA_SEEDBYTES + 0] = y;
-      seed_ext[j][MLDSA_SEEDBYTES + 1] = x;
-    }
-
-    mld_poly_uniform_4x(&mat->vec[i / MLDSA_L].vec[i % MLDSA_L],
-                        &mat->vec[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L],
-                        &mat->vec[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L],
-                        &mat->vec[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L],
-                        seed_ext);
-  }
-#else  /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
-  i = 0;
-#endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */
-
-  /* Entries omitted by the batch-sampling are sampled individually. */
-  while (i < MLDSA_K * MLDSA_L)
-  __loop__(
-    assigns(i, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat)))
-    invariant(i <= MLDSA_K * MLDSA_L)
-    /* vectors 0 .. i / MLDSA_L are completely sampled */
-    invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
-      array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-    /* last vector is sampled up to i % MLDSA_L */
-    invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L,
-      array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-    decreases(MLDSA_K * MLDSA_L - i)
-  )
-  {
-    uint8_t x = (uint8_t)(i / MLDSA_L);
-    uint8_t y = (uint8_t)(i % MLDSA_L);
-    mld_poly *this_poly = &mat->vec[i / MLDSA_L].vec[i % MLDSA_L];
-
-    seed_ext[0][MLDSA_SEEDBYTES + 0] = y;
-    seed_ext[0][MLDSA_SEEDBYTES + 1] = x;
-
-    mld_poly_uniform(this_poly, seed_ext[0]);
-    i++;
-  }
-
-  mld_polymat_permute_bitrev_to_custom(mat);
-
-  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
-  mld_zeroize(seed_ext, sizeof(seed_ext));
-#endif /* !MLD_CONFIG_REDUCE_RAM */
-}
-
-MLD_INTERNAL_API
-void mld_polyvec_matrix_pointwise_montgomery(mld_polyveck *t, mld_polymat *mat,
-                                             const mld_polyvecl *v)
-{
-  unsigned int i;
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-
-  for (i = 0; i < MLDSA_K; ++i)
-  __loop__(
-    assigns(i, memory_slice(t, sizeof(mld_polyveck)))
-    invariant(i <= MLDSA_K)
-    invariant(forall(k0, 0, i,
-                     array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
-    decreases(MLDSA_K - i)
-  )
-  {
-    const mld_polyvecl *row = mld_polymat_get_row(mat, i);
-    mld_polyvecl_pointwise_acc_montgomery(&t->vec[i], row, v);
-  }
-
-  mld_assert_abs_bound_2d(t->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
-}
+#include "polyvec_lazy.h"
 
 /**************************************************************/
 /************ Vectors of polynomials of length MLDSA_L **************/
@@ -312,112 +83,6 @@ void mld_polyvecl_ntt(mld_polyvecl *v)
   mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
 }
 
-MLD_STATIC_TESTABLE void mld_polyvecl_pointwise_acc_montgomery_c(
-    mld_poly *w, const mld_polyvecl *u, const mld_polyvecl *v)
-__contract__(
-  requires(memory_no_alias(w, sizeof(mld_poly)))
-  requires(memory_no_alias(u, sizeof(mld_polyvecl)))
-  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
-  requires(forall(l0, 0, MLDSA_L,
-                  array_bound(u->vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
-  requires(forall(l1, 0, MLDSA_L,
-    array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
-  assigns(memory_slice(w, sizeof(mld_poly)))
-  ensures(array_abs_bound(w->coeffs, 0, MLDSA_N, MLDSA_Q))
-)
-{
-  unsigned int i, j;
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  for (i = 0; i < MLDSA_N; i++)
-  __loop__(
-    assigns(i, j, memory_slice(w, sizeof(mld_poly)))
-    invariant(i <= MLDSA_N)
-    invariant(array_abs_bound(w->coeffs, 0, i, MLDSA_Q))
-    decreases(MLDSA_N - i)
-  )
-  {
-    int64_t t = 0;
-    int32_t r;
-    for (j = 0; j < MLDSA_L; j++)
-    __loop__(
-      assigns(j, t)
-      invariant(j <= MLDSA_L)
-      invariant(t >= -(int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
-      invariant(t <= (int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
-      decreases(MLDSA_L - j)
-    )
-    {
-      t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i];
-    }
-
-    r = mld_montgomery_reduce(t);
-    w->coeffs[i] = r;
-  }
-
-  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
-}
-
-MLD_INTERNAL_API
-void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
-                                           const mld_polyvecl *v)
-{
-#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
-    MLD_CONFIG_PARAMETER_SET == 44
-  int ret;
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  ret = mld_polyvecl_pointwise_acc_montgomery_l4_native(
-      w->coeffs, (const int32_t (*)[MLDSA_N])u->vec,
-      (const int32_t (*)[MLDSA_N])v->vec);
-  if (ret == MLD_NATIVE_FUNC_SUCCESS)
-  {
-    mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
-    return;
-  }
-#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) && \
-    MLD_CONFIG_PARAMETER_SET == 65
-  int ret;
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  ret = mld_polyvecl_pointwise_acc_montgomery_l5_native(
-      w->coeffs, (const int32_t (*)[MLDSA_N])u->vec,
-      (const int32_t (*)[MLDSA_N])v->vec);
-  if (ret == MLD_NATIVE_FUNC_SUCCESS)
-  {
-    mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
-    return;
-  }
-#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) && \
-    MLD_CONFIG_PARAMETER_SET == 87
-  int ret;
-  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
-  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
-  ret = mld_polyvecl_pointwise_acc_montgomery_l7_native(
-      w->coeffs, (const int32_t (*)[MLDSA_N])u->vec,
-      (const int32_t (*)[MLDSA_N])v->vec);
-  if (ret == MLD_NATIVE_FUNC_SUCCESS)
-  {
-    mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
-    return;
-  }
-#endif /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
-          MLD_CONFIG_PARAMETER_SET == 44) &&                       \
-          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
-          MLD_CONFIG_PARAMETER_SET == 65) &&                       \
-          MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 &&   \
-          MLD_CONFIG_PARAMETER_SET == 87 */
-  /* The first input is bounded by [0, Q-1] inclusive
-   * The second input is bounded by [-9Q+1, 9Q-1] inclusive . Hence, we can
-   * safely accumulate in 64-bits without intermediate reductions as
-   * MLDSA_L * (MLD_NTT_BOUND-1) * (Q-1) < INT64_MAX
-   *
-   * The worst case is ML-DSA-87: 7 * (9Q-1) * (Q-1) < 2**52
-   * (and likewise for negative values)
-   */
-  mld_polyvecl_pointwise_acc_montgomery_c(w, u, v);
-}
-
 MLD_INTERNAL_API
 uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t bound)
 {
@@ -490,29 +155,6 @@ void mld_polyveck_caddq(mld_polyveck *v)
   mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
 }
 
-/* Reference: We use destructive version (output=first input) to avoid
- *            reasoning about aliasing in the CBMC specification */
-MLD_INTERNAL_API
-void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v)
-{
-  unsigned int i;
-
-  for (i = 0; i < MLDSA_K; ++i)
-  __loop__(
-    assigns(i, memory_slice(u, sizeof(mld_polyveck)))
-    invariant(i <= MLDSA_K)
-    invariant(forall(k0, i, MLDSA_K,
-              forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1])))
-    invariant(forall(k6, 0, i, array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX)))
-    decreases(MLDSA_K - i)
-  )
-  {
-    mld_poly_add(&u->vec[i], &v->vec[i]);
-  }
-  mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, INT32_MIN,
-                      MLD_REDUCE32_DOMAIN_MAX);
-}
-
 MLD_INTERNAL_API
 void mld_polyveck_sub(mld_polyveck *u, const mld_polyveck *v)
 {
@@ -644,31 +286,6 @@ uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t bound)
   return t;
 }
 
-MLD_INTERNAL_API
-void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0,
-                              const mld_polyveck *v)
-{
-  unsigned int i;
-  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
-
-  for (i = 0; i < MLDSA_K; ++i)
-  __loop__(
-    assigns(i, memory_slice(v0, sizeof(mld_polyveck)), memory_slice(v1, sizeof(mld_polyveck)))
-    invariant(i <= MLDSA_K)
-    invariant(forall(k1, 0, i, array_bound(v0->vec[k1].coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1)))
-    invariant(forall(k2, 0, i, array_bound(v1->vec[k2].coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1)))
-    decreases(MLDSA_K - i)
-  )
-  {
-    mld_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
-  }
-
-  mld_assert_bound_2d(v0->vec, MLDSA_K, MLDSA_N, -(MLD_2_POW_D / 2) + 1,
-                      (MLD_2_POW_D / 2) + 1);
-  mld_assert_bound_2d(v1->vec, MLDSA_K, MLDSA_N, 0,
-                      ((MLDSA_Q - 1) / MLD_2_POW_D) + 1);
-}
-
 MLD_INTERNAL_API
 void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0)
 {
@@ -774,24 +391,6 @@ void mld_polyvecl_pack_eta(uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES],
   }
 }
 
-MLD_INTERNAL_API
-void mld_polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES],
-                          const mld_polyveck *p)
-{
-  unsigned int i;
-  mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1,
-                      (1 << (MLDSA_D - 1)) + 1);
-  for (i = 0; i < MLDSA_K; ++i)
-  __loop__(
-    assigns(i, memory_slice(r, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES))
-    invariant(i <= MLDSA_K)
-    decreases(MLDSA_K - i)
-  )
-  {
-    mld_polyt0_pack(&r[i * MLDSA_POLYT0_PACKEDBYTES], &p->vec[i]);
-  }
-}
-
 MLD_INTERNAL_API
 void mld_polyvecl_unpack_eta(
     mld_polyvecl *p, const uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES])
@@ -834,6 +433,7 @@ void mld_polyveck_unpack_eta(
                       MLDSA_ETA + 1);
 }
 
+#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
 MLD_INTERNAL_API
 void mld_polyveck_unpack_t0(mld_polyveck *p,
                             const uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES])
@@ -847,9 +447,4 @@ void mld_polyveck_unpack_t0(mld_polyveck *p,
   mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1,
                       (1 << (MLDSA_D - 1)) + 1);
 }
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mld_polymat_permute_bitrev_to_custom
-#undef mld_polyvecl_permute_bitrev_to_custom
-#undef mld_polyvecl_pointwise_acc_montgomery_c
+#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
diff --git a/mldsa/src/polyvec.h b/mldsa/src/polyvec.h
index d9932e928..4212b078e 100644
--- a/mldsa/src/polyvec.h
+++ b/mldsa/src/polyvec.h
@@ -16,7 +16,6 @@
  * within a single compilation unit. */
 #define mld_polyvecl MLD_ADD_PARAM_SET(mld_polyvecl)
 #define mld_polyveck MLD_ADD_PARAM_SET(mld_polyveck)
-#define mld_polymat MLD_ADD_PARAM_SET(mld_polymat)
 /* End of parameter set namespacing */
 
 /* Vectors of polynomials of length MLDSA_L */
@@ -70,45 +69,6 @@ __contract__(
   ensures(forall(k1, 0, MLDSA_L, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
 );
 
-#define mld_polyvecl_pointwise_acc_montgomery \
-  MLD_NAMESPACE_KL(polyvecl_pointwise_acc_montgomery)
-/*************************************************
- * Name:        mld_polyvecl_pointwise_acc_montgomery
- *
- * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
- *              multiply resulting vector by 2^{-32} and add (accumulate)
- *              polynomials in it.
- *              Input/output vectors are in NTT domain representation.
- *
- *              The first input "u" must be the output of
- *              polyvec_matrix_expand() and so have coefficients in [0, Q-1]
- *              inclusive.
- *
- *              The second input "v" is assumed to be output of an NTT, and
- *              hence must have coefficients bounded by [-9q+1, +9q-1]
- *              inclusive.
- *
- *
- * Arguments:   - mld_poly *w: output polynomial
- *              - const mld_polyvecl *u: pointer to first input vector
- *              - const mld_polyvecl *v: pointer to second input vector
- **************************************************/
-MLD_INTERNAL_API
-void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
-                                           const mld_polyvecl *v)
-__contract__(
-  requires(memory_no_alias(w, sizeof(mld_poly)))
-  requires(memory_no_alias(u, sizeof(mld_polyvecl)))
-  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
-  requires(forall(l0, 0, MLDSA_L,
-                  array_bound(u->vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
-  requires(forall(l1, 0, MLDSA_L,
-    array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
-  assigns(memory_slice(w, sizeof(mld_poly)))
-  ensures(array_abs_bound(w->coeffs, 0, MLDSA_N, MLDSA_Q))
-);
-
-
 #define mld_polyvecl_chknorm MLD_NAMESPACE_KL(polyvecl_chknorm)
 /*************************************************
  * Name:        mld_polyvecl_chknorm
@@ -140,17 +100,6 @@ typedef struct
   mld_poly vec[MLDSA_K];
 } mld_polyveck;
 
-/* Matrix of polynomials (K x L) */
-typedef struct
-{
-#if defined(MLD_CONFIG_REDUCE_RAM)
-  mld_polyvecl row_buffer;
-  uint8_t rho[MLDSA_SEEDBYTES];
-#else
-  mld_polyvecl vec[MLDSA_K];
-#endif
-} mld_polymat;
-
 #define mld_polyveck_reduce MLD_NAMESPACE_KL(polyveck_reduce)
 /*************************************************
  * Name:        polyveck_reduce
@@ -192,31 +141,6 @@ __contract__(
     array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
 );
 
-#define mld_polyveck_add MLD_NAMESPACE_KL(polyveck_add)
-/*************************************************
- * Name:        mld_polyveck_add
- *
- * Description: Add vectors of polynomials of length MLDSA_K.
- *              No modular reduction is performed.
- *
- * Arguments:   - mld_polyveck *u: pointer to input-output vector of polynomials
- *                                 to be added to
- *              - const mld_polyveck *v: pointer to second input vector of
- *                                       polynomials
- **************************************************/
-MLD_INTERNAL_API
-void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v)
-__contract__(
-  requires(memory_no_alias(u, sizeof(mld_polyveck)))
-  requires(memory_no_alias(v, sizeof(mld_polyveck)))
-  requires(forall(p0, 0, MLDSA_K, array_abs_bound(u->vec[p0].coeffs, 0, MLDSA_N, MLD_INTT_BOUND)))
-  requires(forall(p1, 0, MLDSA_K,
-    array_bound(v->vec[p1].coeffs, 0, MLDSA_N, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX)))
-  assigns(memory_slice(u, sizeof(mld_polyveck)))
-  ensures(forall(q2, 0, MLDSA_K,
-                array_bound(u->vec[q2].coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX)))
-);
-
 #define mld_polyveck_sub MLD_NAMESPACE_KL(polyveck_sub)
 /*************************************************
  * Name:        mld_polyveck_sub
@@ -348,35 +272,6 @@ __contract__(
   ensures((return_value == 0) == forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, B)))
 );
 
-#define mld_polyveck_power2round MLD_NAMESPACE_KL(polyveck_power2round)
-/*************************************************
- * Name:        mld_polyveck_power2round
- *
- * Description: For all coefficients a of polynomials in vector of length
- *MLDSA_K, compute a0, a1 such that a mod^+ MLDSA_Q = a1*2^MLDSA_D + a0 with
- *-2^{MLDSA_D-1} < a0 <= 2^{MLDSA_D-1}. Assumes coefficients to be standard
- *representatives.
- *
- * Arguments:   - mld_polyveck *v1: pointer to output vector of polynomials with
- *                              coefficients a1
- *              - mld_polyveck *v0: pointer to output vector of polynomials with
- *                              coefficients a0
- *              - const mld_polyveck *v: pointer to input vector
- **************************************************/
-MLD_INTERNAL_API
-void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0,
-                              const mld_polyveck *v)
-__contract__(
-  requires(memory_no_alias(v1, sizeof(mld_polyveck)))
-  requires(memory_no_alias(v0, sizeof(mld_polyveck)))
-  requires(memory_no_alias(v, sizeof(mld_polyveck)))
-  requires(forall(k0, 0, MLDSA_K, array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
-  assigns(memory_slice(v1, sizeof(mld_polyveck)))
-  assigns(memory_slice(v0, sizeof(mld_polyveck)))
-  ensures(forall(k1, 0, MLDSA_K, array_bound(v0->vec[k1].coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1)))
-  ensures(forall(k2, 0, MLDSA_K, array_bound(v1->vec[k2].coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1)))
-);
-
 #define mld_polyveck_decompose MLD_NAMESPACE_KL(polyveck_decompose)
 /*************************************************
  * Name:        mld_polyveck_decompose
@@ -507,28 +402,6 @@ __contract__(
   assigns(memory_slice(r, MLDSA_L * MLDSA_POLYETA_PACKEDBYTES))
 );
 
-#define mld_polyveck_pack_t0 MLD_NAMESPACE_KL(polyveck_pack_t0)
-/*************************************************
- * Name:        mld_polyveck_pack_t0
- *
- * Description: Bit-pack polynomial vector to with coefficients in
- *              ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}].
- *
- * Arguments:   - uint8_t *r: pointer to output byte array with
- *                            MLDSA_K * MLDSA_POLYT0_PACKEDBYTES bytes
- *              - const mld_poly *p: pointer to input polynomial vector
- **************************************************/
-MLD_INTERNAL_API
-void mld_polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES],
-                          const mld_polyveck *p)
-__contract__(
-  requires(memory_no_alias(r,  MLDSA_K * MLDSA_POLYT0_PACKEDBYTES))
-  requires(memory_no_alias(p, sizeof(mld_polyveck)))
-  requires(forall(k0, 0, MLDSA_K,
-    array_bound(p->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
-  assigns(memory_slice(r, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES))
-);
-
 #define mld_polyvecl_unpack_eta MLD_NAMESPACE_KL(polyvecl_unpack_eta)
 /*************************************************
  * Name:        mld_polyvecl_unpack_eta
@@ -595,6 +468,7 @@ __contract__(
     array_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
 );
 
+#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
 #define mld_polyveck_unpack_t0 MLD_NAMESPACE_KL(polyveck_unpack_t0)
 /*************************************************
  * Name:        mld_polyveck_unpack_t0
@@ -616,85 +490,6 @@ __contract__(
   ensures(forall(k1, 0, MLDSA_K,
     array_bound(p->vec[k1].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
 );
-
-
-
-#define mld_polymat_get_row MLD_NAMESPACE_KL(polymat_get_row)
-/*************************************************
- * Name:        mld_polymat_get_row
- *
- * Description: Retrieve a pointer to a specific row of the matrix.
- *              In MLD_CONFIG_REDUCE_RAM mode, generates the row on-demand.
- *
- * Arguments:   - mld_polymat *mat: pointer to matrix
- *              - unsigned int row: row index (must be < MLDSA_K)
- *
- * Returns pointer to the row (mld_polyvecl)
- **************************************************/
-MLD_INTERNAL_API
-MLD_MUST_CHECK_RETURN_VALUE
-const mld_polyvecl *mld_polymat_get_row(mld_polymat *mat, unsigned int row);
-
-#define mld_polyvec_matrix_expand MLD_NAMESPACE_KL(polyvec_matrix_expand)
-/*************************************************
- * Name:        mld_polyvec_matrix_expand
- *
- * Description: Implementation of ExpandA. Generates matrix A with uniformly
- *              random coefficients a_{i,j} by performing rejection
- *              sampling on the output stream of SHAKE128(rho|j|i)
- *
- * Arguments:   - mld_polymat *mat: pointer to output matrix
- *              - const uint8_t rho[]: byte array containing seed rho
- **************************************************/
-MLD_INTERNAL_API
-void mld_polyvec_matrix_expand(mld_polymat *mat,
-                               const uint8_t rho[MLDSA_SEEDBYTES])
-__contract__(
-  requires(memory_no_alias(mat, sizeof(mld_polymat)))
-  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
-  assigns(memory_slice(mat, sizeof(mld_polymat)))
-  ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
-    array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-);
-
-#define mld_polyvec_matrix_pointwise_montgomery \
-  MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery)
-/*************************************************
- * Name:        mld_polyvec_matrix_pointwise_montgomery
- *
- * Description: Compute matrix-vector multiplication in NTT domain with
- *              pointwise multiplication and multiplication by 2^{-32}.
- *              Input matrix and vector must be in NTT domain representation.
- *
- *              The first input "mat" must be the output of
- *              polyvec_matrix_expand() and so have coefficients in [0, Q-1]
- *              inclusive.
- *
- *              The second input "v" is assumed to be output of an NTT, and
- *              hence must have coefficients bounded by [-9q+1, +9q-1]
- *              inclusive.
- *
- *              Note: In MLD_CONFIG_REDUCE_RAM mode, mat cannot be const
- *              as rows are generated on-demand.
- *
- * Arguments:   - mld_polyveck *t: pointer to output vector t
- *              - mld_polymat *mat: pointer to input matrix
- *              - const mld_polyvecl *v: pointer to input vector v
- **************************************************/
-MLD_INTERNAL_API
-void mld_polyvec_matrix_pointwise_montgomery(mld_polyveck *t, mld_polymat *mat,
-                                             const mld_polyvecl *v)
-__contract__(
-  requires(memory_no_alias(t, sizeof(mld_polyveck)))
-  requires(memory_no_alias(mat, sizeof(mld_polymat)))
-  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
-  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
-                                         array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
-  requires(forall(l1, 0, MLDSA_L,
-                  array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
-  assigns(memory_slice(t, sizeof(mld_polyveck)))
-  ensures(forall(k0, 0, MLDSA_K,
-                 array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
-);
+#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
 
 #endif /* !MLD_POLYVEC_H */
diff --git a/mldsa/src/polyvec_lazy.c b/mldsa/src/polyvec_lazy.c
new file mode 100644
index 000000000..6f2a43c52
--- /dev/null
+++ b/mldsa/src/polyvec_lazy.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+#include "polyvec_lazy.h"
+
+#include "debug.h"
+
+/* This namespacing is not done at the top to avoid a naming conflict
+ * with native backends, which are currently not yet namespaced. */
+#define mld_polymat_permute_bitrev_to_custom \
+  MLD_ADD_PARAM_SET(mld_polymat_permute_bitrev_to_custom)
+#define mld_polyvecl_permute_bitrev_to_custom \
+  MLD_ADD_PARAM_SET(mld_polyvecl_permute_bitrev_to_custom)
+#define mld_polyvecl_pointwise_acc_montgomery_c \
+  MLD_ADD_PARAM_SET(mld_polyvecl_pointwise_acc_montgomery_c)
+
+#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
+
+static void mld_polyvecl_permute_bitrev_to_custom(mld_polyvecl *v)
+__contract__(
+  /* We don't specify that this should be a permutation, but only
+   * that it does not change the bound established at the end of
+   * mld_polyvec_matrix_expand.
+   */
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(x, 0, MLDSA_L,
+    array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  assigns(memory_slice(v, sizeof(mld_polyvecl)))
+  ensures(forall(x, 0, MLDSA_L,
+    array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+{
+#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
+  unsigned i;
+  for (i = 0; i < MLDSA_L; i++)
+  __loop__(
+     assigns(i, memory_slice(v, sizeof(mld_polyvecl)))
+     invariant(i <= MLDSA_L)
+     invariant(forall(x, 0, MLDSA_L,
+       array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+     decreases(MLDSA_L - i))
+  {
+    mld_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+  }
+#else  /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
+  /* Nothing to do */
+  (void)v;
+#endif /* !MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mld_polymat_permute_bitrev_to_custom(mld_polymat_eager *mat)
+__contract__(
+  /* We don't specify that this should be a permutation, but only
+   * that it does not change the bound established at the end of
+   * mld_polyvec_matrix_expand.
+   */
+  requires(memory_no_alias(mat, sizeof(mld_polymat_eager)))
+  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+    array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  assigns(memory_slice(mat, sizeof(mld_polymat_eager)))
+  ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+    array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+)
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_K; i++)
+  __loop__(
+    assigns(i, memory_slice(mat, sizeof(mld_polymat_eager)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+      array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    decreases(MLDSA_K - i))
+  {
+    mld_polyvecl_permute_bitrev_to_custom(&mat->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyvec_matrix_expand_eager(mld_polymat_eager *mat,
+                                     const uint8_t rho[MLDSA_SEEDBYTES])
+{
+  unsigned int i, j;
+  /*
+   * We generate four separate seed arrays rather than a single one to work
+   * around limitations in CBMC function contracts dealing with disjoint slices
+   * of the same parent object.
+   */
+
+  MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
+
+  for (j = 0; j < 4; j++)
+  __loop__(
+    assigns(j, object_whole(seed_ext))
+    invariant(j <= 4)
+    decreases(4 - j)
+  )
+  {
+    mld_memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES);
+  }
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && !defined(MLD_CONFIG_REDUCE_RAM)
+  /* Sample 4 matrix entries a time. */
+  for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4)
+  __loop__(
+    assigns(i, j, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat_eager)))
+    invariant(i <= (MLDSA_K * MLDSA_L / 4) * 4 && i % 4 == 0)
+    /* vectors 0 .. i / MLDSA_L are completely sampled */
+    invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
+      array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    /* last vector is sampled up to i % MLDSA_L */
+    invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L,
+      array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    decreases((MLDSA_K * MLDSA_L / 4) * 4 - i)
+  )
+  {
+    for (j = 0; j < 4; j++)
+    __loop__(
+      assigns(j, object_whole(seed_ext))
+      invariant(j <= 4)
+      decreases(4 - j)
+    )
+    {
+      uint8_t x = (uint8_t)((i + j) / MLDSA_L);
+      uint8_t y = (uint8_t)((i + j) % MLDSA_L);
+
+      seed_ext[j][MLDSA_SEEDBYTES + 0] = y;
+      seed_ext[j][MLDSA_SEEDBYTES + 1] = x;
+    }
+
+    mld_poly_uniform_4x(&mat->vec[i / MLDSA_L].vec[i % MLDSA_L],
+                        &mat->vec[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L],
+                        &mat->vec[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L],
+                        &mat->vec[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L],
+                        seed_ext);
+  }
+#else  /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && !MLD_CONFIG_REDUCE_RAM */
+  i = 0;
+#endif /* !(!MLD_CONFIG_SERIAL_FIPS202_ONLY && !MLD_CONFIG_REDUCE_RAM) */
+
+  /* Entries omitted by the batch-sampling are sampled individually. */
+  while (i < MLDSA_K * MLDSA_L)
+  __loop__(
+    assigns(i, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat_eager)))
+    invariant(i <= MLDSA_K * MLDSA_L)
+    /* vectors 0 .. i / MLDSA_L are completely sampled */
+    invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
+      array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    /* last vector is sampled up to i % MLDSA_L */
+    invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L,
+      array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    decreases(MLDSA_K * MLDSA_L - i)
+  )
+  {
+    uint8_t x = (uint8_t)(i / MLDSA_L);
+    uint8_t y = (uint8_t)(i % MLDSA_L);
+    mld_poly *this_poly = &mat->vec[i / MLDSA_L].vec[i % MLDSA_L];
+
+    seed_ext[0][MLDSA_SEEDBYTES + 0] = y;
+    seed_ext[0][MLDSA_SEEDBYTES + 1] = x;
+
+    mld_poly_uniform(this_poly, seed_ext[0]);
+    i++;
+  }
+
+  mld_polymat_permute_bitrev_to_custom(mat);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(seed_ext, sizeof(seed_ext));
+}
+
+MLD_STATIC_TESTABLE void mld_polyvecl_pointwise_acc_montgomery_c(
+    mld_poly *w, const mld_polyvecl *u, const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(w, sizeof(mld_poly)))
+  requires(memory_no_alias(u, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(l0, 0, MLDSA_L,
+                  array_bound(u->vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  requires(forall(l1, 0, MLDSA_L,
+    array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(w, sizeof(mld_poly)))
+  ensures(array_abs_bound(w->coeffs, 0, MLDSA_N, MLDSA_Q))
+)
+{
+  unsigned int i, j;
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+  for (i = 0; i < MLDSA_N; i++)
+  __loop__(
+    assigns(i, j, memory_slice(w, sizeof(mld_poly)))
+    invariant(i <= MLDSA_N)
+    invariant(array_abs_bound(w->coeffs, 0, i, MLDSA_Q))
+    decreases(MLDSA_N - i)
+  )
+  {
+    int64_t t = 0;
+    int32_t r;
+    for (j = 0; j < MLDSA_L; j++)
+    __loop__(
+      assigns(j, t)
+      invariant(j <= MLDSA_L)
+      invariant(t >= -(int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
+      invariant(t <= (int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
+      decreases(MLDSA_L - j)
+    )
+    {
+      t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i];
+    }
+
+    r = mld_montgomery_reduce(t);
+    w->coeffs[i] = r;
+  }
+
+  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+}
+
+MLD_INTERNAL_API
+void mld_polyvecl_pointwise_acc_montgomery_eager(mld_poly *w,
+                                                 const mld_polymat_eager *mat,
+                                                 unsigned int k,
+                                                 const mld_polyvecl *v)
+{
+  const mld_polyvecl *u = &mat->vec[k];
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
+    MLD_CONFIG_PARAMETER_SET == 44
+  {
+    int ret;
+    ret = mld_polyvecl_pointwise_acc_montgomery_l4_native(
+        w->coeffs, (const int32_t (*)[MLDSA_N])u->vec,
+        (const int32_t (*)[MLDSA_N])v->vec);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
+#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) && \
+    MLD_CONFIG_PARAMETER_SET == 65
+  {
+    int ret;
+    ret = mld_polyvecl_pointwise_acc_montgomery_l5_native(
+        w->coeffs, (const int32_t (*)[MLDSA_N])u->vec,
+        (const int32_t (*)[MLDSA_N])v->vec);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
+#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) && \
+    MLD_CONFIG_PARAMETER_SET == 87
+  {
+    int ret;
+    ret = mld_polyvecl_pointwise_acc_montgomery_l7_native(
+        w->coeffs, (const int32_t (*)[MLDSA_N])u->vec,
+        (const int32_t (*)[MLDSA_N])v->vec);
+    if (ret == MLD_NATIVE_FUNC_SUCCESS)
+    {
+      mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+      return;
+    }
+  }
+#endif /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
+          MLD_CONFIG_PARAMETER_SET == 44) &&                       \
+          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
+          MLD_CONFIG_PARAMETER_SET == 65) &&                       \
+          MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 &&   \
+          MLD_CONFIG_PARAMETER_SET == 87 */
+  /* The first input is bounded by [0, Q-1] inclusive
+   * The second input is bounded by [-9Q+1, 9Q-1] inclusive . Hence, we can
+   * safely accumulate in 64-bits without intermediate reductions as
+   * MLDSA_L * (MLD_NTT_BOUND-1) * (Q-1) < INT64_MAX
+   *
+   * The worst case is ML-DSA-87: 7 * (9Q-1) * (Q-1) < 2**52
+   * (and likewise for negative values)
+   */
+  mld_polyvecl_pointwise_acc_montgomery_c(w, u, v);
+}
+
+MLD_INTERNAL_API
+void mld_polyvec_matrix_pointwise_montgomery_eager(mld_polyveck *t,
+                                                   mld_polymat_eager *mat,
+                                                   const mld_polyvecl *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(t, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, 0, i,
+                     array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+    decreases(MLDSA_K - i)
+  )
+  {
+    mld_polyvecl_pointwise_acc_montgomery_eager(&t->vec[i], mat, i, v);
+  }
+
+  mld_assert_abs_bound_2d(t->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+}
+
+#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
+
+#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
+
+MLD_INTERNAL_API
+void mld_polyvec_matrix_expand_lazy(mld_polymat_lazy *mat,
+                                    const uint8_t rho[MLDSA_SEEDBYTES])
+{
+  mld_memcpy(mat->rho, rho, MLDSA_SEEDBYTES);
+}
+
+MLD_INTERNAL_API
+void mld_polyvec_matrix_pointwise_montgomery_lazy(mld_polyveck *t,
+                                                  mld_polymat_lazy *mat,
+                                                  const mld_polyvecl *v)
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_K; ++i)
+  {
+    mld_polyvecl_pointwise_acc_montgomery_lazy(&t->vec[i], mat, i, v);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyvecl_pointwise_acc_montgomery_lazy(mld_poly *w,
+                                                mld_polymat_lazy *mat,
+                                                unsigned int k,
+                                                const mld_polyvecl *v)
+{
+  unsigned int l;
+  const mld_poly *a_kl = mld_polymat_get_poly_lazy(mat, k, 0);
+  mld_poly_pointwise_montgomery(w, a_kl, &v->vec[0]);
+  for (l = 1; l < MLDSA_L; l++)
+  {
+    a_kl = mld_polymat_get_poly_lazy(mat, k, l);
+    mld_poly_pointwise_montgomery(&mat->tmp, a_kl, &v->vec[l]);
+    mld_poly_add(w, &mat->tmp);
+  }
+  mld_poly_reduce(w);
+}
+
+#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mld_polymat_permute_bitrev_to_custom
+#undef mld_polyvecl_permute_bitrev_to_custom
+#undef mld_polyvecl_pointwise_acc_montgomery_c
diff --git a/mldsa/src/polyvec_lazy.h b/mldsa/src/polyvec_lazy.h
index fbc0ec60b..e4510b082 100644
--- a/mldsa/src/polyvec_lazy.h
+++ b/mldsa/src/polyvec_lazy.h
@@ -3,6 +3,15 @@
  * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
  */
 
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
 /*
  * Eager and lazy variants of polynomial vector types.
  *
@@ -44,6 +53,25 @@
 #define mld_sk_t0hat_get_poly_eager \
   MLD_ADD_PARAM_SET(mld_sk_t0hat_get_poly_eager)
 #define mld_sk_t0hat_get_poly_lazy MLD_ADD_PARAM_SET(mld_sk_t0hat_get_poly_lazy)
+#define mld_polymat MLD_ADD_PARAM_SET(mld_polymat)
+#define mld_polymat_eager MLD_ADD_PARAM_SET(mld_polymat_eager)
+#define mld_polymat_lazy MLD_ADD_PARAM_SET(mld_polymat_lazy)
+#define mld_polymat_get_row_eager MLD_ADD_PARAM_SET(mld_polymat_get_row_eager)
+#define mld_polymat_get_poly_lazy MLD_ADD_PARAM_SET(mld_polymat_get_poly_lazy)
+#define mld_polyvec_matrix_expand_eager \
+  MLD_NAMESPACE_KL(polyvec_matrix_expand_eager)
+#define mld_polyvec_matrix_expand_lazy \
+  MLD_NAMESPACE_KL(polyvec_matrix_expand_lazy)
+#define mld_polyvec_matrix_pointwise_montgomery_eager \
+  MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery_eager)
+#define mld_polyvec_matrix_pointwise_montgomery_lazy \
+  MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery_lazy)
+#define mld_polyvecl_pointwise_acc_montgomery_eager \
+  MLD_NAMESPACE_KL(polyvecl_pointwise_acc_montgomery_eager)
+#define mld_polyvecl_pointwise_acc_montgomery_lazy \
+  MLD_NAMESPACE_KL(polyvecl_pointwise_acc_montgomery_lazy)
+#define mld_poly_permute_bitrev_to_custom_optional \
+  MLD_ADD_PARAM_SET(mld_poly_permute_bitrev_to_custom_optional)
 /* End of parameter set namespacing */
 
 /* Eager: precompute and store full NTT'd vector */
@@ -183,27 +211,219 @@ static MLD_INLINE void mld_sk_t0hat_get_poly_lazy(mld_poly *buf,
 }
 #endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
 
+/* polymat */
+
+/* Eager: precompute and store full matrix */
+typedef struct
+{
+  mld_polyvecl vec[MLDSA_K];
+} mld_polymat_eager;
+
+/* Lazy: store seed, sample elements on demand.
+ * poly_buffer holds the on-demand sampled matrix element A[k][l].
+ * tmp is needed as scratch space for pointwise multiplication. */
+typedef struct
+{
+  mld_poly poly_buffer;
+  mld_poly tmp;
+  uint8_t rho[MLDSA_SEEDBYTES];
+} mld_polymat_lazy;
+
+#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE const mld_polyvecl *mld_polymat_get_row_eager(
+    mld_polymat_eager *mat, unsigned int row)
+{
+  return &mat->vec[row];
+}
+#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
+
+#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
+static MLD_INLINE void mld_poly_permute_bitrev_to_custom_optional(mld_poly *p)
+{
+#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
+  mld_poly_permute_bitrev_to_custom(p->coeffs);
+#else
+  (void)p;
+#endif
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE const mld_poly *mld_polymat_get_poly_lazy(
+    mld_polymat_lazy *mat, unsigned int k, unsigned int l)
+{
+  MLD_ALIGN uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
+  mld_memcpy(seed_ext, mat->rho, MLDSA_SEEDBYTES);
+  seed_ext[MLDSA_SEEDBYTES + 0] = (uint8_t)l;
+  seed_ext[MLDSA_SEEDBYTES + 1] = (uint8_t)k;
+  mld_poly_uniform(&mat->poly_buffer, seed_ext);
+  mld_poly_permute_bitrev_to_custom_optional(&mat->poly_buffer);
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(seed_ext, sizeof(seed_ext));
+  return &mat->poly_buffer;
+}
+#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
+
+#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
+/*************************************************
+ * Name:        mld_polyvec_matrix_expand_eager
+ *
+ * Description: Implementation of ExpandA. Generates matrix A with uniformly
+ *              random coefficients a_{i,j} by performing rejection
+ *              sampling on the output stream of SHAKE128(rho|j|i)
+ *
+ * Arguments:   - mld_polymat_eager *mat: pointer to output matrix
+ *              - const uint8_t rho[]: byte array containing seed rho
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvec_matrix_expand_eager(mld_polymat_eager *mat,
+                                     const uint8_t rho[MLDSA_SEEDBYTES])
+__contract__(
+  requires(memory_no_alias(mat, sizeof(mld_polymat_eager)))
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  assigns(memory_slice(mat, sizeof(mld_polymat_eager)))
+  ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+    array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+);
+
+/*************************************************
+ * Name:        mld_polyvec_matrix_pointwise_montgomery_eager
+ *
+ * Description: Compute matrix-vector multiplication in NTT domain with
+ *              pointwise multiplication and multiplication by 2^{-32}.
+ *              Input matrix and vector must be in NTT domain representation.
+ *
+ *              The first input "mat" must be the output of
+ *              polyvec_matrix_expand() and so have coefficients in [0, Q-1]
+ *              inclusive.
+ *
+ *              The second input "v" is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by [-9q+1, +9q-1]
+ *              inclusive.
+ *
+ * Arguments:   - mld_polyveck *t: pointer to output vector t
+ *              - mld_polymat_eager *mat: pointer to input matrix
+ *              - const mld_polyvecl *v: pointer to input vector v
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvec_matrix_pointwise_montgomery_eager(mld_polyveck *t,
+                                                   mld_polymat_eager *mat,
+                                                   const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(t, sizeof(mld_polyveck)))
+  requires(memory_no_alias(mat, sizeof(mld_polymat_eager)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+                                         array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  requires(forall(l1, 0, MLDSA_L,
+                  array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(t, sizeof(mld_polyveck)))
+  ensures(forall(k0, 0, MLDSA_K,
+                 array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+);
+
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_eager
+ *
+ * Description: Compute one row of the matrix-vector multiplication
+ *              w = (A * v)[k] in NTT domain. The matrix is the eager
+ *              precomputed matrix from mld_polyvec_matrix_expand_eager,
+ *              the vector is in NTT domain.
+ *
+ *              Pointwise multiplies row k of A with v, accumulates,
+ *              and multiplies by 2^{-32}. Dispatches to the native
+ *              implementation when available.
+ *
+ * Arguments:   - mld_poly *w: pointer to output polynomial w (row k of A*v)
+ *              - const mld_polymat_eager *mat: pointer to input matrix
+ *              - unsigned int k: row index, must be < MLDSA_K
+ *              - const mld_polyvecl *v: pointer to input vector v
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_pointwise_acc_montgomery_eager(mld_poly *w,
+                                                 const mld_polymat_eager *mat,
+                                                 unsigned int k,
+                                                 const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(w, sizeof(mld_poly)))
+  requires(memory_no_alias(mat, sizeof(mld_polymat_eager)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(k < MLDSA_K)
+  requires(forall(l0, 0, MLDSA_L,
+                  array_bound(mat->vec[k].vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  requires(forall(l1, 0, MLDSA_L,
+    array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(w, sizeof(mld_poly)))
+  ensures(array_abs_bound(w->coeffs, 0, MLDSA_N, MLDSA_Q))
+);
+#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
+
+#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)
+MLD_INTERNAL_API
+void mld_polyvec_matrix_expand_lazy(mld_polymat_lazy *mat,
+                                    const uint8_t rho[MLDSA_SEEDBYTES]);
+
+MLD_INTERNAL_API
+void mld_polyvec_matrix_pointwise_montgomery_lazy(mld_polyveck *t,
+                                                  mld_polymat_lazy *mat,
+                                                  const mld_polyvecl *v);
+
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery_lazy
+ *
+ * Description: Compute one row of the matrix-vector multiplication
+ *              w = (A * v)[k] in NTT domain. The matrix is sampled
+ *              on-the-fly from mat->rho via per-element rejection
+ *              sampling, the vector is in NTT domain.
+ *
+ *              Uses the internal mat->tmp scratch for the per-column
+ *              pointwise products.
+ *
+ * Arguments:   - mld_poly *w: pointer to output polynomial w (row k of A*v)
+ *              - mld_polymat_lazy *mat: pointer to (lazy) input matrix
+ *              - unsigned int k: row index, must be < MLDSA_K
+ *              - const mld_polyvecl *v: pointer to input vector v
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_pointwise_acc_montgomery_lazy(mld_poly *w,
+                                                mld_polymat_lazy *mat,
+                                                unsigned int k,
+                                                const mld_polyvecl *v);
+#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */
+
 /* Dispatch: typedef and define based on MLD_CONFIG_REDUCE_RAM */
 #if defined(MLD_CONFIG_REDUCE_RAM)
 typedef mld_sk_s1hat_lazy mld_sk_s1hat;
 typedef mld_sk_s2hat_lazy mld_sk_s2hat;
 typedef mld_sk_t0hat_lazy mld_sk_t0hat;
+typedef mld_polymat_lazy mld_polymat;
 #define mld_unpack_sk_s1hat mld_unpack_sk_s1hat_lazy
 #define mld_sk_s1hat_get_poly mld_sk_s1hat_get_poly_lazy
 #define mld_unpack_sk_s2hat mld_unpack_sk_s2hat_lazy
 #define mld_sk_s2hat_get_poly mld_sk_s2hat_get_poly_lazy
 #define mld_unpack_sk_t0hat mld_unpack_sk_t0hat_lazy
 #define mld_sk_t0hat_get_poly mld_sk_t0hat_get_poly_lazy
+#define mld_polyvec_matrix_expand mld_polyvec_matrix_expand_lazy
+#define mld_polyvec_matrix_pointwise_montgomery \
+  mld_polyvec_matrix_pointwise_montgomery_lazy
+#define mld_polyvecl_pointwise_acc_montgomery \
+  mld_polyvecl_pointwise_acc_montgomery_lazy
 #else /* MLD_CONFIG_REDUCE_RAM */
 typedef mld_sk_s1hat_eager mld_sk_s1hat;
 typedef mld_sk_s2hat_eager mld_sk_s2hat;
 typedef mld_sk_t0hat_eager mld_sk_t0hat;
+typedef mld_polymat_eager mld_polymat;
 #define mld_unpack_sk_s1hat mld_unpack_sk_s1hat_eager
 #define mld_sk_s1hat_get_poly mld_sk_s1hat_get_poly_eager
 #define mld_unpack_sk_s2hat mld_unpack_sk_s2hat_eager
 #define mld_sk_s2hat_get_poly mld_sk_s2hat_get_poly_eager
 #define mld_unpack_sk_t0hat mld_unpack_sk_t0hat_eager
 #define mld_sk_t0hat_get_poly mld_sk_t0hat_get_poly_eager
+#define mld_polyvec_matrix_expand mld_polyvec_matrix_expand_eager
+#define mld_polyvec_matrix_pointwise_montgomery \
+  mld_polyvec_matrix_pointwise_montgomery_eager
+#define mld_polyvecl_pointwise_acc_montgomery \
+  mld_polyvecl_pointwise_acc_montgomery_eager
 #endif /* !MLD_CONFIG_REDUCE_RAM */
 
 #endif /* !MLD_POLYVEC_LAZY_H */
diff --git a/mldsa/src/sign.c b/mldsa/src/sign.c
index 4b15b1a50..79c339dc4 100644
--- a/mldsa/src/sign.c
+++ b/mldsa/src/sign.c
@@ -48,9 +48,7 @@
 #define mld_compute_pack_z MLD_ADD_PARAM_SET(mld_compute_pack_z)
 #define mld_attempt_signature_generation \
   MLD_ADD_PARAM_SET(mld_attempt_signature_generation) MLD_CONTEXT_PARAMETERS_8
-#define mld_compute_t0_t1_tr_from_sk_components              \
-  MLD_ADD_PARAM_SET(mld_compute_t0_t1_tr_from_sk_components) \
-  MLD_CONTEXT_PARAMETERS_7
+#define mld_compute_t0k_t1k MLD_ADD_PARAM_SET(mld_compute_t0k_t1k)
 /* End of parameter set namespacing */
 
 
@@ -196,67 +194,55 @@ __contract__(
 }
 
 /*************************************************
- * Name:        mld_compute_t0_t1_tr_from_sk_components
+ * Name:        mld_compute_t0k_t1k
  *
- * Description: Computes t0, t1, tr, and pk from secret key components
- *              rho, s1, s2. This is the shared computation used by
- *              both keygen and generating the public key from the
- *              secret key.
+ * Description: Compute row k of t = A*s1hat + s2, and decompose into
+ *              t0[k] and t1[k] via power2round. This is the per-row
+ *              shared computation used by both keygen and pk_from_sk
+ *              to avoid materializing the full t/t0/t1 polyveck.
  *
- * Arguments:   - mld_polyveck *t0: output t0
- *              - mld_polyveck *t1: output t1
- *              - uint8_t tr[MLDSA_TRBYTES]: output tr
- *              - uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: output public key
- *              - const uint8_t rho[MLDSA_SEEDBYTES]: input rho
- *              - const mld_polyvecl *s1: input s1
- *              - const mld_polyveck *s2: input s2
+ *              The accumulator scratch tk is reused as t0k by the
+ *              caller (the contract of mld_poly_power2round requires
+ *              t0k == tk).
+ *
+ * Arguments:   - mld_poly *t0k: output t0 polynomial (must alias tk)
+ *              - mld_poly *t1k: output t1 polynomial
+ *              - mld_poly *tk: scratch / accumulator for the matrix row
+ *              - mld_polymat *mat: expanded matrix (sampled lazily in
+ *                                    REDUCE_RAM mode)
+ *              - unsigned int k: row index, must be < MLDSA_K
+ *              - const mld_polyvecl *s1hat: s1 in NTT domain
+ *              - const mld_poly *s2k: s2 polynomial for row k
  **************************************************/
-MLD_MUST_CHECK_RETURN_VALUE
-static int mld_compute_t0_t1_tr_from_sk_components(
-    mld_polyveck *t0, mld_polyveck *t1, uint8_t tr[MLDSA_TRBYTES],
-    uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[MLDSA_SEEDBYTES],
-    const mld_polyvecl *s1, const mld_polyveck *s2,
-    MLD_CONFIG_CONTEXT_PARAMETER_TYPE context)
+static void mld_compute_t0k_t1k(mld_poly *t0k, mld_poly *t1k, mld_poly *tk,
+                                mld_polymat *mat, unsigned int k,
+                                const mld_polyvecl *s1hat, const mld_poly *s2k)
 __contract__(
-  requires(memory_no_alias(t0, sizeof(mld_polyveck)))
-  requires(memory_no_alias(t1, sizeof(mld_polyveck)))
-  requires(memory_no_alias(tr, MLDSA_TRBYTES))
-  requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
-  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
-  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
-  requires(memory_no_alias(s2, sizeof(mld_polyveck)))
-  requires(forall(l0, 0, MLDSA_L, array_bound(s1->vec[l0].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
-  requires(forall(k0, 0, MLDSA_K, array_bound(s2->vec[k0].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
-  assigns(memory_slice(t0, sizeof(mld_polyveck)))
-  assigns(memory_slice(t1, sizeof(mld_polyveck)))
-  assigns(memory_slice(tr, MLDSA_TRBYTES))
-  assigns(memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES))
-  ensures(forall(k1, 0, MLDSA_K, array_bound(t0->vec[k1].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
-  ensures(forall(k2, 0, MLDSA_K, array_bound(t1->vec[k2].coeffs, 0, MLDSA_N, 0, 1 << 10)))
-  ensures(return_value == 0 || return_value == MLD_ERR_OUT_OF_MEMORY))
+  requires(memory_no_alias(tk, sizeof(mld_poly)))
+  requires(memory_no_alias(t1k, sizeof(mld_poly)))
+  requires(memory_no_alias(mat, sizeof(mld_polymat)))
+  requires(memory_no_alias(s1hat, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s2k, sizeof(mld_poly)))
+  requires(t0k == tk)
+  requires(k < MLDSA_K)
+  requires(forall(l0, 0, MLDSA_L,
+    array_bound(mat->vec[k].vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  requires(forall(l1, 0, MLDSA_L,
+    array_abs_bound(s1hat->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  requires(array_bound(s2k->coeffs, 0, MLDSA_N,
+    MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))
+  assigns(memory_slice(tk, sizeof(mld_poly)))
+  assigns(memory_slice(t1k, sizeof(mld_poly)))
+  ensures(array_bound(t0k->coeffs, 0, MLDSA_N,
+    -(1 << (MLDSA_D - 1)) + 1, (1 << (MLDSA_D - 1)) + 1))
+  ensures(array_bound(t1k->coeffs, 0, MLDSA_N, 0, 1 << 10)))
 {
-  int ret;
-  MLD_ALLOC(mat, mld_polymat, 1, context);
-  MLD_ALLOC(s1hat, mld_polyvecl, 1, context);
-  MLD_ALLOC(t, mld_polyveck, 1, context);
-
-  if (mat == NULL || s1hat == NULL || t == NULL)
-  {
-    ret = MLD_ERR_OUT_OF_MEMORY;
-    goto cleanup;
-  }
-
-  /* Expand matrix */
-  mld_polyvec_matrix_expand(mat, rho);
-
-  /* Matrix-vector multiplication */
-  *s1hat = *s1;
-  mld_polyvecl_ntt(s1hat);
-  mld_polyvec_matrix_pointwise_montgomery(t, mat, s1hat);
-  mld_polyveck_invntt_tomont(t);
+  /* Matrix-vector multiplication (row k) */
+  mld_polyvecl_pointwise_acc_montgomery(tk, mat, k, s1hat);
+  mld_poly_invntt_tomont(tk);
 
-  /* Add error vector s2 */
-  mld_polyveck_add(t, s2);
+  /* Add error vector s2[k] */
+  mld_poly_add(tk, s2k);
 
   /* Reference: The following reduction is not present in the reference
    *            implementation. Omitting this reduction requires the output of
@@ -267,24 +253,11 @@ __contract__(
    *            reasoning. We instead add an additional reduction, and can
    *            consequently, relax the bounds requirements for the invntt.
    */
-  mld_polyveck_reduce(t);
+  mld_poly_reduce(tk);
 
-  /* Decompose to get t1, t0 */
-  mld_polyveck_caddq(t);
-  mld_polyveck_power2round(t1, t0, t);
-
-  /* Pack public key and compute tr */
-  mld_pack_pk(pk, rho, t1);
-  mld_shake256(tr, MLDSA_TRBYTES, pk, MLDSA_CRYPTO_PUBLICKEYBYTES);
-
-  ret = 0;
-
-cleanup:
-  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
-  MLD_FREE(t, mld_polyveck, 1, context);
-  MLD_FREE(s1hat, mld_polyvecl, 1, context);
-  MLD_FREE(mat, mld_polymat, 1, context);
-  return ret;
+  /* Decompose to get t1[k], t0[k] */
+  mld_poly_caddq(tk);
+  mld_poly_power2round(t1k, t0k, tk);
 }
 
 MLD_MUST_CHECK_RETURN_VALUE
@@ -294,18 +267,21 @@ int mld_sign_keypair_internal(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
                               const uint8_t seed[MLDSA_SEEDBYTES],
                               MLD_CONFIG_CONTEXT_PARAMETER_TYPE context)
 {
+  unsigned int k;
   int ret;
   const uint8_t *rho, *rhoprime, *key;
+
   MLD_ALLOC(seedbuf, uint8_t, 2 * MLDSA_SEEDBYTES + MLDSA_CRHBYTES, context);
   MLD_ALLOC(inbuf, uint8_t, MLDSA_SEEDBYTES + 2, context);
   MLD_ALLOC(tr, uint8_t, MLDSA_TRBYTES, context);
   MLD_ALLOC(s1, mld_polyvecl, 1, context);
   MLD_ALLOC(s2, mld_polyveck, 1, context);
-  MLD_ALLOC(t1, mld_polyveck, 1, context);
-  MLD_ALLOC(t0, mld_polyveck, 1, context);
+  MLD_ALLOC(mat, mld_polymat, 1, context);
+  MLD_ALLOC(t0k, mld_poly, 1, context);
+  MLD_ALLOC(t1k, mld_poly, 1, context);
 
   if (seedbuf == NULL || inbuf == NULL || tr == NULL || s1 == NULL ||
-      s2 == NULL || t1 == NULL || t0 == NULL)
+      s2 == NULL || mat == NULL || t0k == NULL || t1k == NULL)
   {
     ret = MLD_ERR_OUT_OF_MEMORY;
     goto cleanup;
@@ -327,24 +303,52 @@ int mld_sign_keypair_internal(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
   /* Sample s1 and s2 */
   mld_sample_s1_s2(s1, s2, rhoprime);
 
-  /* Compute t0, t1, tr, and pk from rho, s1, s2 */
-  ret = mld_compute_t0_t1_tr_from_sk_components(t0, t1, tr, pk, rho, s1, s2,
-                                                context);
-  if (ret != 0)
+  /* Pack s1 into sk before NTT */
+  mld_pack_sk_s1(sk, s1);
+
+  /* NTT s1 in-place to use as s1hat */
+  mld_polyvecl_ntt(s1);
+
+  /* Expand matrix */
+  mld_polyvec_matrix_expand(mat, rho);
+
+  /* Pack rho into pk */
+  mld_memcpy(pk, rho, MLDSA_SEEDBYTES);
+
+  /* Compute t row by row, decompose into t1[k]/t0[k], and pack
+   * immediately into pk and sk. The t0k buffer aliases the accumulator
+   * scratch (last argument), as required by mld_poly_power2round. */
+  for (k = 0; k < MLDSA_K; k++)
+  __loop__(
+    assigns(k, memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES),
+            memory_slice(sk, MLDSA_CRYPTO_SECRETKEYBYTES),
+            memory_slice(t0k, sizeof(mld_poly)),
+            memory_slice(t1k, sizeof(mld_poly)))
+    invariant(k <= MLDSA_K)
+    decreases(MLDSA_K - k)
+  )
   {
-    goto cleanup;
+    mld_compute_t0k_t1k(t0k, t1k, t0k, mat, k, s1, &s2->vec[k]);
+    mld_pack_pk_t1(pk, k, t1k);
+    mld_pack_sk_t0(sk, k, t0k);
   }
 
-  /* Pack secret key */
-  mld_pack_sk(sk, rho, tr, key, t0, s1, s2);
+  /* Compute tr = H(pk) */
+  mld_shake256(tr, MLDSA_TRBYTES, pk, MLDSA_CRYPTO_PUBLICKEYBYTES);
+
+  /* Pack remaining secret key components (s1 and t0 already packed) */
+  mld_pack_sk_rho_key_tr_s2(sk, rho, tr, key, s2);
 
   /* Constant time: pk is the public key, inherently public data */
   MLD_CT_TESTING_DECLASSIFY(pk, MLDSA_CRYPTO_PUBLICKEYBYTES);
 
+  ret = 0;
+
 cleanup:
   /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
-  MLD_FREE(t0, mld_polyveck, 1, context);
-  MLD_FREE(t1, mld_polyveck, 1, context);
+  MLD_FREE(t1k, mld_poly, 1, context);
+  MLD_FREE(t0k, mld_poly, 1, context);
+  MLD_FREE(mat, mld_polymat, 1, context);
   MLD_FREE(s2, mld_polyveck, 1, context);
   MLD_FREE(s1, mld_polyvecl, 1, context);
   MLD_FREE(tr, uint8_t, MLDSA_TRBYTES, context);
@@ -1464,6 +1468,7 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
                         const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES],
                         MLD_CONFIG_CONTEXT_PARAMETER_TYPE context)
 {
+  unsigned int k;
   uint8_t check, cmp0, cmp1, chk1, chk2;
   int ret;
   MLD_ALLOC(rho, uint8_t, MLDSA_SEEDBYTES, context);
@@ -1472,45 +1477,67 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
   MLD_ALLOC(key, uint8_t, MLDSA_SEEDBYTES, context);
   MLD_ALLOC(s1, mld_polyvecl, 1, context);
   MLD_ALLOC(s2, mld_polyveck, 1, context);
-  MLD_ALLOC(t0, mld_polyveck, 1, context);
-  MLD_ALLOC(t0_computed, mld_polyveck, 1, context);
-  MLD_ALLOC(t1, mld_polyveck, 1, context);
+  MLD_ALLOC(mat, mld_polymat, 1, context);
+  MLD_ALLOC(t0k, mld_poly, 1, context);
+  MLD_ALLOC(t1k, mld_poly, 1, context);
+  MLD_ALLOC(t0k_packed, uint8_t, MLDSA_POLYT0_PACKEDBYTES, context);
 
   if (rho == NULL || tr == NULL || tr_computed == NULL || key == NULL ||
-      s1 == NULL || s2 == NULL || t0 == NULL || t0_computed == NULL ||
-      t1 == NULL)
+      s1 == NULL || s2 == NULL || mat == NULL || t0k == NULL || t1k == NULL ||
+      t0k_packed == NULL)
   {
     ret = MLD_ERR_OUT_OF_MEMORY;
     goto cleanup;
   }
 
   /* Inline unpack_sk: mld_unpack_sk uses lazy types for s1/s2/t0 which
-   * we cannot use here.  */
+   * we cannot use here. t0 stays in packed form -- we compare it
+   * row-by-row against the recomputed value. */
   mld_memcpy(rho, sk, MLDSA_SEEDBYTES);
   mld_memcpy(key, sk + MLDSA_SEEDBYTES, MLDSA_SEEDBYTES);
   mld_memcpy(tr, sk + 2 * MLDSA_SEEDBYTES, MLDSA_TRBYTES);
   mld_polyvecl_unpack_eta(s1, sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES);
   mld_polyveck_unpack_eta(s2, sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES +
                                   MLDSA_L * MLDSA_POLYETA_PACKEDBYTES);
-  mld_polyveck_unpack_t0(t0, sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES +
-                                 MLDSA_L * MLDSA_POLYETA_PACKEDBYTES +
-                                 MLDSA_K * MLDSA_POLYETA_PACKEDBYTES);
 
   /* Validate s1 and s2 coefficients are within [-MLDSA_ETA, MLDSA_ETA] */
   chk1 = mld_polyvecl_chknorm(s1, MLDSA_ETA + 1) & 0xFF;
   chk2 = mld_polyveck_chknorm(s2, MLDSA_ETA + 1) & 0xFF;
 
-  /* Recompute t0, t1, tr, and pk from rho, s1, s2 */
-  ret = mld_compute_t0_t1_tr_from_sk_components(t0_computed, t1, tr_computed,
-                                                pk, rho, s1, s2, context);
-  if (ret != 0)
+  /* NTT s1 in place to use as s1hat */
+  mld_polyvecl_ntt(s1);
+
+  /* Expand matrix and pack rho into pk */
+  mld_polyvec_matrix_expand(mat, rho);
+  mld_memcpy(pk, rho, MLDSA_SEEDBYTES);
+
+  /* Recompute t row by row, decompose into t1[k]/t0[k], pack t1[k]
+   * into pk, and compare the packed t0[k] against the value stored
+   * in sk. */
+  cmp0 = 0;
+  for (k = 0; k < MLDSA_K; k++)
+  __loop__(
+    assigns(k, cmp0, memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES),
+            memory_slice(t0k, sizeof(mld_poly)),
+            memory_slice(t1k, sizeof(mld_poly)),
+            memory_slice(t0k_packed, MLDSA_POLYT0_PACKEDBYTES))
+    invariant(k <= MLDSA_K)
+    decreases(MLDSA_K - k)
+  )
   {
-    goto cleanup;
+    mld_compute_t0k_t1k(t0k, t1k, t0k, mat, k, s1, &s2->vec[k]);
+    mld_pack_pk_t1(pk, k, t1k);
+    mld_polyt0_pack(t0k_packed, t0k);
+    cmp0 |= mld_ct_memcmp(t0k_packed,
+                          sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES +
+                              MLDSA_L * MLDSA_POLYETA_PACKEDBYTES +
+                              MLDSA_K * MLDSA_POLYETA_PACKEDBYTES +
+                              k * MLDSA_POLYT0_PACKEDBYTES,
+                          MLDSA_POLYT0_PACKEDBYTES);
   }
 
-  /* Validate t0 and tr using constant-time comparisons */
-  cmp0 = mld_ct_memcmp((const uint8_t *)t0, (const uint8_t *)t0_computed,
-                       sizeof(mld_polyveck));
+  /* Compute tr_computed = H(pk) and compare to the stored tr */
+  mld_shake256(tr_computed, MLDSA_TRBYTES, pk, MLDSA_CRYPTO_PUBLICKEYBYTES);
   cmp1 = mld_ct_memcmp((const uint8_t *)tr, (const uint8_t *)tr_computed,
                        MLDSA_TRBYTES);
   check = mld_value_barrier_u8(cmp0 | cmp1 | chk1 | chk2);
@@ -1530,9 +1557,10 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
   MLD_CT_TESTING_DECLASSIFY(pk, MLDSA_CRYPTO_PUBLICKEYBYTES);
 
   /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
-  MLD_FREE(t1, mld_polyveck, 1, context);
-  MLD_FREE(t0_computed, mld_polyveck, 1, context);
-  MLD_FREE(t0, mld_polyveck, 1, context);
+  MLD_FREE(t0k_packed, uint8_t, MLDSA_POLYT0_PACKEDBYTES, context);
+  MLD_FREE(t1k, mld_poly, 1, context);
+  MLD_FREE(t0k, mld_poly, 1, context);
+  MLD_FREE(mat, mld_polymat, 1, context);
   MLD_FREE(s2, mld_polyveck, 1, context);
   MLD_FREE(s1, mld_polyvecl, 1, context);
   MLD_FREE(key, uint8_t, MLDSA_SEEDBYTES, context);
@@ -1552,6 +1580,6 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES],
 #undef mld_H
 #undef mld_compute_pack_z
 #undef mld_attempt_signature_generation
-#undef mld_compute_t0_t1_tr_from_sk_components
+#undef mld_compute_t0k_t1k
 #undef MLD_NONCE_UB
 #undef MLD_PRE_HASH_OID_LEN
diff --git a/proofs/cbmc/attempt_signature_generation/Makefile b/proofs/cbmc/attempt_signature_generation/Makefile
index ebf0a15bb..de9792b97 100644
--- a/proofs/cbmc/attempt_signature_generation/Makefile
+++ b/proofs/cbmc/attempt_signature_generation/Makefile
@@ -22,7 +22,7 @@ PROJECT_SOURCES += $(SRCDIR)/mldsa/src/sign.c
 CHECK_FUNCTION_CONTRACTS=mld_attempt_signature_generation
 USE_FUNCTION_CONTRACTS=mld_polyvecl_uniform_gamma1
 USE_FUNCTION_CONTRACTS+=mld_polyvecl_ntt
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_pointwise_montgomery
+USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_pointwise_montgomery_eager
 USE_FUNCTION_CONTRACTS+=mld_polyveck_invntt_tomont
 USE_FUNCTION_CONTRACTS+=mld_polyveck_caddq
 USE_FUNCTION_CONTRACTS+=mld_polyveck_decompose
diff --git a/proofs/cbmc/compute_t0_t1_tr_from_sk_components/Makefile b/proofs/cbmc/compute_t0_t1_tr_from_sk_components/Makefile
deleted file mode 100644
index c744c23d9..000000000
--- a/proofs/cbmc/compute_t0_t1_tr_from_sk_components/Makefile
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) The mldsa-native project authors
-# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-include ../Makefile_params.common
-
-HARNESS_ENTRY = harness
-HARNESS_FILE = compute_t0_t1_tr_from_sk_components_harness
-
-# This should be a unique identifier for this proof, and will appear on the
-# Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = mld_compute_t0_t1_tr_from_sk_components
-
-DEFINES +=
-INCLUDES +=
-
-REMOVE_FUNCTION_BODY +=
-UNWINDSET +=
-
-PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/sign.c
-
-CHECK_FUNCTION_CONTRACTS=mld_compute_t0_t1_tr_from_sk_components
-USE_FUNCTION_CONTRACTS=mld_shake256
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand
-USE_FUNCTION_CONTRACTS+=mld_polyvecl_ntt
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_pointwise_montgomery
-USE_FUNCTION_CONTRACTS+=mld_polyveck_reduce
-USE_FUNCTION_CONTRACTS+=mld_polyveck_invntt_tomont
-USE_FUNCTION_CONTRACTS+=mld_polyveck_add
-USE_FUNCTION_CONTRACTS+=mld_polyveck_caddq
-USE_FUNCTION_CONTRACTS+=mld_polyveck_power2round
-USE_FUNCTION_CONTRACTS+=mld_pack_pk
-USE_FUNCTION_CONTRACTS+=mld_zeroize
-
-APPLY_LOOP_CONTRACTS=on
-USE_DYNAMIC_FRAMES=1
-
-# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
-EXTERNAL_SAT_SOLVER=
-CBMCFLAGS=--external-smt2-solver $(PROOF_ROOT)/lib/z3_no_bv_extract --z3
-CBMCFLAGS += --slice-formula
-CBMCFLAGS += --no-array-field-sensitivity
-
-FUNCTION_NAME = mld_compute_t0_t1_tr_from_sk_components
-
-# If this proof is found to consume huge amounts of RAM, you can set the
-# EXPENSIVE variable. With new enough versions of the proof tools, this will
-# restrict the number of EXPENSIVE CBMC jobs running at once. See the
-# documentation in Makefile.common under the "Job Pools" heading for details.
-# EXPENSIVE = true
-
-# This function is large enough to need...
-CBMC_OBJECT_BITS = 12
-
-# If you require access to a file-local ("static") function or object to conduct
-# your proof, set the following (and do not include the original source file
-# ("mldsa/poly.c") in PROJECT_SOURCES).
-# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i
-# include ../Makefile.common
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mldsa/src/poly.c
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz
-# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must
-# be set before including Makefile.common, but any use of variables on the
-# left-hand side requires those variables to be defined. Hence, _SOURCE,
-# _FUNCTIONS, _OBJECTS is set after including Makefile.common.
-
-include ../Makefile.common
diff --git a/proofs/cbmc/compute_t0_t1_tr_from_sk_components/compute_t0_t1_tr_from_sk_components_harness.c b/proofs/cbmc/compute_t0_t1_tr_from_sk_components/compute_t0_t1_tr_from_sk_components_harness.c
deleted file mode 100644
index 1fa7ed575..000000000
--- a/proofs/cbmc/compute_t0_t1_tr_from_sk_components/compute_t0_t1_tr_from_sk_components_harness.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "sign.h"
-
-int mld_compute_t0_t1_tr_from_sk_components(
-    mld_polyveck *t0, mld_polyveck *t1, uint8_t tr[MLDSA_TRBYTES],
-    uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[MLDSA_SEEDBYTES],
-    const mld_polyvecl *s1, const mld_polyveck *s2);
-
-void harness(void)
-{
-  mld_polyveck *t0;
-  mld_polyveck *t1;
-  uint8_t *tr;
-  uint8_t *pk;
-  uint8_t *rho;
-  mld_polyvecl *s1;
-  mld_polyveck *s2;
-
-  mld_compute_t0_t1_tr_from_sk_components(t0, t1, tr, pk, rho, s1, s2);
-}
diff --git a/proofs/cbmc/polyvecl_pointwise_acc_montgomery/Makefile b/proofs/cbmc/compute_t0k_t1k/Makefile
similarity index 76%
rename from proofs/cbmc/polyvecl_pointwise_acc_montgomery/Makefile
rename to proofs/cbmc/compute_t0k_t1k/Makefile
index dbf9abf4f..c8ffc001f 100644
--- a/proofs/cbmc/polyvecl_pointwise_acc_montgomery/Makefile
+++ b/proofs/cbmc/compute_t0k_t1k/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyvecl_pointwise_acc_montgomery_harness
+HARNESS_FILE = compute_t0k_t1k_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyvecl_pointwise_acc_montgomery
+PROOF_UID = compute_t0k_t1k
 
 DEFINES +=
 INCLUDES +=
@@ -17,20 +17,25 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
-
-CHECK_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery
-USE_FUNCTION_CONTRACTS= mld_polyvecl_pointwise_acc_montgomery_c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/sign.c
+
+CHECK_FUNCTION_CONTRACTS=mld_compute_t0k_t1k
+USE_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery_eager
+USE_FUNCTION_CONTRACTS+=mld_poly_invntt_tomont
+USE_FUNCTION_CONTRACTS+=mld_poly_add
+USE_FUNCTION_CONTRACTS+=mld_poly_reduce
+USE_FUNCTION_CONTRACTS+=mld_poly_caddq
+USE_FUNCTION_CONTRACTS+=mld_poly_power2round
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
 # Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
 EXTERNAL_SAT_SOLVER=
-CBMCFLAGS=--external-smt2-solver $(PROOF_ROOT)/lib/z3_smt_only --z3
+CBMCFLAGS=--external-smt2-solver $(PROOF_ROOT)/lib/z3_no_bv_extract --z3
 CBMCFLAGS += --slice-formula
 CBMCFLAGS += --no-array-field-sensitivity
 
-FUNCTION_NAME = polyvecl_pointwise_acc_montgomery
+FUNCTION_NAME = compute_t0k_t1k
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
@@ -39,7 +44,7 @@ FUNCTION_NAME = polyvecl_pointwise_acc_montgomery
 # EXPENSIVE = true
 
 # This function is large enough to need...
-CBMC_OBJECT_BITS = 12
+CBMC_OBJECT_BITS = 10
 
 # If you require access to a file-local ("static") function or object to conduct
 # your proof, set the following (and do not include the original source file
diff --git a/proofs/cbmc/compute_t0k_t1k/compute_t0k_t1k_harness.c b/proofs/cbmc/compute_t0k_t1k/compute_t0k_t1k_harness.c
new file mode 100644
index 000000000..de2486d5e
--- /dev/null
+++ b/proofs/cbmc/compute_t0k_t1k/compute_t0k_t1k_harness.c
@@ -0,0 +1,22 @@
+// Copyright (c) The mldsa-native project authors
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+#include "polyvec_lazy.h"
+#include "sign.h"
+
+void mld_compute_t0k_t1k(mld_poly *t0k, mld_poly *t1k, mld_poly *tk,
+                         mld_polymat *mat, unsigned int k,
+                         const mld_polyvecl *s1hat, const mld_poly *s2k);
+
+void harness(void)
+{
+  mld_poly *t0k;
+  mld_poly *t1k;
+  mld_poly *tk;
+  mld_polymat *mat;
+  unsigned int k;
+  mld_polyvecl *s1hat;
+  mld_poly *s2k;
+
+  mld_compute_t0k_t1k(t0k, t1k, tk, mat, k, s1hat, s2k);
+}
diff --git a/proofs/cbmc/pack_pk/Makefile b/proofs/cbmc/pack_pk_t1/Makefile
similarity index 90%
rename from proofs/cbmc/pack_pk/Makefile
rename to proofs/cbmc/pack_pk_t1/Makefile
index de383b72b..a92d82e5e 100644
--- a/proofs/cbmc/pack_pk/Makefile
+++ b/proofs/cbmc/pack_pk_t1/Makefile
@@ -4,30 +4,31 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = pack_pk_harness
+HARNESS_FILE = pack_pk_t1_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = pack_pk
+PROOF_UID = pack_pk_t1
 
 DEFINES +=
 INCLUDES +=
 
 REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
 PROJECT_SOURCES += $(SRCDIR)/mldsa/src/packing.c
 
-CHECK_FUNCTION_CONTRACTS=mld_pack_pk
+CHECK_FUNCTION_CONTRACTS=mld_pack_pk_t1
 USE_FUNCTION_CONTRACTS=mld_polyt1_pack
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
 # Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
 EXTERNAL_SAT_SOLVER=
-CBMCFLAGS=--smt2 --no-array-field-sensitivity --slice-formula
+CBMCFLAGS=--smt2
 
-FUNCTION_NAME = pack_pk
+FUNCTION_NAME = pack_pk_t1
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
@@ -36,7 +37,7 @@ FUNCTION_NAME = pack_pk
 # EXPENSIVE = true
 
 # This function is large enough to need...
-CBMC_OBJECT_BITS = 11
+CBMC_OBJECT_BITS = 9
 
 # If you require access to a file-local ("static") function or object to conduct
 # your proof, set the following (and do not include the original source file
diff --git a/proofs/cbmc/pack_pk_t1/pack_pk_t1_harness.c b/proofs/cbmc/pack_pk_t1/pack_pk_t1_harness.c
new file mode 100644
index 000000000..7e1cee1ed
--- /dev/null
+++ b/proofs/cbmc/pack_pk_t1/pack_pk_t1_harness.c
@@ -0,0 +1,13 @@
+// Copyright (c) The mldsa-native project authors
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+#include "packing.h"
+
+
+void harness(void)
+{
+  uint8_t *pk;
+  unsigned int k;
+  mld_poly *t1k;
+  mld_pack_pk_t1(pk, k, t1k);
+}
diff --git a/proofs/cbmc/polyveck_add/Makefile b/proofs/cbmc/pack_sk_rho_key_tr_s2/Makefile
similarity index 85%
rename from proofs/cbmc/polyveck_add/Makefile
rename to proofs/cbmc/pack_sk_rho_key_tr_s2/Makefile
index c55834a0c..1fec653ad 100644
--- a/proofs/cbmc/polyveck_add/Makefile
+++ b/proofs/cbmc/pack_sk_rho_key_tr_s2/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyveck_add_harness
+HARNESS_FILE = pack_sk_rho_key_tr_s2_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyveck_add
+PROOF_UID = pack_sk_rho_key_tr_s2
 
 DEFINES +=
 INCLUDES +=
@@ -17,18 +17,18 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/packing.c
 
-CHECK_FUNCTION_CONTRACTS=mld_polyveck_add
-USE_FUNCTION_CONTRACTS=mld_poly_add
+CHECK_FUNCTION_CONTRACTS=mld_pack_sk_rho_key_tr_s2
+USE_FUNCTION_CONTRACTS=mld_polyveck_pack_eta
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
 # Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
 EXTERNAL_SAT_SOLVER=
-CBMCFLAGS=--smt2 --arrays-uf-always --slice-formula
+CBMCFLAGS=--smt2
 
-FUNCTION_NAME = polyveck_add
+FUNCTION_NAME = pack_sk_rho_key_tr_s2
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
@@ -37,7 +37,7 @@ FUNCTION_NAME = polyveck_add
 # EXPENSIVE = true
 
 # This function is large enough to need...
-CBMC_OBJECT_BITS = 8
+CBMC_OBJECT_BITS = 9
 
 # If you require access to a file-local ("static") function or object to conduct
 # your proof, set the following (and do not include the original source file
diff --git a/proofs/cbmc/pack_sk/pack_sk_harness.c b/proofs/cbmc/pack_sk_rho_key_tr_s2/pack_sk_rho_key_tr_s2_harness.c
similarity index 67%
rename from proofs/cbmc/pack_sk/pack_sk_harness.c
rename to proofs/cbmc/pack_sk_rho_key_tr_s2/pack_sk_rho_key_tr_s2_harness.c
index 99558ee16..fa88887d5 100644
--- a/proofs/cbmc/pack_sk/pack_sk_harness.c
+++ b/proofs/cbmc/pack_sk_rho_key_tr_s2/pack_sk_rho_key_tr_s2_harness.c
@@ -7,7 +7,6 @@
 void harness(void)
 {
   uint8_t *a, *b, *c, *d;
-  mld_polyveck *t0, *s2;
-  mld_polyvecl *s1;
-  mld_pack_sk(a, b, c, d, t0, s1, s2);
+  mld_polyveck *s2;
+  mld_pack_sk_rho_key_tr_s2(a, b, c, d, s2);
 }
diff --git a/proofs/cbmc/pack_sk/Makefile b/proofs/cbmc/pack_sk_s1/Makefile
similarity index 90%
rename from proofs/cbmc/pack_sk/Makefile
rename to proofs/cbmc/pack_sk_s1/Makefile
index 9f09cff9c..9d77946f5 100644
--- a/proofs/cbmc/pack_sk/Makefile
+++ b/proofs/cbmc/pack_sk_s1/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = pack_sk_harness
+HARNESS_FILE = pack_sk_s1_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = pack_sk
+PROOF_UID = pack_sk_s1
 
 DEFINES +=
 INCLUDES +=
@@ -19,10 +19,8 @@ UNWINDSET +=
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
 PROJECT_SOURCES += $(SRCDIR)/mldsa/src/packing.c
 
-CHECK_FUNCTION_CONTRACTS=mld_pack_sk
+CHECK_FUNCTION_CONTRACTS=mld_pack_sk_s1
 USE_FUNCTION_CONTRACTS=mld_polyvecl_pack_eta
-USE_FUNCTION_CONTRACTS+=mld_polyveck_pack_eta
-USE_FUNCTION_CONTRACTS+=mld_polyveck_pack_t0
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
@@ -30,7 +28,7 @@ USE_DYNAMIC_FRAMES=1
 EXTERNAL_SAT_SOLVER=
 CBMCFLAGS=--smt2
 
-FUNCTION_NAME = pack_sk
+FUNCTION_NAME = pack_sk_s1
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
diff --git a/proofs/cbmc/pack_pk/pack_pk_harness.c b/proofs/cbmc/pack_sk_s1/pack_sk_s1_harness.c
similarity index 71%
rename from proofs/cbmc/pack_pk/pack_pk_harness.c
rename to proofs/cbmc/pack_sk_s1/pack_sk_s1_harness.c
index 695a1884d..5b1b09b44 100644
--- a/proofs/cbmc/pack_pk/pack_pk_harness.c
+++ b/proofs/cbmc/pack_sk_s1/pack_sk_s1_harness.c
@@ -6,7 +6,7 @@
 
 void harness(void)
 {
-  uint8_t *a, *b;
-  mld_polyveck *c;
-  mld_pack_pk(a, b, c);
+  uint8_t *a;
+  mld_polyvecl *s1;
+  mld_pack_sk_s1(a, s1);
 }
diff --git a/proofs/cbmc/polyveck_pack_t0/Makefile b/proofs/cbmc/pack_sk_t0/Makefile
similarity index 87%
rename from proofs/cbmc/polyveck_pack_t0/Makefile
rename to proofs/cbmc/pack_sk_t0/Makefile
index 8c3c745df..b40a50280 100644
--- a/proofs/cbmc/polyveck_pack_t0/Makefile
+++ b/proofs/cbmc/pack_sk_t0/Makefile
@@ -4,21 +4,22 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyveck_pack_t0_harness
+HARNESS_FILE = pack_sk_t0_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyveck_pack_t0
+PROOF_UID = pack_sk_t0
 
 DEFINES +=
 INCLUDES +=
 
 REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/packing.c
 
-CHECK_FUNCTION_CONTRACTS=mld_polyveck_pack_t0
+CHECK_FUNCTION_CONTRACTS=mld_pack_sk_t0
 USE_FUNCTION_CONTRACTS=mld_polyt0_pack
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
@@ -26,10 +27,8 @@ USE_DYNAMIC_FRAMES=1
 # Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
 EXTERNAL_SAT_SOLVER=
 CBMCFLAGS=--smt2
-CBMCFLAGS += --slice-formula
-CBMCFLAGS += --no-array-field-sensitivity
 
-FUNCTION_NAME = polyveck_pack_t0
+FUNCTION_NAME = pack_sk_t0
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
diff --git a/proofs/cbmc/pack_sk_t0/pack_sk_t0_harness.c b/proofs/cbmc/pack_sk_t0/pack_sk_t0_harness.c
new file mode 100644
index 000000000..38db80895
--- /dev/null
+++ b/proofs/cbmc/pack_sk_t0/pack_sk_t0_harness.c
@@ -0,0 +1,13 @@
+// Copyright (c) The mldsa-native project authors
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+#include "packing.h"
+
+
+void harness(void)
+{
+  uint8_t *sk;
+  unsigned int k;
+  mld_poly *t0k;
+  mld_pack_sk_t0(sk, k, t0k);
+}
diff --git a/proofs/cbmc/polymat_permute_bitrev_to_custom/Makefile b/proofs/cbmc/polymat_permute_bitrev_to_custom/Makefile
index 6349d83ac..44a713e78 100644
--- a/proofs/cbmc/polymat_permute_bitrev_to_custom/Makefile
+++ b/proofs/cbmc/polymat_permute_bitrev_to_custom/Makefile
@@ -17,7 +17,7 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
 CHECK_FUNCTION_CONTRACTS=mld_polymat_permute_bitrev_to_custom
 USE_FUNCTION_CONTRACTS=mld_polyvecl_permute_bitrev_to_custom
diff --git a/proofs/cbmc/polymat_permute_bitrev_to_custom/polymat_permute_bitrev_to_custom_harness.c b/proofs/cbmc/polymat_permute_bitrev_to_custom/polymat_permute_bitrev_to_custom_harness.c
index a35bc954e..5aa295436 100644
--- a/proofs/cbmc/polymat_permute_bitrev_to_custom/polymat_permute_bitrev_to_custom_harness.c
+++ b/proofs/cbmc/polymat_permute_bitrev_to_custom/polymat_permute_bitrev_to_custom_harness.c
@@ -1,7 +1,7 @@
 // Copyright (c) The mldsa-native project authors
 // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 
-#include "polyvec.h"
+#include "polyvec_lazy.h"
 
 void mld_polymat_permute_bitrev_to_custom(mld_polymat *mat);
 
diff --git a/proofs/cbmc/polyvec_matrix_expand/Makefile b/proofs/cbmc/polyvec_matrix_expand_eager/Makefile
similarity index 89%
rename from proofs/cbmc/polyvec_matrix_expand/Makefile
rename to proofs/cbmc/polyvec_matrix_expand_eager/Makefile
index 4b7faa6c4..ab910cbd6 100644
--- a/proofs/cbmc/polyvec_matrix_expand/Makefile
+++ b/proofs/cbmc/polyvec_matrix_expand_eager/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyvec_matrix_expand_harness
+HARNESS_FILE = polyvec_matrix_expand_eager_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyvec_matrix_expand
+PROOF_UID = polyvec_matrix_expand_eager
 
 DEFINES +=
 INCLUDES +=
@@ -17,9 +17,9 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
-CHECK_FUNCTION_CONTRACTS=mld_polyvec_matrix_expand
+CHECK_FUNCTION_CONTRACTS=mld_polyvec_matrix_expand_eager
 USE_FUNCTION_CONTRACTS=mld_poly_uniform_4x
 USE_FUNCTION_CONTRACTS+=mld_poly_uniform
 USE_FUNCTION_CONTRACTS+=mld_polymat_permute_bitrev_to_custom
@@ -31,7 +31,7 @@ EXTERNAL_SAT_SOLVER=
 CBMCFLAGS=--external-smt2-solver $(PROOF_ROOT)/lib/z3_no_bv_extract --z3
 CBMCFLAGS += --slice-formula
 
-FUNCTION_NAME = polyvec_matrix_expand
+FUNCTION_NAME = polyvec_matrix_expand_eager
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
diff --git a/proofs/cbmc/polyvec_matrix_expand/polyvec_matrix_expand_harness.c b/proofs/cbmc/polyvec_matrix_expand_eager/polyvec_matrix_expand_eager_harness.c
similarity index 88%
rename from proofs/cbmc/polyvec_matrix_expand/polyvec_matrix_expand_harness.c
rename to proofs/cbmc/polyvec_matrix_expand_eager/polyvec_matrix_expand_eager_harness.c
index 0365c6ac5..586ecbe49 100644
--- a/proofs/cbmc/polyvec_matrix_expand/polyvec_matrix_expand_harness.c
+++ b/proofs/cbmc/polyvec_matrix_expand_eager/polyvec_matrix_expand_eager_harness.c
@@ -1,7 +1,7 @@
 // Copyright (c) The mldsa-native project authors
 // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 
-#include "polyvec.h"
+#include "polyvec_lazy.h"
 
 void harness(void)
 {
diff --git a/proofs/cbmc/polyvec_matrix_expand_serial/Makefile b/proofs/cbmc/polyvec_matrix_expand_eager_serial/Makefile
similarity index 87%
rename from proofs/cbmc/polyvec_matrix_expand_serial/Makefile
rename to proofs/cbmc/polyvec_matrix_expand_eager_serial/Makefile
index f9261da6e..ed76b5840 100644
--- a/proofs/cbmc/polyvec_matrix_expand_serial/Makefile
+++ b/proofs/cbmc/polyvec_matrix_expand_eager_serial/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyvec_matrix_expand_harness
+HARNESS_FILE = polyvec_matrix_expand_eager_serial_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyvec_matrix_expand_serial
+PROOF_UID = polyvec_matrix_expand_eager_serial
 
 DEFINES += -DMLD_CONFIG_SERIAL_FIPS202_ONLY
 INCLUDES +=
@@ -17,9 +17,9 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
-CHECK_FUNCTION_CONTRACTS=mld_polyvec_matrix_expand
+CHECK_FUNCTION_CONTRACTS=mld_polyvec_matrix_expand_eager
 USE_FUNCTION_CONTRACTS=mld_poly_uniform
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
@@ -28,7 +28,7 @@ USE_DYNAMIC_FRAMES=1
 EXTERNAL_SAT_SOLVER=
 CBMCFLAGS=--smt2 --arrays-uf-always --slice-formula
 
-FUNCTION_NAME = polyvec_matrix_expand_serial
+FUNCTION_NAME = polyvec_matrix_expand_eager_serial
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
diff --git a/proofs/cbmc/polyvec_matrix_expand_serial/polyvec_matrix_expand_harness.c b/proofs/cbmc/polyvec_matrix_expand_eager_serial/polyvec_matrix_expand_eager_serial_harness.c
similarity index 88%
rename from proofs/cbmc/polyvec_matrix_expand_serial/polyvec_matrix_expand_harness.c
rename to proofs/cbmc/polyvec_matrix_expand_eager_serial/polyvec_matrix_expand_eager_serial_harness.c
index 0365c6ac5..586ecbe49 100644
--- a/proofs/cbmc/polyvec_matrix_expand_serial/polyvec_matrix_expand_harness.c
+++ b/proofs/cbmc/polyvec_matrix_expand_eager_serial/polyvec_matrix_expand_eager_serial_harness.c
@@ -1,7 +1,7 @@
 // Copyright (c) The mldsa-native project authors
 // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 
-#include "polyvec.h"
+#include "polyvec_lazy.h"
 
 void harness(void)
 {
diff --git a/proofs/cbmc/polyvec_matrix_pointwise_montgomery/Makefile b/proofs/cbmc/polyvec_matrix_pointwise_montgomery_eager/Makefile
similarity index 86%
rename from proofs/cbmc/polyvec_matrix_pointwise_montgomery/Makefile
rename to proofs/cbmc/polyvec_matrix_pointwise_montgomery_eager/Makefile
index 758ca18ba..0219f3829 100644
--- a/proofs/cbmc/polyvec_matrix_pointwise_montgomery/Makefile
+++ b/proofs/cbmc/polyvec_matrix_pointwise_montgomery_eager/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyvec_matrix_pointwise_montgomery_harness
+HARNESS_FILE = polyvec_matrix_pointwise_montgomery_eager_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyvec_matrix_pointwise_montgomery
+PROOF_UID = polyvec_matrix_pointwise_montgomery_eager
 
 DEFINES +=
 INCLUDES +=
@@ -17,10 +17,10 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
-CHECK_FUNCTION_CONTRACTS=mld_polyvec_matrix_pointwise_montgomery
-USE_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery
+CHECK_FUNCTION_CONTRACTS=mld_polyvec_matrix_pointwise_montgomery_eager
+USE_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery_eager
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
@@ -28,7 +28,7 @@ USE_DYNAMIC_FRAMES=1
 EXTERNAL_SAT_SOLVER=
 CBMCFLAGS=--smt2 --arrays-uf-always --slice-formula
 
-FUNCTION_NAME = polyvec_matrix_pointwise_montgomery
+FUNCTION_NAME = polyvec_matrix_pointwise_montgomery_eager
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
diff --git a/proofs/cbmc/polyvec_matrix_pointwise_montgomery/polyvec_matrix_pointwise_montgomery_harness.c b/proofs/cbmc/polyvec_matrix_pointwise_montgomery_eager/polyvec_matrix_pointwise_montgomery_eager_harness.c
similarity index 90%
rename from proofs/cbmc/polyvec_matrix_pointwise_montgomery/polyvec_matrix_pointwise_montgomery_harness.c
rename to proofs/cbmc/polyvec_matrix_pointwise_montgomery_eager/polyvec_matrix_pointwise_montgomery_eager_harness.c
index d967e10d8..b1002e8e4 100644
--- a/proofs/cbmc/polyvec_matrix_pointwise_montgomery/polyvec_matrix_pointwise_montgomery_harness.c
+++ b/proofs/cbmc/polyvec_matrix_pointwise_montgomery_eager/polyvec_matrix_pointwise_montgomery_eager_harness.c
@@ -1,7 +1,7 @@
 // Copyright (c) The mldsa-native project authors
 // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 
-#include "polyvec.h"
+#include "polyvec_lazy.h"
 
 void harness(void)
 {
diff --git a/proofs/cbmc/polyveck_add/polyveck_add_harness.c b/proofs/cbmc/polyveck_add/polyveck_add_harness.c
deleted file mode 100644
index 5adb38f5b..000000000
--- a/proofs/cbmc/polyveck_add/polyveck_add_harness.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "polyvec.h"
-
-void harness(void)
-{
-  mld_polyveck *r, *b;
-  mld_polyveck_add(r, b);
-}
diff --git a/proofs/cbmc/polyveck_pack_t0/polyveck_pack_t0_harness.c b/proofs/cbmc/polyveck_pack_t0/polyveck_pack_t0_harness.c
deleted file mode 100644
index dfa7c17f3..000000000
--- a/proofs/cbmc/polyveck_pack_t0/polyveck_pack_t0_harness.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "polyvec.h"
-
-void harness(void)
-{
-  mld_polyveck *a;
-  uint8_t *b;
-  mld_polyveck_pack_t0(b, a);
-}
diff --git a/proofs/cbmc/polyveck_power2round/Makefile b/proofs/cbmc/polyveck_power2round/Makefile
deleted file mode 100644
index a094c3254..000000000
--- a/proofs/cbmc/polyveck_power2round/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) The mldsa-native project authors
-# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-include ../Makefile_params.common
-
-HARNESS_ENTRY = harness
-HARNESS_FILE = polyveck_power2round_harness
-
-# This should be a unique identifier for this proof, and will appear on the
-# Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyveck_power2round
-
-DEFINES +=
-INCLUDES +=
-
-REMOVE_FUNCTION_BODY +=
-UNWINDSET +=
-
-PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
-
-CHECK_FUNCTION_CONTRACTS=mld_polyveck_power2round
-USE_FUNCTION_CONTRACTS=mld_poly_power2round
-APPLY_LOOP_CONTRACTS=on
-USE_DYNAMIC_FRAMES=1
-
-# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
-EXTERNAL_SAT_SOLVER=
-CBMCFLAGS=--smt2 --slice-formula
-
-FUNCTION_NAME = polyveck_power2round
-
-# If this proof is found to consume huge amounts of RAM, you can set the
-# EXPENSIVE variable. With new enough versions of the proof tools, this will
-# restrict the number of EXPENSIVE CBMC jobs running at once. See the
-# documentation in Makefile.common under the "Job Pools" heading for details.
-# EXPENSIVE = true
-
-# This function is large enough to need...
-CBMC_OBJECT_BITS = 10
-
-# If you require access to a file-local ("static") function or object to conduct
-# your proof, set the following (and do not include the original source file
-# ("mldsa/poly.c") in PROJECT_SOURCES).
-# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i
-# include ../Makefile.common
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mldsa/src/poly.c
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz
-# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must
-# be set before including Makefile.common, but any use of variables on the
-# left-hand side requires those variables to be defined. Hence, _SOURCE,
-# _FUNCTIONS, _OBJECTS is set after including Makefile.common.
-
-include ../Makefile.common
diff --git a/proofs/cbmc/polyveck_power2round/polyveck_power2round_harness.c b/proofs/cbmc/polyveck_power2round/polyveck_power2round_harness.c
deleted file mode 100644
index 3220f440d..000000000
--- a/proofs/cbmc/polyveck_power2round/polyveck_power2round_harness.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "polyvec.h"
-
-void harness(void)
-{
-  mld_polyveck *a, *b, *c;
-  mld_polyveck_power2round(a, b, c);
-}
diff --git a/proofs/cbmc/polyvecl_permute_bitrev_to_custom/Makefile b/proofs/cbmc/polyvecl_permute_bitrev_to_custom/Makefile
index 370d3571e..f7d51909f 100644
--- a/proofs/cbmc/polyvecl_permute_bitrev_to_custom/Makefile
+++ b/proofs/cbmc/polyvecl_permute_bitrev_to_custom/Makefile
@@ -17,7 +17,7 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
 CHECK_FUNCTION_CONTRACTS=mld_polyvecl_permute_bitrev_to_custom
 USE_FUNCTION_CONTRACTS=
diff --git a/proofs/cbmc/polyvecl_permute_bitrev_to_custom_native/Makefile b/proofs/cbmc/polyvecl_permute_bitrev_to_custom_native/Makefile
index 893a26dec..3f62b6f6e 100644
--- a/proofs/cbmc/polyvecl_permute_bitrev_to_custom_native/Makefile
+++ b/proofs/cbmc/polyvecl_permute_bitrev_to_custom_native/Makefile
@@ -18,7 +18,7 @@ UNWINDSET +=
 REMOVE_FUNCTION_BODY +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
 CHECK_FUNCTION_CONTRACTS=mld_polyvecl_permute_bitrev_to_custom
 USE_FUNCTION_CONTRACTS= mld_poly_permute_bitrev_to_custom
diff --git a/proofs/cbmc/polyvecl_pointwise_acc_montgomery/polyvecl_pointwise_acc_montgomery_harness.c b/proofs/cbmc/polyvecl_pointwise_acc_montgomery/polyvecl_pointwise_acc_montgomery_harness.c
deleted file mode 100644
index 927d5a220..000000000
--- a/proofs/cbmc/polyvecl_pointwise_acc_montgomery/polyvecl_pointwise_acc_montgomery_harness.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "polyvec.h"
-
-void harness(void)
-{
-  mld_poly *a;
-  mld_polyvecl *b, *c;
-  mld_polyvecl_pointwise_acc_montgomery(a, b, c);
-}
diff --git a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_c/Makefile b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_c/Makefile
index dfbbdab12..b003cac2d 100644
--- a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_c/Makefile
+++ b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_c/Makefile
@@ -17,7 +17,7 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
 CHECK_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery_c
 USE_FUNCTION_CONTRACTS=mld_montgomery_reduce
diff --git a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_native/Makefile b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_eager/Makefile
similarity index 89%
rename from proofs/cbmc/polyvecl_pointwise_acc_montgomery_native/Makefile
rename to proofs/cbmc/polyvecl_pointwise_acc_montgomery_eager/Makefile
index 402353746..42837daa7 100644
--- a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_native/Makefile
+++ b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_eager/Makefile
@@ -4,11 +4,11 @@
 include ../Makefile_params.common
 
 HARNESS_ENTRY = harness
-HARNESS_FILE = polyvecl_pointwise_acc_montgomery_native_harness
+HARNESS_FILE = polyvecl_pointwise_acc_montgomery_eager_harness
 
 # This should be a unique identifier for this proof, and will appear on the
 # Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = polyvecl_pointwise_acc_montgomery_native
+PROOF_UID = polyvecl_pointwise_acc_montgomery_eager
 
 DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_ARITH_BACKEND_FILE="\"dummy_backend.h\""
 INCLUDES +=
@@ -17,9 +17,9 @@ REMOVE_FUNCTION_BODY +=
 UNWINDSET +=
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec.c
+PROJECT_SOURCES += $(SRCDIR)/mldsa/src/polyvec_lazy.c
 
-CHECK_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery
+CHECK_FUNCTION_CONTRACTS=mld_polyvecl_pointwise_acc_montgomery_eager
 USE_FUNCTION_CONTRACTS= mld_polyvecl_pointwise_acc_montgomery_c
 ifeq ($(MLD_CONFIG_PARAMETER_SET),44)
 	USE_FUNCTION_CONTRACTS+=mld_polyvecl_pointwise_acc_montgomery_l4_native
@@ -37,7 +37,7 @@ CBMCFLAGS=--external-smt2-solver $(PROOF_ROOT)/lib/z3_smt_only --z3
 CBMCFLAGS += --slice-formula
 CBMCFLAGS += --no-array-field-sensitivity
 
-FUNCTION_NAME = polyvecl_pointwise_acc_montgomery_native
+FUNCTION_NAME = polyvecl_pointwise_acc_montgomery_eager
 
 # If this proof is found to consume huge amounts of RAM, you can set the
 # EXPENSIVE variable. With new enough versions of the proof tools, this will
diff --git a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_eager/polyvecl_pointwise_acc_montgomery_eager_harness.c b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_eager/polyvecl_pointwise_acc_montgomery_eager_harness.c
new file mode 100644
index 000000000..0f4d7e1fd
--- /dev/null
+++ b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_eager/polyvecl_pointwise_acc_montgomery_eager_harness.c
@@ -0,0 +1,13 @@
+// Copyright (c) The mldsa-native project authors
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+#include "polyvec_lazy.h"
+
+void harness(void)
+{
+  mld_poly *w;
+  mld_polymat_eager *mat;
+  unsigned int k;
+  mld_polyvecl *v;
+  mld_polyvecl_pointwise_acc_montgomery_eager(w, mat, k, v);
+}
diff --git a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_native/polyvecl_pointwise_acc_montgomery_native_harness.c b/proofs/cbmc/polyvecl_pointwise_acc_montgomery_native/polyvecl_pointwise_acc_montgomery_native_harness.c
deleted file mode 100644
index 927d5a220..000000000
--- a/proofs/cbmc/polyvecl_pointwise_acc_montgomery_native/polyvecl_pointwise_acc_montgomery_native_harness.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "polyvec.h"
-
-void harness(void)
-{
-  mld_poly *a;
-  mld_polyvecl *b, *c;
-  mld_polyvecl_pointwise_acc_montgomery(a, b, c);
-}
diff --git a/proofs/cbmc/sign_keypair_internal/Makefile b/proofs/cbmc/sign_keypair_internal/Makefile
index 83f79ba70..22af85e53 100644
--- a/proofs/cbmc/sign_keypair_internal/Makefile
+++ b/proofs/cbmc/sign_keypair_internal/Makefile
@@ -21,11 +21,14 @@ PROJECT_SOURCES += $(SRCDIR)/mldsa/src/sign.c
 
 CHECK_FUNCTION_CONTRACTS=mld_keypair_internal
 USE_FUNCTION_CONTRACTS=mld_shake256
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand
+USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand_eager
 USE_FUNCTION_CONTRACTS+=mld_sample_s1_s2
-USE_FUNCTION_CONTRACTS+=mld_compute_t0_t1_tr_from_sk_components
-USE_FUNCTION_CONTRACTS+=mld_pack_pk
-USE_FUNCTION_CONTRACTS+=mld_pack_sk
+USE_FUNCTION_CONTRACTS+=mld_pack_sk_s1
+USE_FUNCTION_CONTRACTS+=mld_pack_sk_rho_key_tr_s2
+USE_FUNCTION_CONTRACTS+=mld_pack_sk_t0
+USE_FUNCTION_CONTRACTS+=mld_pack_pk_t1
+USE_FUNCTION_CONTRACTS+=mld_polyvecl_ntt
+USE_FUNCTION_CONTRACTS+=mld_compute_t0k_t1k
 USE_FUNCTION_CONTRACTS+=mld_zeroize
 USE_FUNCTION_CONTRACTS+=mld_check_pct
 
diff --git a/proofs/cbmc/sign_pk_from_sk/Makefile b/proofs/cbmc/sign_pk_from_sk/Makefile
index b3f38c3b4..5e0e8d0f1 100644
--- a/proofs/cbmc/sign_pk_from_sk/Makefile
+++ b/proofs/cbmc/sign_pk_from_sk/Makefile
@@ -22,11 +22,14 @@ PROJECT_SOURCES += $(SRCDIR)/mldsa/src/sign.c
 CHECK_FUNCTION_CONTRACTS=mld_pk_from_sk
 USE_FUNCTION_CONTRACTS=mld_polyvecl_unpack_eta
 USE_FUNCTION_CONTRACTS+=mld_polyveck_unpack_eta
-USE_FUNCTION_CONTRACTS+=mld_polyveck_unpack_t0
-USE_FUNCTION_CONTRACTS+=mld_pack_pk
 USE_FUNCTION_CONTRACTS+=mld_polyvecl_chknorm
 USE_FUNCTION_CONTRACTS+=mld_polyveck_chknorm
-USE_FUNCTION_CONTRACTS+=mld_compute_t0_t1_tr_from_sk_components
+USE_FUNCTION_CONTRACTS+=mld_polyvecl_ntt
+USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand_eager
+USE_FUNCTION_CONTRACTS+=mld_compute_t0k_t1k
+USE_FUNCTION_CONTRACTS+=mld_polyt0_pack
+USE_FUNCTION_CONTRACTS+=mld_pack_pk_t1
+USE_FUNCTION_CONTRACTS+=mld_shake256
 USE_FUNCTION_CONTRACTS+=mld_value_barrier_u8
 USE_FUNCTION_CONTRACTS+=mld_ct_memcmp
 USE_FUNCTION_CONTRACTS+=mld_zeroize
diff --git a/proofs/cbmc/sign_signature_internal/Makefile b/proofs/cbmc/sign_signature_internal/Makefile
index c91e4b0ae..af32c83af 100644
--- a/proofs/cbmc/sign_signature_internal/Makefile
+++ b/proofs/cbmc/sign_signature_internal/Makefile
@@ -22,7 +22,7 @@ PROJECT_SOURCES += $(SRCDIR)/mldsa/src/sign.c
 CHECK_FUNCTION_CONTRACTS=mld_signature_internal
 USE_FUNCTION_CONTRACTS=mld_unpack_sk 
 USE_FUNCTION_CONTRACTS+=mld_H 
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand
+USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand_eager
 USE_FUNCTION_CONTRACTS+=mld_attempt_signature_generation
 USE_FUNCTION_CONTRACTS+=mld_zeroize
 
diff --git a/proofs/cbmc/sign_verify_internal/Makefile b/proofs/cbmc/sign_verify_internal/Makefile
index 2ca141a69..711c69455 100644
--- a/proofs/cbmc/sign_verify_internal/Makefile
+++ b/proofs/cbmc/sign_verify_internal/Makefile
@@ -25,9 +25,9 @@ USE_FUNCTION_CONTRACTS+=mld_unpack_sig
 USE_FUNCTION_CONTRACTS+=mld_polyvecl_chknorm
 USE_FUNCTION_CONTRACTS+=mld_H
 USE_FUNCTION_CONTRACTS+=mld_poly_challenge
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand
+USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_expand_eager
 USE_FUNCTION_CONTRACTS+=mld_polyvecl_ntt
-USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_pointwise_montgomery
+USE_FUNCTION_CONTRACTS+=mld_polyvec_matrix_pointwise_montgomery_eager
 USE_FUNCTION_CONTRACTS+=mld_poly_ntt
 USE_FUNCTION_CONTRACTS+=mld_polyveck_shiftl
 USE_FUNCTION_CONTRACTS+=mld_polyveck_ntt
diff --git a/proofs/cbmc/unpack_sk/Makefile b/proofs/cbmc/unpack_sk/Makefile
index e198bdd48..0f334ca6a 100644
--- a/proofs/cbmc/unpack_sk/Makefile
+++ b/proofs/cbmc/unpack_sk/Makefile
@@ -23,8 +23,8 @@ CHECK_FUNCTION_CONTRACTS=mld_unpack_sk
 USE_FUNCTION_CONTRACTS=mld_polyvecl_unpack_eta
 USE_FUNCTION_CONTRACTS+=mld_polyvecl_ntt
 USE_FUNCTION_CONTRACTS+=mld_polyveck_unpack_eta
-USE_FUNCTION_CONTRACTS+=mld_polyveck_ntt
 USE_FUNCTION_CONTRACTS+=mld_polyveck_unpack_t0
+USE_FUNCTION_CONTRACTS+=mld_polyveck_ntt
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c
index 6cf8627e5..04815943c 100644
--- a/test/bench/bench_components_mldsa.c
+++ b/test/bench/bench_components_mldsa.c
@@ -11,6 +11,7 @@
 #include "../../mldsa/src/poly.h"
 #include "../../mldsa/src/poly_kl.h"
 #include "../../mldsa/src/polyvec.h"
+#include "../../mldsa/src/polyvec_lazy.h"
 #include "../../mldsa/src/randombytes.h"
 #include "hal.h"
 
@@ -75,7 +76,7 @@ static int bench(void)
 
   /* pointwise */
   BENCH("polyvecl_pointwise_acc_montgomery",
-        mld_polyvecl_pointwise_acc_montgomery(&poly_out, &polyvecl_a,
+        mld_polyvecl_pointwise_acc_montgomery(&poly_out, &polymat, 0,
                                               &polyvecl_b))
   BENCH("polyvec_matrix_pointwise_montgomery",
         mld_polyvec_matrix_pointwise_montgomery(&polyveck_out, &polymat,
diff --git a/test/src/test_unit.c b/test/src/test_unit.c
index c22998b27..39a54f2d2 100644
--- a/test/src/test_unit.c
+++ b/test/src/test_unit.c
@@ -22,6 +22,8 @@
 #endif
 #endif /* !NUM_RANDOM_TESTS */
 
+#define NUM_RANDOM_TESTS_SLOW 50
+
 #define CHECK(x)                                              \
   do                                                          \
   {                                                           \
@@ -571,8 +573,12 @@ static int test_polyvecl_pointwise_acc_montgomery_core(const mld_polyvecl *u,
                                                        const char *test_name)
 {
   mld_poly test_w, ref_w;
+  mld_polymat_eager mat;
 
-  mld_polyvecl_pointwise_acc_montgomery(&test_w, u, v);
+  /* The eager wrapper takes (mat, k, v); place u at row 0 of a synthetic
+   * polymat and call it with k = 0. */
+  mat.vec[0] = *u;
+  mld_polyvecl_pointwise_acc_montgomery_eager(&test_w, &mat, 0, v);
   mld_polyvecl_pointwise_acc_montgomery_c(&ref_w, u, v);
 
   CHECK(compare_i32_arrays(test_w.coeffs, ref_w.coeffs, MLDSA_N, test_name,
@@ -765,10 +771,11 @@ static int test_backend_units(void)
 /* Test that eager and lazy polyvec init+get produce the same results */
 static int test_polyvec_lazy_eager(void)
 {
-  unsigned int i, t;
+  unsigned int i, j, t;
   uint8_t packed_s1[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES];
   uint8_t packed_s2[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES];
   uint8_t packed_t0[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES];
+  uint8_t rho[MLDSA_SEEDBYTES];
   mld_sk_s1hat_eager s1_eager;
   mld_sk_s1hat_lazy s1_lazy;
   mld_sk_s2hat_eager s2_eager;
@@ -776,8 +783,12 @@ static int test_polyvec_lazy_eager(void)
   mld_sk_t0hat_eager t0_eager;
   mld_sk_t0hat_lazy t0_lazy;
   mld_poly poly_eager, poly_lazy;
+  mld_polymat_eager mat_eager;
+  mld_polymat_lazy mat_lazy;
+  mld_polyvecl v;
+  mld_polyveck tk_eager, tk_lazy;
 
-  for (t = 0; t < NUM_RANDOM_TESTS; t++)
+  for (t = 0; t < NUM_RANDOM_TESTS_SLOW; t++)
   {
     /* Test s1vec: eager vs lazy */
     randombytes(packed_s1, sizeof(packed_s1));
@@ -816,6 +827,34 @@ static int test_polyvec_lazy_eager(void)
     }
   }
 
+  /* Test matrix expand + pointwise: eager vs lazy.
+   * Fewer iterations since matrix expand is expensive. */
+  for (t = 0; t < NUM_RANDOM_TESTS_SLOW; t++)
+  {
+    randombytes(rho, sizeof(rho));
+    mld_polyvec_matrix_expand_eager(&mat_eager, rho);
+    mld_polyvec_matrix_expand_lazy(&mat_lazy, rho);
+
+    randombytes((uint8_t *)&v, sizeof(v));
+    for (i = 0; i < MLDSA_L; i++)
+    {
+      for (j = 0; j < MLDSA_N; j++)
+      {
+        v.vec[i].coeffs[j] %= (MLD_NTT_BOUND - 1);
+      }
+    }
+
+    mld_polyvec_matrix_pointwise_montgomery_eager(&tk_eager, &mat_eager, &v);
+    mld_polyvec_matrix_pointwise_montgomery_lazy(&tk_lazy, &mat_lazy, &v);
+
+    /* Compare mod q */
+    mld_polyveck_reduce(&tk_eager);
+    mld_polyveck_reduce(&tk_lazy);
+    mld_polyveck_caddq(&tk_eager);
+    mld_polyveck_caddq(&tk_lazy);
+    CHECK(memcmp(&tk_eager, &tk_lazy, sizeof(mld_polyveck)) == 0);
+  }
+
   return 0;
 }