PECOS-packages
diff --git a/crates/benchmarks/benches/modules/state_vec_sims.rs b/crates/benchmarks/benches/modules/state_vec_sims.rs
Lines changed: 101 additions & 6 deletions
diff --git a/crates/pecos-engines/src/quantum.rs b/crates/pecos-engines/src/quantum.rs
Lines changed: 40 additions & 10 deletions
@@ -71,6 +71,8 @@ pub fn benchmarks<M: Measurement>(c: &mut Criterion<M>) {
     bench_state_vec_scaling(c);
     bench_individual_gates(c);
     bench_measurement_scaling(c);
+    bench_subset_measurement(c);
+    bench_flush_scaling(c);
     #[cfg(feature = "parallel")]
     bench_parallel_execution(c);
 }
@@ -241,19 +243,20 @@ fn bench_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
     group.finish();
 }
 
-/// Benchmark measurement performance scaling across qubit counts.
-/// Measures all qubits after applying H to each (maximum uncertainty).
-/// This isolates the GPU measurement optimization (workgroup reduction vs full readback).
+/// Benchmark measurement performance: sequential (per-qubit) vs batch (all at once).
+///
+/// Sequential: `for q in 0..n { sim.mz(&[QubitId(q)]); }` — 2n passes over state vector.
+/// Batch: `sim.mz(&all_qubits)` — uses joint sampling, 2 passes over state vector.
 fn bench_measurement_scaling<M: Measurement>(c: &mut Criterion<M>) {
     let mut group = c.benchmark_group("Measurement Scaling");
     group.sample_size(20);
 
     let qubit_counts = [10, 14, 18, 20, 22];
 
     for &nq in &qubit_counts {
-        // CPU baseline: StateVec
+        // Sequential: one mz() call per qubit (2n passes)
         group.bench_with_input(
-            BenchmarkId::new("StateVec_CPU", nq),
+            BenchmarkId::new("mz_sequential", nq),
             &nq,
             |b, &nq| {
                 let mut sim = StateVecSoA::new(nq);
@@ -269,7 +272,24 @@ fn bench_measurement_scaling<M: Measurement>(c: &mut Criterion<M>) {
             },
         );
 
-        // GPU: GpuStateVec (wgpu)
+        // Batch: one mz() call with all qubits (2 passes via joint sampling)
+        group.bench_with_input(
+            BenchmarkId::new("mz_batch", nq),
+            &nq,
+            |b, &nq| {
+                let mut sim = StateVecSoA::new(nq);
+                let all_qubits: Vec<QubitId> = (0..nq).map(QubitId).collect();
+                b.iter(|| {
+                    sim.reset();
+                    for q in 0..nq {
+                        sim.h(&[QubitId(q)]);
+                    }
+                    black_box(sim.mz(&all_qubits));
+                });
+            },
+        );
+
+        // GPU (sequential per-qubit for comparison)
         #[cfg(feature = "gpu-sims")]
         {
             #[allow(clippy::cast_possible_truncation)]
@@ -296,6 +316,81 @@ fn bench_measurement_scaling<M: Measurement>(c: &mut Criterion<M>) {
     group.finish();
 }
 
+/// Benchmark subset measurement: measure only the even-indexed qubits (half the register).
+/// Intended to exercise the `mz_joint_subset` path (QEC-realistic: ancillas, not data qubits).
+fn bench_subset_measurement<M: Measurement>(c: &mut Criterion<M>) {
+    let mut group = c.benchmark_group("Subset Measurement");
+    group.sample_size(20);
+
+    let qubit_counts = [10, 14, 18, 20, 22];
+
+    for &nq in &qubit_counts {
+        let half: Vec<QubitId> = (0..nq).step_by(2).map(QubitId).collect();
+        let half_count = half.len();
+
+        // Sequential: one `mz()` call per measured qubit; benchmark ID is "<total>q_<measured>m"
+        group.bench_with_input(
+            BenchmarkId::new("mz_sequential", format!("{nq}q_{half_count}m")),
+            &nq,
+            |b, &nq| {
+                let mut sim = StateVecSoA::new(nq);
+                b.iter(|| { // reset + H layer run inside the timed closure as well
+                    sim.reset();
+                    for q in 0..nq {
+                        sim.h(&[QubitId(q)]);
+                    }
+                    for &q in &half {
+                        black_box(sim.mz(&[q]));
+                    }
+                });
+            },
+        );
+
+        // Batch: a single `mz()` call covering every measured qubit (same ID scheme as above)
+        group.bench_with_input(
+            BenchmarkId::new("mz_batch_subset", format!("{nq}q_{half_count}m")),
+            &nq,
+            |b, &nq| {
+                let mut sim = StateVecSoA::new(nq);
+                b.iter(|| {
+                    sim.reset();
+                    for q in 0..nq {
+                        sim.h(&[QubitId(q)]);
+                    }
+                    black_box(sim.mz(&half));
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Benchmark flush performance: apply H to every qubit, then call `flush()` once per iteration.
+/// Intended to isolate the cache-blocked flush optimization; no `mz` call is made here.
+fn bench_flush_scaling<M: Measurement>(c: &mut Criterion<M>) {
+    let mut group = c.benchmark_group("Flush Scaling");
+    group.sample_size(20);
+
+    let qubit_counts = [14, 18, 20, 22];
+
+    for &nq in &qubit_counts {
+        group.bench_with_input(BenchmarkId::new("h_all_flush", nq), &nq, |b, &nq| {
+            let mut sim = StateVecSoA::new(nq);
+            b.iter(|| { // reset + H layer run inside the timed closure as well
+                sim.reset();
+                for q in 0..nq {
+                    sim.h(&[QubitId(q)]);
+                }
+                sim.flush();
+                black_box(()); // NOTE(review): only `()` is passed, no simulator state — confirm intent
+            });
+        });
+    }
+
+    group.finish();
+}
+
 /// Benchmark parallel vs sequential execution for large state vectors.
 /// Only runs when the `parallel` feature is enabled on pecos-simulators.
 #[cfg(feature = "parallel")]
 
@@ -247,7 +247,11 @@ where
 
         let mut measurements: Vec<usize> = Vec::new();
 
-        for cmd in &batch {
+        // Use indexed iteration so we can batch consecutive MZ commands into
+        // one simulator call, enabling joint-sampling optimizations.
+        let mut cmd_idx = 0;
+        while cmd_idx < batch.len() {
+            let cmd = &batch[cmd_idx];
             match cmd.gate_type {
                 GateType::X => {
                     debug!("Processing X gate on qubits {:?}", cmd.qubits);
@@ -555,12 +559,24 @@ where
                     }
                 }
 
-                // TODO: Fix it so we have multiple result_ids or get rid of result ids...
+                // Batch consecutive MZ/MeasureLeaked commands into one simulator call.
+                // This enables joint-sampling optimizations (fewer state-vector passes).
                 GateType::MZ | GateType::MeasureLeaked => {
-                    debug!("Processing measurement on qubits {:?}", cmd.qubits);
-                    let meas_results = self.simulator.mz(&cmd.qubits);
+                    // Collect qubits from consecutive MZ/MeasureLeaked commands
+                    let mut mz_qubits: Vec<QubitId> = cmd.qubits.to_vec();
+                    while cmd_idx + 1 < batch.len()
+                        && matches!(
+                            batch[cmd_idx + 1].gate_type,
+                            GateType::MZ | GateType::MeasureLeaked
+                        )
+                    {
+                        cmd_idx += 1;
+                        mz_qubits.extend_from_slice(&batch[cmd_idx].qubits);
+                    }
+
+                    debug!("Processing batched measurement on {} qubits", mz_qubits.len());
+                    let meas_results = self.simulator.mz(&mz_qubits);
                     for meas_result in meas_results {
-                        // mz() outcome: true if projected to |1⟩, false if projected to |0⟩
                         measurements.push(usize::from(meas_result.outcome));
                     }
                 }
@@ -677,6 +693,7 @@ where
                     self.simulator.u2q(before, interaction, after, &pairs);
                 }
             }
+            cmd_idx += 1;
         }
 
         // Create a message with the measurement results
@@ -860,7 +877,9 @@ impl Engine for SparseStabEngine {
         let batch = message.quantum_ops()?;
         let mut measurements: Vec<usize> = Vec::new();
 
-        for cmd in &batch {
+        let mut cmd_idx = 0;
+        while cmd_idx < batch.len() {
+            let cmd = &batch[cmd_idx];
             match cmd.gate_type {
                 // Single-qubit Clifford gates
                 GateType::X
@@ -885,12 +904,22 @@ impl Engine for SparseStabEngine {
                 | GateType::SYYdg => {
                     self.process_two_qubit_gate(cmd.gate_type, &cmd.qubits);
                 }
-                // Special operations
+                // Batch consecutive MZ/MeasureLeaked commands into one simulator call
                 GateType::MZ | GateType::MeasureLeaked => {
-                    debug!("Processing measurement on qubits {:?}", cmd.qubits);
-                    let meas_results = self.simulator.mz(&cmd.qubits);
+                    let mut mz_qubits: Vec<QubitId> = cmd.qubits.to_vec();
+                    while cmd_idx + 1 < batch.len()
+                        && matches!(
+                            batch[cmd_idx + 1].gate_type,
+                            GateType::MZ | GateType::MeasureLeaked
+                        )
+                    {
+                        cmd_idx += 1;
+                        mz_qubits.extend_from_slice(&batch[cmd_idx].qubits);
+                    }
+
+                    debug!("Processing batched measurement on {} qubits", mz_qubits.len());
+                    let meas_results = self.simulator.mz(&mz_qubits);
                     for meas_result in meas_results {
-                        // mz() outcome: true if projected to |1⟩, false if projected to |0⟩
                         measurements.push(usize::from(meas_result.outcome));
                     }
                 }
@@ -987,6 +1016,7 @@ impl Engine for SparseStabEngine {
                     )));
                 }
             }
+            cmd_idx += 1;
         }
 
         // Create a message with the measurement results