@@ -71,6 +71,8 @@ pub fn benchmarks<M: Measurement>(c: &mut Criterion<M>) {
7171 bench_state_vec_scaling ( c) ;
7272 bench_individual_gates ( c) ;
7373 bench_measurement_scaling ( c) ;
74+ bench_subset_measurement ( c) ;
75+ bench_flush_scaling ( c) ;
7476 #[ cfg( feature = "parallel" ) ]
7577 bench_parallel_execution ( c) ;
7678}
@@ -241,19 +243,20 @@ fn bench_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
241243 group. finish ( ) ;
242244}
243245
244- /// Benchmark measurement performance scaling across qubit counts.
245- /// Measures all qubits after applying H to each (maximum uncertainty).
246- /// This isolates the GPU measurement optimization (workgroup reduction vs full readback).
246+ /// Benchmark measurement performance: sequential (per-qubit) vs batch (all at once).
247+ ///
248+ /// Sequential: `for q in 0..n { sim.mz(&[QubitId(q)]); }` — 2n passes over state vector.
249+ /// Batch: `sim.mz(&all_qubits)` — uses joint sampling, 2 passes over state vector.
247250fn bench_measurement_scaling < M : Measurement > ( c : & mut Criterion < M > ) {
248251 let mut group = c. benchmark_group ( "Measurement Scaling" ) ;
249252 group. sample_size ( 20 ) ;
250253
251254 let qubit_counts = [ 10 , 14 , 18 , 20 , 22 ] ;
252255
253256 for & nq in & qubit_counts {
254- // CPU baseline: StateVec
257+ // Sequential: one mz() call per qubit (2n passes)
255258 group. bench_with_input (
256- BenchmarkId :: new ( "StateVec_CPU " , nq) ,
259+ BenchmarkId :: new ( "mz_sequential " , nq) ,
257260 & nq,
258261 |b, & nq| {
259262 let mut sim = StateVecSoA :: new ( nq) ;
@@ -269,7 +272,24 @@ fn bench_measurement_scaling<M: Measurement>(c: &mut Criterion<M>) {
269272 } ,
270273 ) ;
271274
272- // GPU: GpuStateVec (wgpu)
275+ // Batch: one mz() call with all qubits (2 passes via joint sampling)
276+ group. bench_with_input (
277+ BenchmarkId :: new ( "mz_batch" , nq) ,
278+ & nq,
279+ |b, & nq| {
280+ let mut sim = StateVecSoA :: new ( nq) ;
281+ let all_qubits: Vec < QubitId > = ( 0 ..nq) . map ( QubitId ) . collect ( ) ;
282+ b. iter ( || {
283+ sim. reset ( ) ;
284+ for q in 0 ..nq {
285+ sim. h ( & [ QubitId ( q) ] ) ;
286+ }
287+ black_box ( sim. mz ( & all_qubits) ) ;
288+ } ) ;
289+ } ,
290+ ) ;
291+
292+ // GPU (sequential per-qubit for comparison)
273293 #[ cfg( feature = "gpu-sims" ) ]
274294 {
275295 #[ allow( clippy:: cast_possible_truncation) ]
@@ -296,6 +316,81 @@ fn bench_measurement_scaling<M: Measurement>(c: &mut Criterion<M>) {
296316 group. finish ( ) ;
297317}
298318
319+ /// Benchmark subset measurement: measure half the qubits (even-indexed).
320+ /// Tests the mz_joint_subset path (QEC-realistic: measure ancillas, not data qubits).
321+ fn bench_subset_measurement < M : Measurement > ( c : & mut Criterion < M > ) {
322+ let mut group = c. benchmark_group ( "Subset Measurement" ) ;
323+ group. sample_size ( 20 ) ;
324+
325+ let qubit_counts = [ 10 , 14 , 18 , 20 , 22 ] ;
326+
327+ for & nq in & qubit_counts {
328+ let half: Vec < QubitId > = ( 0 ..nq) . step_by ( 2 ) . map ( QubitId ) . collect ( ) ;
329+ let half_count = half. len ( ) ;
330+
331+ // Sequential: one mz() per qubit
332+ group. bench_with_input (
333+ BenchmarkId :: new ( "mz_sequential" , format ! ( "{nq}q_{half_count}m" ) ) ,
334+ & nq,
335+ |b, & nq| {
336+ let mut sim = StateVecSoA :: new ( nq) ;
337+ b. iter ( || {
338+ sim. reset ( ) ;
339+ for q in 0 ..nq {
340+ sim. h ( & [ QubitId ( q) ] ) ;
341+ }
342+ for & q in & half {
343+ black_box ( sim. mz ( & [ q] ) ) ;
344+ }
345+ } ) ;
346+ } ,
347+ ) ;
348+
349+ // Batch: one mz() with all measured qubits
350+ group. bench_with_input (
351+ BenchmarkId :: new ( "mz_batch_subset" , format ! ( "{nq}q_{half_count}m" ) ) ,
352+ & nq,
353+ |b, & nq| {
354+ let mut sim = StateVecSoA :: new ( nq) ;
355+ b. iter ( || {
356+ sim. reset ( ) ;
357+ for q in 0 ..nq {
358+ sim. h ( & [ QubitId ( q) ] ) ;
359+ }
360+ black_box ( sim. mz ( & half) ) ;
361+ } ) ;
362+ } ,
363+ ) ;
364+ }
365+
366+ group. finish ( ) ;
367+ }
368+
369+ /// Benchmark flush performance: H on all qubits then flush.
370+ /// Isolates the cache-blocked flush optimization from measurement.
371+ fn bench_flush_scaling < M : Measurement > ( c : & mut Criterion < M > ) {
372+ let mut group = c. benchmark_group ( "Flush Scaling" ) ;
373+ group. sample_size ( 20 ) ;
374+
375+ let qubit_counts = [ 14 , 18 , 20 , 22 ] ;
376+
377+ for & nq in & qubit_counts {
378+ group. bench_with_input ( BenchmarkId :: new ( "h_all_flush" , nq) , & nq, |b, & nq| {
379+ let mut sim = StateVecSoA :: new ( nq) ;
380+ b. iter ( || {
381+ sim. reset ( ) ;
382+ for q in 0 ..nq {
383+ sim. h ( & [ QubitId ( q) ] ) ;
384+ }
385+ sim. flush ( ) ;
386+ black_box ( ( ) ) ;
387+ } ) ;
388+ } ) ;
389+ }
390+
391+ group. finish ( ) ;
392+ }
393+
299394/// Benchmark parallel vs sequential execution for large state vectors.
300395/// Only runs when the `parallel` feature is enabled on pecos-simulators.
301396#[ cfg( feature = "parallel" ) ]
0 commit comments