Skip to content

Commit 4787c14

Browse files
committed
Simulator optimization branch: GPU density matrix, perf, audits, shared context
Major work on branch (squash of 28 commits + final clippy cleanup):

GPU simulator optimizations
- f32 gate fusion, commuting-gate reorder, CY/SWAP/RXX/RYY direct shaders
- Persistent kernel for small states with dynamic shared-memory sizing
- CPU measurement fast path for small states (f32 <=16q, f64 <=15q)
- Parallel CX/CZ/RZZ/RXX/RYY via rayon when .parallel(true)
- Parallel scalar CX path for low-qubit pairs (q_lo < 2)
- Fused flush_gates + state() readback into single encoder
- Adaptive mz path selection: empirical N/M lookup table replacing hardcoded threshold
- Raised StateVecSoA parallel threshold 14 -> 21 qubits
- Exploration benchmarks for adaptive path decisions

GpuDensityMatrix
- Choi-Jamiolkowski representation on top of GpuStateVec
- Generic over backend (f32 / f64); gates, noise channels, helpers
- Cholesky re-purification for mixed states

Correctness fixes (from audits)
- GPU 2q rotation shaders (RXX/RYY bit_a==bit_b bug)
- DensityMatrix phase/amplitude damping trace preservation
- mz is_deterministic flag (previously hardcoded false)
- GpuPauliProp gate ordering + Pauli X/Y/Z semantics, stale buffer reads
- GpuDensityMatrix mz probability formula
- GpuStabMulti::mz_queue now snapshots state at call time

Shared GPU context (concurrent-use SIGSEGV fix)
- Process-wide OnceLock<GpuDeviceContext> in pecos-gpu-sims/src/gpu_probe.rs
- All 7 simulators now reuse one wgpu Instance/Adapter/Device/Queue
- Fixes crashes under cargo's parallel test harness and MonteCarloEngine shots
- Removed the --test-threads=1 workaround from pecos-cli rust test

Test infrastructure
- New audits: gate_audit, gate_fuzz, pauli_prop_audit, influence_sampler_audit, large_n_audit, noisy_sampler_stats, stab_extra_audits, extra_audits, flush_blocked_audit, concurrent_gpu_test
- Removed pecos-quest / pecos-qulacs wrapper crates (bench code only)
- Removed dangling quest_sim_test.rs and quest_example.rs

Clippy / lint cleanup
- GpuError::Startup variant wraps GpuStartupError
- Internal GPU constants now usize (casts removed)
- Renamed GatePipeline variants SWAP/RXX/RYY/RZZ -> Swap/Rxx/Ryy/Rzz
- # Panics / # Errors doc sections added where clippy required
- Cholesky loops allow needless_range_loop with justification
- Test files allow cast_possible_truncation / cast_precision_loss with rationale
1 parent a9a9d7a commit 4787c14

122 files changed

Lines changed: 15503 additions & 17386 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Cargo.lock

Lines changed: 121 additions & 159 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,6 @@ pecos-qis = { version = "0.2.0-dev.0", path = "crates/pecos-qis" }
192192
pecos-qis-ffi = { version = "0.2.0-dev.0", path = "crates/pecos-qis-ffi" }
193193
pecos-qis-ffi-types = { version = "0.2.0-dev.0", path = "crates/pecos-qis-ffi-types" }
194194
pecos-quantum = { version = "0.2.0-dev.0", path = "crates/pecos-quantum" }
195-
pecos-quest = { version = "0.2.0-dev.0", path = "crates/pecos-quest" }
196-
pecos-qulacs = { version = "0.2.0-dev.0", path = "crates/pecos-qulacs" }
197195
pecos-random = { version = "0.2.0-dev.0", path = "crates/pecos-random" }
198196
pecos-relay-bp = { version = "0.2.0-dev.0", path = "crates/pecos-relay-bp" }
199197
pecos-rslib = { version = "0.2.0-dev.0", path = "python/pecos-rslib" }

crates/benchmarks/Cargo.toml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,13 @@ default = []
1717
parallel = ["pecos-simulators/parallel"]
1818
gpu-sims = ["dep:pecos-gpu-sims"]
1919
cuquantum = ["dep:pecos-cuquantum"]
20-
quest = ["dep:pecos-quest"]
21-
qulacs = ["dep:pecos-qulacs"]
2220
cppsparsestab = ["dep:pecos-cppsparsestab"]
23-
all-sims = ["gpu-sims", "cuquantum", "quest", "qulacs", "cppsparsestab"]
21+
all-sims = ["gpu-sims", "cuquantum", "cppsparsestab"]
2422

2523
[dependencies]
2624
# Optional simulator dependencies for benchmarking
2725
pecos-gpu-sims = { workspace = true, optional = true }
2826
pecos-cuquantum = { workspace = true, optional = true }
29-
pecos-quest = { workspace = true, optional = true }
30-
pecos-qulacs = { workspace = true, optional = true }
3127
pecos-cppsparsestab = { workspace = true, optional = true }
3228
pecos-core.workspace = true
3329
pecos-simulators.workspace = true

crates/benchmarks/benches/modules/native_statevec_comparison.rs

Lines changed: 18 additions & 248 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,10 @@
1212

1313
//! Native state vector comparison benchmarks.
1414
//!
15-
//! Calls `QuEST` and Qulacs FFI directly (bypassing the PECOS wrapper layer's qubit index
16-
//! remapping, bounds checks, and `QubitId`/`Angle64` conversions) to give an apples-to-apples
17-
//! comparison of raw gate computation performance against the pure-Rust PECOS simulators.
18-
//!
19-
//! GPU simulators (`GpuStateVec` via wgpu, `CuStateVec` via cuQuantum) are included when their
20-
//! respective features (`gpu-sims`, `cuquantum`) are enabled.
15+
//! Compares raw gate computation performance across PECOS's internal state vector simulators
16+
//! (`StateVecSoA`, `StateVecSoA32`, `StateVecAoS`) at the trait layer, plus GPU simulators
17+
//! (`GpuStateVec32` via wgpu, `CuStateVec` via cuQuantum) when their respective features
18+
//! (`gpu-sims`, `cuquantum`) are enabled.
2119
2220
use criterion::{BenchmarkId, Criterion, measurement::Measurement};
2321
use pecos_core::{Angle64, QubitId};
@@ -27,14 +25,8 @@ use pecos_simulators::{
2725
};
2826
use std::hint::black_box;
2927

30-
#[cfg(feature = "quest")]
31-
use pecos_quest::bridge::ffi as quest_ffi;
32-
33-
#[cfg(feature = "qulacs")]
34-
use pecos_qulacs::bridge::ffi as qulacs_ffi;
35-
3628
#[cfg(feature = "gpu-sims")]
37-
use pecos_gpu_sims::{GpuStateVec, gates as gpu_gates};
29+
use pecos_gpu_sims::{GpuStateVec32, gates as gpu_gates};
3830

3931
#[cfg(feature = "cuquantum")]
4032
use pecos_cuquantum::CuStateVec;
@@ -60,80 +52,11 @@ fn pecos_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
6052
}
6153

6254
// ---------------------------------------------------------------------------
63-
// QuEST direct FFI helpers
64-
// ---------------------------------------------------------------------------
65-
66-
#[cfg(feature = "quest")]
67-
struct QuestState {
68-
env_ptr: *mut u8,
69-
qureg_ptr: *mut u8,
70-
}
71-
72-
#[cfg(feature = "quest")]
73-
impl QuestState {
74-
fn new(num_qubits: usize) -> Self {
75-
let env_ptr = quest_ffi::quest_create_env();
76-
assert!(!env_ptr.is_null(), "Failed to create QuEST environment");
77-
let qureg_ptr = unsafe { quest_ffi::quest_create_qureg(env_ptr, num_qubits as i32) };
78-
assert!(!qureg_ptr.is_null(), "Failed to create QuEST qureg");
79-
unsafe { quest_ffi::quest_init_zero_state(qureg_ptr) };
80-
Self { env_ptr, qureg_ptr }
81-
}
82-
}
83-
84-
#[cfg(feature = "quest")]
85-
impl Drop for QuestState {
86-
fn drop(&mut self) {
87-
unsafe {
88-
quest_ffi::quest_destroy_qureg(self.qureg_ptr);
89-
quest_ffi::quest_destroy_env(self.env_ptr);
90-
}
91-
}
92-
}
93-
94-
#[cfg(feature = "quest")]
95-
fn quest_circuit(qs: &QuestState, num_qubits: usize, num_layers: usize) {
96-
let qureg = qs.qureg_ptr;
97-
unsafe {
98-
for _layer in 0..num_layers {
99-
for q in 0..num_qubits {
100-
quest_ffi::quest_apply_hadamard(qureg, q as i32);
101-
quest_ffi::quest_apply_rotation_z(qureg, q as i32, 0.1);
102-
}
103-
for q in 0..num_qubits - 1 {
104-
quest_ffi::quest_apply_cnot(qureg, q as i32, (q + 1) as i32);
105-
}
106-
}
107-
}
108-
}
109-
110-
// ---------------------------------------------------------------------------
111-
// Qulacs direct FFI helpers
112-
// ---------------------------------------------------------------------------
113-
114-
#[cfg(feature = "qulacs")]
115-
fn qulacs_circuit(
116-
state: &mut cxx::UniquePtr<qulacs_ffi::QulacsState>,
117-
num_qubits: usize,
118-
num_layers: usize,
119-
) {
120-
for _layer in 0..num_layers {
121-
for q in 0..num_qubits {
122-
qulacs_ffi::csim_h(state.pin_mut(), q);
123-
qulacs_ffi::csim_rz(state.pin_mut(), q, 0.1);
124-
}
125-
for q in 0..num_qubits - 1 {
126-
qulacs_ffi::csim_cnot(state.pin_mut(), q, q + 1);
127-
}
128-
}
129-
}
130-
131-
// ---------------------------------------------------------------------------
132-
// GpuStateVec direct helpers (bypasses trait layer, calls wgpu dispatch directly)
55+
// GpuStateVec32 direct helpers (bypasses trait layer, calls wgpu dispatch directly)
13356
// ---------------------------------------------------------------------------
13457

13558
#[cfg(feature = "gpu-sims")]
136-
fn gpu_circuit(sim: &mut GpuStateVec, num_qubits: usize, num_layers: usize) {
59+
fn gpu_circuit(sim: &mut GpuStateVec32, num_qubits: usize, num_layers: usize) {
13760
let rz_matrix = gpu_gates::rz(0.1);
13861
for _layer in 0..num_layers {
13962
for q in 0..num_qubits {
@@ -290,46 +213,11 @@ fn bench_native_statevec_comparison<M: Measurement>(c: &mut Criterion<M>) {
290213
},
291214
);
292215

293-
// -- QuEST direct FFI --
294-
#[cfg(feature = "quest")]
295-
{
296-
let quest_name = "QuEST_direct";
297-
let qs = QuestState::new(num_qubits);
298-
group.bench_with_input(
299-
BenchmarkId::new(quest_name, &label),
300-
&(num_qubits, num_layers),
301-
|b, &(nq, nl)| {
302-
b.iter(|| {
303-
unsafe { quest_ffi::quest_init_zero_state(qs.qureg_ptr) };
304-
quest_circuit(&qs, nq, nl);
305-
black_box(());
306-
});
307-
},
308-
);
309-
}
310-
311-
// -- Qulacs direct FFI --
312-
#[cfg(feature = "qulacs")]
313-
{
314-
let mut state = qulacs_ffi::create_quantum_state(num_qubits);
315-
group.bench_with_input(
316-
BenchmarkId::new("Qulacs_direct", &label),
317-
&(num_qubits, num_layers),
318-
|b, &(nq, nl)| {
319-
b.iter(|| {
320-
qulacs_ffi::reset(state.pin_mut());
321-
qulacs_circuit(&mut state, nq, nl);
322-
black_box(());
323-
});
324-
},
325-
);
326-
}
327-
328-
// -- GpuStateVec direct (wgpu) --
216+
// -- GpuStateVec32 direct (wgpu) --
329217
#[cfg(feature = "gpu-sims")]
330-
if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
218+
if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
331219
group.bench_with_input(
332-
BenchmarkId::new("GpuStateVec_direct", &label),
220+
BenchmarkId::new("GpuStateVec32_direct", &label),
333221
&(num_qubits, num_layers),
334222
|b, &(nq, nl)| {
335223
b.iter(|| {
@@ -441,38 +329,9 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
441329
});
442330
});
443331

444-
#[cfg(feature = "quest")]
445-
{
446-
let quest_h_name = "H/QuEST_direct";
447-
group.bench_function(quest_h_name, |b| {
448-
let qs = QuestState::new(num_qubits);
449-
b.iter(|| {
450-
for _ in 0..iters {
451-
for q in 0..num_qubits {
452-
unsafe { quest_ffi::quest_apply_hadamard(qs.qureg_ptr, q as i32) };
453-
}
454-
}
455-
black_box(());
456-
});
457-
});
458-
}
459-
460-
#[cfg(feature = "qulacs")]
461-
group.bench_function("H/Qulacs_direct", |b| {
462-
let mut state = qulacs_ffi::create_quantum_state(num_qubits);
463-
b.iter(|| {
464-
for _ in 0..iters {
465-
for q in 0..num_qubits {
466-
qulacs_ffi::csim_h(state.pin_mut(), q);
467-
}
468-
}
469-
black_box(());
470-
});
471-
});
472-
473332
#[cfg(feature = "gpu-sims")]
474-
if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
475-
group.bench_function("H/GpuStateVec_direct", |b| {
333+
if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
334+
group.bench_function("H/GpuStateVec32_direct", |b| {
476335
b.iter(|| {
477336
for _ in 0..iters {
478337
for q in 0..num_qubits {
@@ -564,38 +423,9 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
564423
});
565424
});
566425

567-
#[cfg(feature = "quest")]
568-
{
569-
let quest_x_name = "X/QuEST_direct";
570-
group.bench_function(quest_x_name, |b| {
571-
let qs = QuestState::new(num_qubits);
572-
b.iter(|| {
573-
for _ in 0..iters {
574-
for q in 0..num_qubits {
575-
unsafe { quest_ffi::quest_apply_pauli_x(qs.qureg_ptr, q as i32) };
576-
}
577-
}
578-
black_box(());
579-
});
580-
});
581-
}
582-
583-
#[cfg(feature = "qulacs")]
584-
group.bench_function("X/Qulacs_direct", |b| {
585-
let mut state = qulacs_ffi::create_quantum_state(num_qubits);
586-
b.iter(|| {
587-
for _ in 0..iters {
588-
for q in 0..num_qubits {
589-
qulacs_ffi::csim_x(state.pin_mut(), q);
590-
}
591-
}
592-
black_box(());
593-
});
594-
});
595-
596426
#[cfg(feature = "gpu-sims")]
597-
if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
598-
group.bench_function("X/GpuStateVec_direct", |b| {
427+
if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
428+
group.bench_function("X/GpuStateVec32_direct", |b| {
599429
b.iter(|| {
600430
for _ in 0..iters {
601431
for q in 0..num_qubits {
@@ -687,40 +517,9 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
687517
});
688518
});
689519

690-
#[cfg(feature = "quest")]
691-
{
692-
let quest_cx_name = "CX/QuEST_direct";
693-
group.bench_function(quest_cx_name, |b| {
694-
let qs = QuestState::new(num_qubits);
695-
b.iter(|| {
696-
for _ in 0..iters {
697-
for q in 0..num_qubits - 1 {
698-
unsafe {
699-
quest_ffi::quest_apply_cnot(qs.qureg_ptr, q as i32, (q + 1) as i32);
700-
}
701-
}
702-
}
703-
black_box(());
704-
});
705-
});
706-
}
707-
708-
#[cfg(feature = "qulacs")]
709-
group.bench_function("CX/Qulacs_direct", |b| {
710-
let mut state = qulacs_ffi::create_quantum_state(num_qubits);
711-
b.iter(|| {
712-
for _ in 0..iters {
713-
for q in 0..num_qubits - 1 {
714-
qulacs_ffi::csim_cnot(state.pin_mut(), q, q + 1);
715-
}
716-
}
717-
black_box(());
718-
});
719-
});
720-
721520
#[cfg(feature = "gpu-sims")]
722-
if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
723-
group.bench_function("CX/GpuStateVec_direct", |b| {
521+
if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
522+
group.bench_function("CX/GpuStateVec32_direct", |b| {
724523
b.iter(|| {
725524
for _ in 0..iters {
726525
for q in 0..num_qubits - 1 {
@@ -812,39 +611,10 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
812611
});
813612
});
814613

815-
#[cfg(feature = "quest")]
816-
{
817-
let quest_rz_name = "RZ/QuEST_direct";
818-
group.bench_function(quest_rz_name, |b| {
819-
let qs = QuestState::new(num_qubits);
820-
b.iter(|| {
821-
for _ in 0..iters {
822-
for q in 0..num_qubits {
823-
unsafe { quest_ffi::quest_apply_rotation_z(qs.qureg_ptr, q as i32, 0.1) };
824-
}
825-
}
826-
black_box(());
827-
});
828-
});
829-
}
830-
831-
#[cfg(feature = "qulacs")]
832-
group.bench_function("RZ/Qulacs_direct", |b| {
833-
let mut state = qulacs_ffi::create_quantum_state(num_qubits);
834-
b.iter(|| {
835-
for _ in 0..iters {
836-
for q in 0..num_qubits {
837-
qulacs_ffi::csim_rz(state.pin_mut(), q, 0.1);
838-
}
839-
}
840-
black_box(());
841-
});
842-
});
843-
844614
#[cfg(feature = "gpu-sims")]
845-
if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
615+
if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
846616
let rz_matrix = gpu_gates::rz(0.1);
847-
group.bench_function("RZ/GpuStateVec_direct", |b| {
617+
group.bench_function("RZ/GpuStateVec32_direct", |b| {
848618
b.iter(|| {
849619
for _ in 0..iters {
850620
for q in 0..num_qubits {

0 commit comments

Comments
 (0)