diff --git a/Cargo.lock b/Cargo.lock
index ca49b3380..2b4094e7e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -121,7 +121,7 @@ version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -132,7 +132,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
 dependencies = [
  "anstyle",
  "once_cell_polyfill",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -347,15 +347,13 @@ dependencies = [
  "pecos-neo",
  "pecos-qec",
  "pecos-quantum",
- "pecos-quest",
- "pecos-qulacs",
  "pecos-random",
  "pecos-simulators",
  "quizx",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rand_xoshiro 0.8.0",
  "rapidhash",
- "wide 1.2.0",
+ "wide 1.3.0",
 ]
 
 [[package]]
@@ -575,9 +573,9 @@ dependencies = [
 
 [[package]]
 name = "capnp"
-version = "0.25.3"
+version = "0.25.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d1c82ec25a9501d60e22eef4be1b2c271769b5a96e224d0875baef28529cf30"
+checksum = "63da65e5e9ffc3b8f993d4ad222a548152549351a643f6b850a7773cb6ff2809"
 dependencies = [
  "embedded-io 0.7.1",
 ]
@@ -614,9 +612,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
 [[package]]
 name = "cc"
-version = "1.2.59"
+version = "1.2.60"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283"
+checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -749,9 +747,9 @@ dependencies = [
 
 [[package]]
 name = "clap_complete"
-version = "4.6.0"
+version = "4.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19c9f1dde76b736e3681f28cec9d5a61299cbaae0fce80a68e43724ad56031eb"
+checksum = "406e68b4de5c59cfb8f750a7cbd4d31ae153788b8352167c1e5f4fc26e8c91e9"
 dependencies = [
  "clap",
 ]
@@ -1201,7 +1199,7 @@ checksum = "b0f4697d190a142477b16aef7da8a99bfdc41e7e8b1687583c0d23a79c7afc1e"
 dependencies = [
  "cc",
  "codespan-reporting",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "proc-macro2",
  "quote",
  "scratch",
@@ -1216,7 +1214,7 @@ checksum = "d0956799fa8678d4c50eed028f2de1c0552ae183c76e976cf7ca8c4e36a7c328"
 dependencies = [
  "clap",
  "codespan-reporting",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "proc-macro2",
  "quote",
  "syn 2.0.117",
@@ -1234,7 +1232,7 @@ version = "1.0.194"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6acc6b5822b9526adfb4fc377b67128fdd60aac757cc4a741a6278603f763cf"
 dependencies = [
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "proc-macro2",
  "quote",
  "syn 2.0.117",
@@ -1471,7 +1469,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users 0.5.2",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -1646,14 +1644,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "fastrand"
-version = "2.4.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 
 [[package]]
 name = "filetime"
@@ -1947,7 +1945,7 @@ checksum = "19e16c5073773ccf057c282be832a59ee53ef5ff98db3aeff7f8314f52ffc196"
 dependencies = [
  "fnv",
  "hashbrown 0.16.1",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "stable_deref_trait",
 ]
 
@@ -2166,7 +2164,10 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
+ "allocator-api2",
+ "equivalent",
  "foldhash 0.1.5",
+ "rayon",
 ]
 
 [[package]]
@@ -2182,6 +2183,12 @@ dependencies = [
  "serde_core",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -2286,7 +2293,7 @@ dependencies = [
  "enum_dispatch",
  "html-escape",
  "hugr-model",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "ordered-float",
  "pastey",
@@ -2340,7 +2347,7 @@ dependencies = [
  "bumpalo",
  "capnp",
  "derive_more 2.1.1",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "ordered-float",
  "pest",
@@ -2402,15 +2409,14 @@ dependencies = [
 
 [[package]]
 name = "hyper-rustls"
-version = "0.27.7"
+version = "0.27.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+checksum = "c2b52f86d1d4bc0d6b4e6826d960b1b333217e07d36b882dca570a5e1c48895b"
 dependencies = [
  "http",
  "hyper",
  "hyper-util",
  "rustls",
- "rustls-pki-types",
  "tokio",
  "tokio-rustls",
  "tower-service",
@@ -2586,18 +2592,18 @@ checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
 dependencies = [
  "autocfg",
  "hashbrown 0.12.3",
- "rayon",
  "serde",
 ]
 
 [[package]]
 name = "indexmap"
-version = "2.13.1"
+version = "2.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
- "hashbrown 0.16.1",
+ "hashbrown 0.17.0",
+ "rayon",
  "serde",
  "serde_core",
 ]
@@ -2694,7 +2700,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
 dependencies = [
  "hermit-abi",
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2816,9 +2822,9 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.94"
+version = "0.3.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9"
+checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca"
 dependencies = [
  "cfg-if",
  "futures-util",
@@ -2938,14 +2944,14 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
 [[package]]
 name = "libredox"
-version = "0.1.15"
+version = "0.1.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08"
+checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
 dependencies = [
  "bitflags",
  "libc",
  "plain",
- "redox_syscall 0.7.3",
+ "redox_syscall 0.7.4",
 ]
 
 [[package]]
@@ -3128,7 +3134,7 @@ dependencies = [
  "half",
  "hashbrown 0.16.1",
  "hexf-parse",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "libm",
  "log",
  "num-traits",
@@ -3448,7 +3454,7 @@ checksum = "271638cd5fa9cca89c4c304675ca658efc4e64a66c716b7cfe1afb4b9611dbbc"
 dependencies = [
  "crc32fast",
  "hashbrown 0.16.1",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "memchr",
 ]
 
@@ -3609,8 +3615,6 @@ dependencies = [
  "pecos-qec",
  "pecos-qis",
  "pecos-quantum",
- "pecos-quest",
- "pecos-qulacs",
  "pecos-random",
  "pecos-simulators",
  "pecos-wasm",
@@ -3678,7 +3682,7 @@ dependencies = [
  "num-complex 0.4.6",
  "num-traits",
  "pecos-random",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rand_core 0.10.0",
  "rand_xoshiro 0.8.0",
  "serde",
@@ -3761,7 +3765,7 @@ dependencies = [
  "pecos-core",
  "pecos-random",
  "pecos-simulators",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rayon",
  "serde",
  "serde_json",
@@ -3775,8 +3779,8 @@ dependencies = [
  "pecos-quantum",
  "pecos-random",
  "pecos-simulators",
- "rand 0.10.0",
- "wide 1.2.0",
+ "rand 0.10.1",
+ "wide 1.3.0",
 ]
 
 [[package]]
@@ -3827,6 +3831,7 @@ dependencies = [
  "bytemuck",
  "env_logger",
  "log",
+ "num-complex 0.4.6",
  "paste",
  "pecos-core",
  "pecos-qec",
@@ -3834,7 +3839,7 @@ dependencies = [
  "pecos-random",
  "pecos-simulators",
  "pollster",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rand_core 0.10.0",
  "serde_json",
  "wgpu",
@@ -3850,7 +3855,7 @@ dependencies = [
  "pecos-core",
  "pecos-engines",
  "pecos-quantum",
- "pecos-quest",
+ "pecos-simulators",
  "pecos-wasm",
  "serde_json",
  "tempfile",
@@ -3890,7 +3895,7 @@ dependencies = [
  "ndarray 0.17.2",
  "pecos-build",
  "pecos-decoder-core",
- "rand 0.10.0",
+ "rand 0.10.1",
  "thiserror 2.0.18",
 ]
 
@@ -3921,7 +3926,7 @@ dependencies = [
  "pecos-random",
  "pecos-simulators",
  "proptest",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rand_core 0.10.0",
  "rayon",
  "smallvec",
@@ -3939,7 +3944,7 @@ dependencies = [
  "num-complex 0.4.6",
  "num-traits",
  "pecos-random",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rustworkx-core",
  "serde",
  "serde_json",
@@ -3999,7 +4004,7 @@ dependencies = [
  "pecos-decoder-core",
  "pecos-random",
  "petgraph 0.8.3",
- "rand 0.10.0",
+ "rand 0.10.1",
  "thiserror 2.0.18",
 ]
 
@@ -4034,12 +4039,12 @@ dependencies = [
  "pecos-quantum",
  "pecos-random",
  "pecos-simulators",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rand_core 0.10.0",
  "rayon",
  "smallvec",
  "thiserror 2.0.18",
- "wide 1.2.0",
+ "wide 1.3.0",
 ]
 
 [[package]]
@@ -4061,7 +4066,7 @@ dependencies = [
  "pecos-qis-ffi",
  "pecos-qis-ffi-types",
  "pecos-random",
- "rand 0.10.0",
+ "rand 0.10.1",
  "selene-simple-runtime",
  "selene-soft-rz-runtime",
  "serde_json",
@@ -4099,56 +4104,16 @@ dependencies = [
  "tket",
 ]
 
-[[package]]
-name = "pecos-quest"
-version = "0.2.0-dev.0"
-dependencies = [
- "cxx",
- "cxx-build",
- "dirs",
- "env_logger",
- "libloading 0.9.0",
- "log",
- "num-complex 0.4.6",
- "pecos-build",
- "pecos-core",
- "pecos-engines",
- "pecos-num",
- "pecos-random",
- "pecos-simulators",
- "rand 0.10.0",
- "rand_core 0.10.0",
- "thiserror 2.0.18",
-]
-
-[[package]]
-name = "pecos-qulacs"
-version = "0.2.0-dev.0"
-dependencies = [
- "cc",
- "cxx",
- "cxx-build",
- "env_logger",
- "log",
- "num-complex 0.4.6",
- "pecos-build",
- "pecos-core",
- "pecos-random",
- "pecos-simulators",
- "rand 0.10.0",
- "rand_core 0.10.0",
-]
-
 [[package]]
 name = "pecos-random"
 version = "0.2.0-dev.0"
 dependencies = [
- "rand 0.10.0",
+ "rand 0.10.1",
  "rand_core 0.10.0",
  "rand_xoshiro 0.8.0",
  "random_tester",
  "rapidhash",
- "wide 1.2.0",
+ "wide 1.3.0",
 ]
 
 [[package]]
@@ -4191,13 +4156,11 @@ dependencies = [
  "pecos-qec",
  "pecos-qis",
  "pecos-quantum",
- "pecos-quest",
- "pecos-qulacs",
  "pecos-random",
  "pecos-simulators",
  "pecos-wasm",
  "pyo3",
- "rand 0.10.0",
+ "rand 0.10.1",
  "serde_json",
  "tempfile",
 ]
@@ -4277,12 +4240,11 @@ dependencies = [
  "paste",
  "pecos-core",
  "pecos-quantum",
- "pecos-quest",
  "pecos-random",
- "rand 0.10.0",
+ "rand 0.10.1",
  "rayon",
  "smallvec",
- "wide 1.2.0",
+ "wide 1.3.0",
 ]
 
 [[package]]
@@ -4379,7 +4341,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
 dependencies = [
  "fixedbitset 0.4.2",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
 ]
 
 [[package]]
@@ -4390,7 +4352,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
 dependencies = [
  "fixedbitset 0.5.7",
  "hashbrown 0.15.5",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "serde",
 ]
 
@@ -4411,9 +4373,9 @@ checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
 
 [[package]]
 name = "pkg-config"
-version = "0.3.32"
+version = "0.3.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
 
 [[package]]
 name = "plain"
@@ -4599,7 +4561,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "93980406f12d9f8140ed5abe7155acb10bb1e69ea55c88960b9c2f117445ef96"
 dependencies = [
  "equivalent",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "serde",
 ]
 
@@ -4650,7 +4612,7 @@ dependencies = [
  "bit-vec 0.8.0",
  "bitflags",
  "num-traits",
- "rand 0.9.2",
+ "rand 0.9.3",
  "rand_chacha 0.9.0",
  "rand_xorshift",
  "regex-syntax 0.8.10",
@@ -4800,7 +4762,7 @@ dependencies = [
  "bytes",
  "getrandom 0.3.4",
  "lru-slab",
- "rand 0.9.2",
+ "rand 0.9.3",
  "ring",
  "rustc-hash 2.1.2",
  "rustls",
@@ -4895,9 +4857,9 @@ dependencies = [
 
 [[package]]
 name = "rand"
-version = "0.9.2"
+version = "0.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166"
 dependencies = [
  "rand_chacha 0.9.0",
  "rand_core 0.9.5",
@@ -4905,9 +4867,9 @@ dependencies = [
 
 [[package]]
 name = "rand"
-version = "0.10.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8"
+checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
 dependencies = [
  "chacha20",
  "getrandom 0.4.2",
@@ -4966,7 +4928,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
 dependencies = [
  "num-traits",
- "rand 0.9.2",
+ "rand 0.9.3",
 ]
 
 [[package]]
@@ -5093,9 +5055,9 @@ dependencies = [
 
 [[package]]
 name = "redox_syscall"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
+checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a"
 dependencies = [
  "bitflags",
 ]
@@ -5401,14 +5363,14 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "rustls"
-version = "0.23.37"
+version = "0.23.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
+checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21"
 dependencies = [
  "aws-lc-rs",
  "once_cell",
@@ -5458,7 +5420,7 @@ dependencies = [
  "security-framework",
  "security-framework-sys",
  "webpki-root-certs",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5469,9 +5431,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
 
 [[package]]
 name = "rustls-webpki"
-version = "0.103.10"
+version = "0.103.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
+checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4"
 dependencies = [
  "aws-lc-rs",
  "ring",
@@ -5493,13 +5455,13 @@ checksum = "aaeee6f84153fd6f62507fc22bfe9499c8485075b44186dcbb918166ef75116f"
 dependencies = [
  "fixedbitset 0.5.7",
  "foldhash 0.1.5",
- "hashbrown 0.14.5",
- "indexmap 1.9.3",
+ "hashbrown 0.15.5",
+ "indexmap 2.14.0",
  "ndarray 0.16.1",
  "num-traits",
  "petgraph 0.8.3",
  "priority-queue 2.7.0",
- "rand 0.9.2",
+ "rand 0.9.3",
  "rand_distr",
  "rand_pcg",
  "rayon",
@@ -5759,7 +5721,7 @@ dependencies = [
  "chrono",
  "hex",
  "indexmap 1.9.3",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "schemars 0.9.0",
  "schemars 1.2.1",
  "serde_core",
@@ -5907,7 +5869,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
 dependencies = [
  "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -6107,7 +6069,7 @@ dependencies = [
  "getrandom 0.4.2",
  "once_cell",
  "rustix",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -6268,7 +6230,7 @@ dependencies = [
  "fxhash",
  "hugr",
  "hugr-core",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "lazy_static",
  "num-rational",
@@ -6313,7 +6275,7 @@ dependencies = [
  "derive_more 2.1.1",
  "hugr",
  "hugr-core",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "itertools 0.14.0",
  "lazy_static",
  "serde",
@@ -6326,9 +6288,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.51.0"
+version = "1.51.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd"
+checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c"
 dependencies = [
  "bytes",
  "libc",
@@ -6354,7 +6316,7 @@ version = "1.1.2+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee"
 dependencies = [
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "serde_core",
  "serde_spanned",
  "toml_datetime",
@@ -6374,11 +6336,11 @@ dependencies = [
 
 [[package]]
 name = "toml_edit"
-version = "0.25.10+spec-1.1.0"
+version = "0.25.11+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a82418ca169e235e6c399a84e395ab6debeb3bc90edc959bf0f48647c6a32d1b"
+checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b"
 dependencies = [
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "toml_datetime",
  "toml_parser",
  "winnow",
@@ -6679,9 +6641,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0"
+checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
 dependencies = [
  "cfg-if",
  "once_cell",
@@ -6692,9 +6654,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.67"
+version = "0.4.68"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e"
+checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -6702,9 +6664,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be"
+checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -6712,9 +6674,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2"
+checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
 dependencies = [
  "bumpalo",
  "proc-macro2",
@@ -6725,9 +6687,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.117"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b"
+checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
 dependencies = [
  "unicode-ident",
 ]
@@ -6769,7 +6731,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
 dependencies = [
  "anyhow",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "wasm-encoder 0.244.0",
  "wasmparser 0.244.0",
 ]
@@ -6782,7 +6744,7 @@ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
 dependencies = [
  "bitflags",
  "hashbrown 0.15.5",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "semver",
 ]
 
@@ -6794,7 +6756,7 @@ checksum = "4f08c9adee0428b7bddf3890fc27e015ac4b761cc608c822667102b8bfd6995e"
 dependencies = [
  "bitflags",
  "hashbrown 0.16.1",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "semver",
  "serde",
 ]
@@ -6806,7 +6768,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "71cde4757396defafd25417cfb36aa3161027d06d865b0c24baaae229aac005d"
 dependencies = [
  "bitflags",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "semver",
 ]
 
@@ -6871,7 +6833,7 @@ dependencies = [
  "cranelift-entity",
  "gimli",
  "hashbrown 0.16.1",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "log",
  "object",
  "postcard",
@@ -7027,9 +6989,9 @@ checksum = "323f4da9523e9a669e1eaf9c6e763892769b1d38c623913647bfdc1532fe4549"
 
 [[package]]
 name = "web-sys"
-version = "0.3.94"
+version = "0.3.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a"
+checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -7098,7 +7060,7 @@ dependencies = [
  "cfg_aliases",
  "document-features",
  "hashbrown 0.16.1",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "log",
  "naga",
  "once_cell",
@@ -7233,9 +7195,9 @@ dependencies = [
 
 [[package]]
 name = "wide"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "198f6abc41fab83526d10880fa5c17e2b4ee44e763949b4bb34e2fd1e8ca48e4"
+checksum = "c9479f84a757f819cfab37295955906479181395de83add28f74975fde083141"
 dependencies = [
  "bytemuck",
  "safe_arch 1.0.0",
@@ -7263,7 +7225,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -7641,7 +7603,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
 dependencies = [
  "anyhow",
  "heck 0.5.0",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "prettyplease",
  "syn 2.0.117",
  "wasm-metadata",
@@ -7672,7 +7634,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
 dependencies = [
  "anyhow",
  "bitflags",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "log",
  "serde",
  "serde_derive",
@@ -7691,7 +7653,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
 dependencies = [
  "anyhow",
  "id-arena",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "log",
  "semver",
  "serde",
@@ -7861,7 +7823,7 @@ dependencies = [
  "crossbeam-utils",
  "displaydoc",
  "flate2",
- "indexmap 2.13.1",
+ "indexmap 2.14.0",
  "memchr",
  "thiserror 2.0.18",
  "zopfli",
diff --git a/Cargo.toml b/Cargo.toml
index 7ea294015..7e2bb9394 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -192,8 +192,6 @@ pecos-qis = { version = "0.2.0-dev.0", path = "crates/pecos-qis" }
 pecos-qis-ffi = { version = "0.2.0-dev.0", path = "crates/pecos-qis-ffi" }
 pecos-qis-ffi-types = { version = "0.2.0-dev.0", path = "crates/pecos-qis-ffi-types" }
 pecos-quantum = { version = "0.2.0-dev.0", path = "crates/pecos-quantum" }
-pecos-quest = { version = "0.2.0-dev.0", path = "crates/pecos-quest" }
-pecos-qulacs = { version = "0.2.0-dev.0", path = "crates/pecos-qulacs" }
 pecos-random = { version = "0.2.0-dev.0", path = "crates/pecos-random" }
 pecos-relay-bp = { version = "0.2.0-dev.0", path = "crates/pecos-relay-bp" }
 pecos-rslib = { version = "0.2.0-dev.0", path = "python/pecos-rslib" }
diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml
index d0b88e648..260bd9666 100644
--- a/crates/benchmarks/Cargo.toml
+++ b/crates/benchmarks/Cargo.toml
@@ -17,17 +17,13 @@ default = []
 parallel = ["pecos-simulators/parallel"]
 gpu-sims = ["dep:pecos-gpu-sims"]
 cuquantum = ["dep:pecos-cuquantum"]
-quest = ["dep:pecos-quest"]
-qulacs = ["dep:pecos-qulacs"]
 cppsparsestab = ["dep:pecos-cppsparsestab"]
-all-sims = ["gpu-sims", "cuquantum", "quest", "qulacs", "cppsparsestab"]
+all-sims = ["gpu-sims", "cuquantum", "cppsparsestab"]
 
 [dependencies]
 # Optional simulator dependencies for benchmarking
 pecos-gpu-sims = { workspace = true, optional = true }
 pecos-cuquantum = { workspace = true, optional = true }
-pecos-quest = { workspace = true, optional = true }
-pecos-qulacs = { workspace = true, optional = true }
 pecos-cppsparsestab = { workspace = true, optional = true }
 pecos-core.workspace = true
 pecos-simulators.workspace = true
diff --git a/crates/benchmarks/benches/modules/native_statevec_comparison.rs b/crates/benchmarks/benches/modules/native_statevec_comparison.rs
index 2d714f3f4..eedf8ee47 100644
--- a/crates/benchmarks/benches/modules/native_statevec_comparison.rs
+++ b/crates/benchmarks/benches/modules/native_statevec_comparison.rs
@@ -12,12 +12,10 @@
 
 //! Native state vector comparison benchmarks.
 //!
-//! Calls `QuEST` and Qulacs FFI directly (bypassing the PECOS wrapper layer's qubit index
-//! remapping, bounds checks, and `QubitId`/`Angle64` conversions) to give an apples-to-apples
-//! comparison of raw gate computation performance against the pure-Rust PECOS simulators.
-//!
-//! GPU simulators (`GpuStateVec` via wgpu, `CuStateVec` via cuQuantum) are included when their
-//! respective features (`gpu-sims`, `cuquantum`) are enabled.
+//! Compares raw gate computation performance across PECOS's internal state vector simulators
+//! (`StateVecSoA`, `StateVecSoA32`, `StateVecAoS`) at the trait layer, plus GPU simulators
+//! (`GpuStateVec32` via wgpu, `CuStateVec` via cuQuantum) when their respective features
+//! (`gpu-sims`, `cuquantum`) are enabled.
 
 use criterion::{BenchmarkId, Criterion, measurement::Measurement};
 use pecos_core::{Angle64, QubitId};
@@ -27,14 +25,8 @@ use pecos_simulators::{
 };
 use std::hint::black_box;
 
-#[cfg(feature = "quest")]
-use pecos_quest::bridge::ffi as quest_ffi;
-
-#[cfg(feature = "qulacs")]
-use pecos_qulacs::bridge::ffi as qulacs_ffi;
-
 #[cfg(feature = "gpu-sims")]
-use pecos_gpu_sims::{GpuStateVec, gates as gpu_gates};
+use pecos_gpu_sims::{GpuStateVec32, gates as gpu_gates};
 
 #[cfg(feature = "cuquantum")]
 use pecos_cuquantum::CuStateVec;
@@ -60,80 +52,11 @@ fn pecos_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
 }
 
 // ---------------------------------------------------------------------------
-// QuEST direct FFI helpers
-// ---------------------------------------------------------------------------
-
-#[cfg(feature = "quest")]
-struct QuestState {
-    env_ptr: *mut u8,
-    qureg_ptr: *mut u8,
-}
-
-#[cfg(feature = "quest")]
-impl QuestState {
-    fn new(num_qubits: usize) -> Self {
-        let env_ptr = quest_ffi::quest_create_env();
-        assert!(!env_ptr.is_null(), "Failed to create QuEST environment");
-        let qureg_ptr = unsafe { quest_ffi::quest_create_qureg(env_ptr, num_qubits as i32) };
-        assert!(!qureg_ptr.is_null(), "Failed to create QuEST qureg");
-        unsafe { quest_ffi::quest_init_zero_state(qureg_ptr) };
-        Self { env_ptr, qureg_ptr }
-    }
-}
-
-#[cfg(feature = "quest")]
-impl Drop for QuestState {
-    fn drop(&mut self) {
-        unsafe {
-            quest_ffi::quest_destroy_qureg(self.qureg_ptr);
-            quest_ffi::quest_destroy_env(self.env_ptr);
-        }
-    }
-}
-
-#[cfg(feature = "quest")]
-fn quest_circuit(qs: &QuestState, num_qubits: usize, num_layers: usize) {
-    let qureg = qs.qureg_ptr;
-    unsafe {
-        for _layer in 0..num_layers {
-            for q in 0..num_qubits {
-                quest_ffi::quest_apply_hadamard(qureg, q as i32);
-                quest_ffi::quest_apply_rotation_z(qureg, q as i32, 0.1);
-            }
-            for q in 0..num_qubits - 1 {
-                quest_ffi::quest_apply_cnot(qureg, q as i32, (q + 1) as i32);
-            }
-        }
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Qulacs direct FFI helpers
-// ---------------------------------------------------------------------------
-
-#[cfg(feature = "qulacs")]
-fn qulacs_circuit(
-    state: &mut cxx::UniquePtr<qulacs_ffi::QulacsState>,
-    num_qubits: usize,
-    num_layers: usize,
-) {
-    for _layer in 0..num_layers {
-        for q in 0..num_qubits {
-            qulacs_ffi::csim_h(state.pin_mut(), q);
-            qulacs_ffi::csim_rz(state.pin_mut(), q, 0.1);
-        }
-        for q in 0..num_qubits - 1 {
-            qulacs_ffi::csim_cnot(state.pin_mut(), q, q + 1);
-        }
-    }
-}
-
-// ---------------------------------------------------------------------------
-// GpuStateVec direct helpers (bypasses trait layer, calls wgpu dispatch directly)
+// GpuStateVec32 direct helpers (bypasses trait layer, calls wgpu dispatch directly)
 // ---------------------------------------------------------------------------
 
 #[cfg(feature = "gpu-sims")]
-fn gpu_circuit(sim: &mut GpuStateVec, num_qubits: usize, num_layers: usize) {
+fn gpu_circuit(sim: &mut GpuStateVec32, num_qubits: usize, num_layers: usize) {
     let rz_matrix = gpu_gates::rz(0.1);
     for _layer in 0..num_layers {
         for q in 0..num_qubits {
@@ -290,46 +213,11 @@ fn bench_native_statevec_comparison<M: Measurement>(c: &mut Criterion<M>) {
             },
         );
 
-        // -- QuEST direct FFI --
-        #[cfg(feature = "quest")]
-        {
-            let quest_name = "QuEST_direct";
-            let qs = QuestState::new(num_qubits);
-            group.bench_with_input(
-                BenchmarkId::new(quest_name, &label),
-                &(num_qubits, num_layers),
-                |b, &(nq, nl)| {
-                    b.iter(|| {
-                        unsafe { quest_ffi::quest_init_zero_state(qs.qureg_ptr) };
-                        quest_circuit(&qs, nq, nl);
-                        black_box(());
-                    });
-                },
-            );
-        }
-
-        // -- Qulacs direct FFI --
-        #[cfg(feature = "qulacs")]
-        {
-            let mut state = qulacs_ffi::create_quantum_state(num_qubits);
-            group.bench_with_input(
-                BenchmarkId::new("Qulacs_direct", &label),
-                &(num_qubits, num_layers),
-                |b, &(nq, nl)| {
-                    b.iter(|| {
-                        qulacs_ffi::reset(state.pin_mut());
-                        qulacs_circuit(&mut state, nq, nl);
-                        black_box(());
-                    });
-                },
-            );
-        }
-
-        // -- GpuStateVec direct (wgpu) --
+        // -- GpuStateVec32 direct (wgpu) --
         #[cfg(feature = "gpu-sims")]
-        if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
+        if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
             group.bench_with_input(
-                BenchmarkId::new("GpuStateVec_direct", &label),
+                BenchmarkId::new("GpuStateVec32_direct", &label),
                 &(num_qubits, num_layers),
                 |b, &(nq, nl)| {
                     b.iter(|| {
@@ -441,38 +329,9 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
         });
     });
 
-    #[cfg(feature = "quest")]
-    {
-        let quest_h_name = "H/QuEST_direct";
-        group.bench_function(quest_h_name, |b| {
-            let qs = QuestState::new(num_qubits);
-            b.iter(|| {
-                for _ in 0..iters {
-                    for q in 0..num_qubits {
-                        unsafe { quest_ffi::quest_apply_hadamard(qs.qureg_ptr, q as i32) };
-                    }
-                }
-                black_box(());
-            });
-        });
-    }
-
-    #[cfg(feature = "qulacs")]
-    group.bench_function("H/Qulacs_direct", |b| {
-        let mut state = qulacs_ffi::create_quantum_state(num_qubits);
-        b.iter(|| {
-            for _ in 0..iters {
-                for q in 0..num_qubits {
-                    qulacs_ffi::csim_h(state.pin_mut(), q);
-                }
-            }
-            black_box(());
-        });
-    });
-
     #[cfg(feature = "gpu-sims")]
-    if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
-        group.bench_function("H/GpuStateVec_direct", |b| {
+    if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
+        group.bench_function("H/GpuStateVec32_direct", |b| {
             b.iter(|| {
                 for _ in 0..iters {
                     for q in 0..num_qubits {
@@ -564,38 +423,9 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
         });
     });
 
-    #[cfg(feature = "quest")]
-    {
-        let quest_x_name = "X/QuEST_direct";
-        group.bench_function(quest_x_name, |b| {
-            let qs = QuestState::new(num_qubits);
-            b.iter(|| {
-                for _ in 0..iters {
-                    for q in 0..num_qubits {
-                        unsafe { quest_ffi::quest_apply_pauli_x(qs.qureg_ptr, q as i32) };
-                    }
-                }
-                black_box(());
-            });
-        });
-    }
-
-    #[cfg(feature = "qulacs")]
-    group.bench_function("X/Qulacs_direct", |b| {
-        let mut state = qulacs_ffi::create_quantum_state(num_qubits);
-        b.iter(|| {
-            for _ in 0..iters {
-                for q in 0..num_qubits {
-                    qulacs_ffi::csim_x(state.pin_mut(), q);
-                }
-            }
-            black_box(());
-        });
-    });
-
     #[cfg(feature = "gpu-sims")]
-    if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
-        group.bench_function("X/GpuStateVec_direct", |b| {
+    if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
+        group.bench_function("X/GpuStateVec32_direct", |b| {
             b.iter(|| {
                 for _ in 0..iters {
                     for q in 0..num_qubits {
@@ -687,40 +517,9 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
         });
     });
 
-    #[cfg(feature = "quest")]
-    {
-        let quest_cx_name = "CX/QuEST_direct";
-        group.bench_function(quest_cx_name, |b| {
-            let qs = QuestState::new(num_qubits);
-            b.iter(|| {
-                for _ in 0..iters {
-                    for q in 0..num_qubits - 1 {
-                        unsafe {
-                            quest_ffi::quest_apply_cnot(qs.qureg_ptr, q as i32, (q + 1) as i32);
-                        }
-                    }
-                }
-                black_box(());
-            });
-        });
-    }
-
-    #[cfg(feature = "qulacs")]
-    group.bench_function("CX/Qulacs_direct", |b| {
-        let mut state = qulacs_ffi::create_quantum_state(num_qubits);
-        b.iter(|| {
-            for _ in 0..iters {
-                for q in 0..num_qubits - 1 {
-                    qulacs_ffi::csim_cnot(state.pin_mut(), q, q + 1);
-                }
-            }
-            black_box(());
-        });
-    });
-
     #[cfg(feature = "gpu-sims")]
-    if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
-        group.bench_function("CX/GpuStateVec_direct", |b| {
+    if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
+        group.bench_function("CX/GpuStateVec32_direct", |b| {
             b.iter(|| {
                 for _ in 0..iters {
                     for q in 0..num_qubits - 1 {
@@ -812,39 +611,10 @@ fn bench_native_individual_gates<M: Measurement>(c: &mut Criterion<M>) {
         });
     });
 
-    #[cfg(feature = "quest")]
-    {
-        let quest_rz_name = "RZ/QuEST_direct";
-        group.bench_function(quest_rz_name, |b| {
-            let qs = QuestState::new(num_qubits);
-            b.iter(|| {
-                for _ in 0..iters {
-                    for q in 0..num_qubits {
-                        unsafe { quest_ffi::quest_apply_rotation_z(qs.qureg_ptr, q as i32, 0.1) };
-                    }
-                }
-                black_box(());
-            });
-        });
-    }
-
-    #[cfg(feature = "qulacs")]
-    group.bench_function("RZ/Qulacs_direct", |b| {
-        let mut state = qulacs_ffi::create_quantum_state(num_qubits);
-        b.iter(|| {
-            for _ in 0..iters {
-                for q in 0..num_qubits {
-                    qulacs_ffi::csim_rz(state.pin_mut(), q, 0.1);
-                }
-            }
-            black_box(());
-        });
-    });
-
     #[cfg(feature = "gpu-sims")]
-    if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
+    if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
         let rz_matrix = gpu_gates::rz(0.1);
-        group.bench_function("RZ/GpuStateVec_direct", |b| {
+        group.bench_function("RZ/GpuStateVec32_direct", |b| {
             b.iter(|| {
                 for _ in 0..iters {
                     for q in 0..num_qubits {
diff --git a/crates/benchmarks/benches/modules/state_vec_sims.rs b/crates/benchmarks/benches/modules/state_vec_sims.rs
index 163d14b42..bbbab3859 100644
--- a/crates/benchmarks/benches/modules/state_vec_sims.rs
+++ b/crates/benchmarks/benches/modules/state_vec_sims.rs
@@ -13,18 +13,14 @@
 //! State vector simulator benchmarks comparing GPU and CPU implementations.
 //!
 //! Compares performance of:
-//! - `GpuStateVec` (GPU via wgpu/Vulkan/Metal/DX12)
+//! - `StateVecSoA` (pecos-simulators pure Rust CPU baseline)
+//! - `GpuStateVec32` (GPU via wgpu/Vulkan/Metal/DX12)
 //! - `CuStateVec` (GPU via NVIDIA cuQuantum/CUDA)
-//! - `QuestStateVec` (`QuEST` - CPU or CUDA)
-//! - `QulacsStateVec` (Qulacs - CPU)
-//! - `StateVec` (pecos-simulators pure Rust CPU)
 //!
 //! Run with specific features:
 //! ```
-//! cargo bench -p benchmarks --features gpu-sims        # GpuStateVec only
+//! cargo bench -p benchmarks --features gpu-sims        # GpuStateVec32 only
 //! cargo bench -p benchmarks --features cuquantum       # CuStateVec (NVIDIA CUDA)
-//! cargo bench -p benchmarks --features quest            # QuEST (CPU + CUDA if available at runtime)
-//! cargo bench -p benchmarks --features all-sims        # All simulators
 //! ```
 
 use criterion::{BenchmarkId, Criterion, measurement::Measurement};
@@ -35,20 +31,11 @@ use pecos_simulators::{
 use std::hint::black_box;
 
 #[cfg(feature = "gpu-sims")]
-use pecos_gpu_sims::GpuStateVec;
+use pecos_gpu_sims::GpuStateVec32;
 
 #[cfg(feature = "cuquantum")]
 use pecos_cuquantum::CuStateVec;
 
-#[cfg(feature = "quest")]
-use pecos_quest::QuestCudaStateVecEngine;
-
-#[cfg(feature = "quest")]
-use pecos_quest::QuestStateVec;
-
-#[cfg(feature = "qulacs")]
-use pecos_qulacs::QulacsStateVec;
-
 /// Run a benchmark circuit: layers of H + RZ + CX gates.
 fn benchmark_circuit<S>(sim: &mut S, num_qubits: usize, num_layers: usize)
 where
@@ -285,10 +272,10 @@ fn bench_measurement_scaling<M: Measurement>(c: &mut Criterion<M>) {
         #[cfg(feature = "gpu-sims")]
         {
             #[allow(clippy::cast_possible_truncation)]
-            if let Ok(mut sim) = GpuStateVec::new(nq as u32) {
+            if let Ok(mut sim) = GpuStateVec32::new(nq as u32) {
                 group.bench_with_input(BenchmarkId::new("GpuStateVec_wgpu", nq), &nq, |b, &nq| {
                     b.iter(|| {
-                        sim.reset();
+                        GpuStateVec32::reset(&mut sim);
                         for q in 0..nq {
                             sim.h(&[QubitId(q)]);
                         }
@@ -491,13 +478,13 @@ fn bench_state_vec_scaling<M: Measurement>(c: &mut Criterion<M>) {
         {
             // Safe: num_qubits comes from configs array with small values (10-22)
             #[allow(clippy::cast_possible_truncation)]
-            if let Ok(mut sim) = GpuStateVec::new(num_qubits as u32) {
+            if let Ok(mut sim) = GpuStateVec32::new(num_qubits as u32) {
                 group.bench_with_input(
                     BenchmarkId::new("GpuStateVec_wgpu", &label),
                     &(num_qubits, num_layers),
                     |b, &(nq, nl)| {
                         b.iter(|| {
-                            sim.reset();
+                            GpuStateVec32::reset(&mut sim);
                             benchmark_circuit(&mut sim, nq, nl);
                             black_box(());
                         });
@@ -528,76 +515,6 @@ fn bench_state_vec_scaling<M: Measurement>(c: &mut Criterion<M>) {
                 }
             }
         }
-
-        // Benchmark QuEST (CPU mode)
-        #[cfg(feature = "quest")]
-        {
-            let mut sim = QuestStateVec::new(num_qubits);
-            group.bench_with_input(
-                BenchmarkId::new("QuestStateVec_CPU", &label),
-                &(num_qubits, num_layers),
-                |b, &(nq, nl)| {
-                    b.iter(|| {
-                        sim.reset();
-                        benchmark_circuit(&mut sim, nq, nl);
-                        black_box(());
-                    });
-                },
-            );
-        }
-
-        // NOTE: QuEST CUDA benchmarks are disabled in the loop due to a QuEST bug:
-        // 1. QuEST CUDA only supports ONE qureg at a time
-        // 2. After destroying a qureg, subsequent qureg creation fails
-        // The CUDA benchmark is run separately below for a single configuration.
-
-        // Benchmark Qulacs
-        #[cfg(feature = "qulacs")]
-        {
-            let mut sim = QulacsStateVec::new(num_qubits);
-            group.bench_with_input(
-                BenchmarkId::new("QulacsStateVec_CPU", &label),
-                &(num_qubits, num_layers),
-                |b, &(nq, nl)| {
-                    b.iter(|| {
-                        sim.reset();
-                        benchmark_circuit(&mut sim, nq, nl);
-                        black_box(());
-                    });
-                },
-            );
-        }
-    }
-
-    // QuEST CUDA benchmark - run separately due to QuEST bugs:
-    // 1. Only one qureg can exist at a time
-    // 2. After destroying a qureg, subsequent creations fail
-    // 3. Creating quregs with 12+ qubits fails (QuEST CUDA configuration limit?)
-    // We run a single configuration (10 qubits) to compare against CPU implementations.
-    #[cfg(feature = "quest")]
-    {
-        let cuda_config = (10, 20); // 10 qubits, 20 layers - max reliable size
-        let (num_qubits, num_layers) = cuda_config;
-        let label = format!("{num_qubits}q_{num_layers}l");
-
-        match QuestCudaStateVecEngine::new(num_qubits) {
-            Ok(mut sim) => {
-                group.bench_with_input(
-                    BenchmarkId::new("QuestCuda_GPU", &label),
-                    &(num_qubits, num_layers),
-                    |b, &(nq, nl)| {
-                        b.iter(|| {
-                            sim.reset();
-                            benchmark_circuit(&mut sim, nq, nl);
-                            black_box(());
-                        });
-                    },
-                );
-            }
-            Err(e) => {
-                eprintln!("Warning: Failed to create QuestCudaStateVecEngine: {e}");
-            }
-        }
     }
 
     group.finish();
diff --git a/crates/pecos-cli/src/cli/rust_cmd.rs b/crates/pecos-cli/src/cli/rust_cmd.rs
index e219888f7..f1e6828a1 100644
--- a/crates/pecos-cli/src/cli/rust_cmd.rs
+++ b/crates/pecos-cli/src/cli/rust_cmd.rs
@@ -382,8 +382,6 @@ fn run_test(release: bool, include_ffi: bool) -> Result<()> {
     }
 
     args.extend(&[
-        "--exclude",
-        "pecos-quest",
         "--exclude",
         "pecos-cuquantum", // Requires cuQuantum SDK, test separately if available
         "--exclude",
@@ -400,15 +398,6 @@ fn run_test(release: bool, include_ffi: bool) -> Result<()> {
         return Err(Error::Config("cargo test (workspace) failed".to_string()));
     }
 
-    println!("Testing pecos-quest...");
-    let mut args = vec!["test", "-p", "pecos-quest", "--all-features"];
-    if !release_flag.is_empty() {
-        args.push(release_flag);
-    }
-    if !run_cargo_command(&args) {
-        return Err(Error::Config("cargo test (pecos-quest) failed".to_string()));
-    }
-
     // Test cuQuantum if SDK is available (requires both CUDA and cuQuantum)
     if probe_cuquantum_availability() {
         println!("cuQuantum runtime available - testing pecos-cuquantum");
@@ -425,7 +414,6 @@ fn run_test(release: bool, include_ffi: bool) -> Result<()> {
         println!("cuQuantum runtime not available - skipping pecos-cuquantum");
     }
 
-    // Test GPU simulator if GPU is available
     if include_gpu_sims {
         println!("Including pecos-gpu-sims in Rust tests");
         let mut args = vec!["test", "-p", "pecos-gpu-sims"];
diff --git a/crates/pecos-cuquantum/src/statevec.rs b/crates/pecos-cuquantum/src/statevec.rs
index 575f7b0eb..57d5e1b64 100644
--- a/crates/pecos-cuquantum/src/statevec.rs
+++ b/crates/pecos-cuquantum/src/statevec.rs
@@ -159,6 +159,15 @@ impl CuStateVec {
         Ok(())
     }
 
+    /// Block until all GPU work submitted so far has completed.
+    ///
+    /// Call this before reading a timer: GPU operations are asynchronous, so
+    /// without it a measurement captures only dispatch overhead, not execution
+    /// time. Whatever `cudaDeviceSynchronize` returns is ignored here.
+    pub fn sync(&self) {
+        unsafe { (self.backend.cudaDeviceSynchronize)() };
+    }
+
     /// Get the number of qubits
     #[must_use]
     pub fn num_qubits(&self) -> usize {
diff --git a/crates/pecos-gpu-sims/Cargo.toml b/crates/pecos-gpu-sims/Cargo.toml
index 2601b71a0..b1e1905b1 100644
--- a/crates/pecos-gpu-sims/Cargo.toml
+++ b/crates/pecos-gpu-sims/Cargo.toml
@@ -23,6 +23,7 @@ rand.workspace = true
 rand_core.workspace = true
 log.workspace = true
 serde_json.workspace = true
+num-complex.workspace = true
 
 # PECOS integration
 pecos-core.workspace = true
diff --git a/crates/pecos-gpu-sims/README.md b/crates/pecos-gpu-sims/README.md
index e728faee2..1c6ba8077 100644
--- a/crates/pecos-gpu-sims/README.md
+++ b/crates/pecos-gpu-sims/README.md
@@ -50,6 +50,34 @@ let result = sim.mz(0);               // Measure
 
 If no GPU is available, `GpuStateVec::new()` returns `Err(GpuError::NoAdapter)`. Use a CPU-based simulator like `StateVec` as a fallback.
 
+### Precision (f64 vs f32)
+
+`GpuStateVec` aliases `GpuStateVec64` (double-precision, canonical). This requires the `SHADER_F64` GPU feature. On adapters without f64 support -- notably Metal on Apple Silicon -- `GpuStateVec::new()` returns `Err(GpuError::UnsupportedFeature(RequiredFeature::ShaderF64))` (displayed as `SHADER_F64`). Use `GpuStateVec32` for a universally portable f32 backend (state about half the size, ~1e-7 rounding vs ~1e-15 for f64):
+
+```rust
+use pecos_gpu_sims::{GpuStateVec, GpuStateVec32, GpuError};
+
+// Try f64 (canonical), fall back to f32 on adapters without SHADER_F64.
+match GpuStateVec::new(4) {
+    Ok(sim) => { /* use f64 sim */ }
+    Err(GpuError::UnsupportedFeature(_)) => {
+        let sim = GpuStateVec32::new(4)?; // f32 works on Metal, DX12, Vulkan
+        /* use f32 sim */
+    }
+    Err(e) => return Err(e.into()),
+}
+```
+
+If you don't care about precision and just want *some* GPU state vector, use the opt-in `GpuStateVecAuto` wrapper, which tries f64 first and falls back to f32 automatically:
+
+```rust
+use pecos_gpu_sims::GpuStateVecAuto;
+
+let mut sim = GpuStateVecAuto::new(4)?; // f64 where available, else f32
+// sim implements the standard gate traits (CliffordGateable, ArbitraryRotationGateable).
+// Query sim.is_f64() if you need to know which backend was selected.
+```
+
 ## Development
 
 ### Current Optimizations
diff --git a/crates/pecos-gpu-sims/examples/bench_pauli_prop_flush.rs b/crates/pecos-gpu-sims/examples/bench_pauli_prop_flush.rs
new file mode 100644
index 000000000..0b1dd0d29
--- /dev/null
+++ b/crates/pecos-gpu-sims/examples/bench_pauli_prop_flush.rs
@@ -0,0 +1,44 @@
+//! Quick wall-time benchmark for `GpuPauliProp` flush throughput.
+//!
+//! Used to verify that removing the inter-gate poll didn't regress flush
+//! cost; expect dispatches/sec to scale with circuit length, not block on
+//! a poll between gates.
+
+use pecos_gpu_sims::GpuPauliProp;
+use std::time::Instant;
+
+fn main() {
+    let n_qubits = 32usize;
+    let n_shots = 1024u32;
+    let n_iters = 500; // number of timed loop iterations
+
+    let Ok(mut prop) = GpuPauliProp::with_seed(n_qubits, n_shots, 42) else {
+        eprintln!("no GPU; skipping benchmark");
+        return;
+    };
+
+    // Warm-up (pipelines, allocators, etc.) issued before the timer is started.
+    for _ in 0..50 {
+        prop.h(&[0]);
+    }
+    prop.sync(); // called before the timer starts; presumably waits for queued warm-up work
+
+    let start = Instant::now();
+    for _ in 0..n_iters {
+        // 7 dispatches per iter: 4-qubit H, 2-pair CX, 1-pair CX.
+        prop.h(&[0, 1, 2, 3]);
+        prop.cx(&[(0, 1), (2, 3)]);
+        prop.cx(&[(1, 2)]);
+    }
+    prop.sync(); // called before `elapsed` is read, so any wait here is part of the measurement
+    let elapsed = start.elapsed();
+
+    let total = u64::try_from(n_iters * 7).unwrap(); // 7 = dispatches per iteration
+    #[allow(clippy::cast_precision_loss)] // bench output, not numerically critical
+    let us_per = (elapsed.as_micros() as f64) / (total as f64);
+    #[allow(clippy::cast_precision_loss)]
+    let dispatch_per_s = (total as f64) / elapsed.as_secs_f64();
+    println!(
+        "{total} dispatches in {elapsed:?} -> {us_per:.2} us/dispatch ({dispatch_per_s:.0} dispatch/s)"
+    );
+}
diff --git a/crates/pecos-gpu-sims/src/bin/gpu_check.rs b/crates/pecos-gpu-sims/src/bin/gpu_check.rs
index bd693b8cb..1391857c7 100644
--- a/crates/pecos-gpu-sims/src/bin/gpu_check.rs
+++ b/crates/pecos-gpu-sims/src/bin/gpu_check.rs
@@ -10,7 +10,7 @@
 
 use pecos_core::{QubitId, qid};
 use pecos_gpu_sims::GpuStabMulti;
-use pecos_gpu_sims::gpu_probe::{GpuAdapterInfo, GpuStartupError, request_default_gpu_device};
+use pecos_gpu_sims::gpu_probe::{GpuAdapterInfo, GpuStartupError, gpu_context};
 use pecos_random::PecosRng;
 use serde_json::json;
 use std::process::ExitCode;
@@ -71,7 +71,7 @@ fn main() -> ExitCode {
     let quiet = args.iter().any(|arg| arg == "-q" || arg == "--quiet");
     let json_output = args.iter().any(|arg| arg == "-j" || arg == "--json");
 
-    match request_default_gpu_device("gpu-check device") {
+    match gpu_context() {
         Ok(gpu) => match run_simulator_smoke_test() {
             Ok(()) => {
                 if json_output {
diff --git a/crates/pecos-gpu-sims/src/gpu.rs b/crates/pecos-gpu-sims/src/gpu.rs
index defaa6b3b..b31327751 100644
--- a/crates/pecos-gpu-sims/src/gpu.rs
+++ b/crates/pecos-gpu-sims/src/gpu.rs
@@ -6,27 +6,46 @@ use rand::RngExt;
 use std::borrow::Cow;
 
 use crate::gates;
+use crate::gpu_probe::{GpuStartupError, gpu_context};
 
 /// Alignment for uniform buffer offsets (wgpu minimum is typically 256 bytes)
-const UNIFORM_ALIGNMENT: u64 = 256;
+const UNIFORM_ALIGNMENT: usize = 256;
 
 /// Maximum number of gates that can be batched in a single submission
-const MAX_BATCH_SIZE: u64 = 256;
+const MAX_BATCH_SIZE: usize = 256;
 
 /// Size of `GateParams` struct (padded to alignment)
-const ALIGNED_GATE_PARAMS_SIZE: u64 = UNIFORM_ALIGNMENT;
+const ALIGNED_GATE_PARAMS_SIZE: usize = UNIFORM_ALIGNMENT;
+
+/// A wgpu feature that a simulator may require.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RequiredFeature {
+    /// Double-precision shaders (Vulkan `shaderFloat64`). Required by
+    /// [`crate::GpuStateVec64`]. Not available on Metal / Apple Silicon.
+    ShaderF64,
+}
+
+impl std::fmt::Display for RequiredFeature {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RequiredFeature::ShaderF64 => write!(f, "SHADER_F64"),
+        }
+    }
+}
 
 /// Error type for GPU operations
 #[derive(Debug)]
 pub enum GpuError {
     /// No suitable GPU adapter found
     NoAdapter,
-    /// Failed to create device
-    DeviceCreation(wgpu::RequestDeviceError),
+    /// Shared GPU startup failed (adapter or device creation via `gpu_context`)
+    Startup(GpuStartupError),
     /// Buffer mapping failed
     BufferMap(wgpu::BufferAsyncError),
     /// Too many qubits for available memory
     TooManyQubits { requested: u32, max: u32 },
+    /// Required GPU feature unavailable on this adapter
+    UnsupportedFeature(RequiredFeature),
 }
 
 impl std::fmt::Display for GpuError {
@@ -34,20 +53,32 @@ impl std::fmt::Display for GpuError {
         match self {
             GpuError::NoAdapter => write!(
                 f,
-                "No GPU adapter found. GpuStateVec requires a GPU with Vulkan, Metal, or DX12 support. \
+                "No GPU adapter found. GpuStateVec32 requires a GPU with Vulkan, Metal, or DX12 support. \
                  Check GPU availability with `gpu-check` or use a CPU-based simulator instead (e.g., StateVec)."
             ),
-            GpuError::DeviceCreation(e) => write!(f, "Failed to create GPU device: {e}"),
+            GpuError::Startup(e) => write!(f, "GPU startup failed: {e}"),
             GpuError::BufferMap(e) => write!(f, "Buffer mapping failed: {e}"),
             GpuError::TooManyQubits { requested, max } => {
                 write!(f, "Too many qubits: {requested} requested, max {max}")
             }
+            GpuError::UnsupportedFeature(feat) => {
+                write!(f, "GPU does not support required feature: {feat}")
+            }
         }
     }
 }
 
 impl std::error::Error for GpuError {}
 
+impl From<GpuStartupError> for GpuError {
+    fn from(err: GpuStartupError) -> Self {
+        match err {
+            GpuStartupError::NoAdapter => GpuError::NoAdapter, // maps onto the dedicated variant
+            GpuStartupError::DeviceCreation { .. } => GpuError::Startup(err), // wrapped unchanged
+        }
+    }
+}
+
 /// Parameters for single-qubit gate (matches WGSL struct)
 #[repr(C)]
 #[derive(Clone, Copy, Pod, Zeroable)]
@@ -73,13 +104,34 @@ struct MeasureParams {
     _padding: u32,
 }
 
+/// Which compute pipeline a queued gate should use.
+#[derive(Clone, Copy, PartialEq, Eq)]
+enum GatePipeline {
+    Single,
+    Diagonal,
+    CX,
+    CY,
+    CZ,
+    Swap,
+    Rxx,
+    Ryy,
+    Rzz,
+}
+
+/// A gate waiting in the CPU-side queue until the next flush.
+#[derive(Clone)]
+struct QueuedGate {
+    pipeline: GatePipeline,
+    params: GateParams,
+}
+
 /// Cross-platform GPU state vector quantum simulator
-pub struct GpuStateVec {
+pub struct GpuStateVec32 {
     device: wgpu::Device,
     queue: wgpu::Queue,
 
     num_qubits: u32,
-    num_amplitudes: u64,
+    num_amplitudes: usize,
 
     // GPU buffers
     state_buffer: wgpu::Buffer,
@@ -89,16 +141,21 @@ pub struct GpuStateVec {
 
     // Compute pipelines
     single_gate_pipeline: wgpu::ComputePipeline,
+    diagonal_gate_pipeline: wgpu::ComputePipeline,
     cx_pipeline: wgpu::ComputePipeline,
+    cy_pipeline: wgpu::ComputePipeline,
     cz_pipeline: wgpu::ComputePipeline,
+    swap_pipeline: wgpu::ComputePipeline,
+    rxx_pipeline: wgpu::ComputePipeline,
+    ryy_pipeline: wgpu::ComputePipeline,
     rzz_pipeline: wgpu::ComputePipeline,
     collapse_pipeline: wgpu::ComputePipeline,
 
-    // Bind group layouts (kept alive — wgpu may hold weak refs)
-    #[allow(dead_code)]
-    gate_bind_group_layout: wgpu::BindGroupLayout,
-    #[allow(dead_code)]
-    collapse_bind_group_layout: wgpu::BindGroupLayout,
+    // Bind group layouts: held to outlive the bind groups built from them
+    // (wgpu may keep only weak references). Underscore-prefixed = intentionally
+    // unread; their job is RAII lifetime, not direct use.
+    _gate_bind_group_layout: wgpu::BindGroupLayout,
+    _collapse_bind_group_layout: wgpu::BindGroupLayout,
 
     // Persistent bind groups
     gate_bind_group: wgpu::BindGroup,
@@ -107,11 +164,22 @@ pub struct GpuStateVec {
 
     // GPU-side marginal probability reduction
     partial_sums_buffer: wgpu::Buffer,
-    #[allow(dead_code)]
-    marginal_bind_group_layout: wgpu::BindGroupLayout,
+    _marginal_bind_group_layout: wgpu::BindGroupLayout,
     marginal_pipeline: wgpu::ComputePipeline,
     num_partial_sums: u64,
 
+    // Persistent kernel: for small states that fit in workgroup shared memory
+    persistent_pipeline: wgpu::ComputePipeline,
+    _persistent_bind_group_layout: wgpu::BindGroupLayout,
+    persistent_bind_group: wgpu::BindGroup,
+    gate_queue_buffer: wgpu::Buffer,
+    /// Max qubits where the state fits in workgroup shared memory (0 if unavailable)
+    persistent_max_qubits: u32,
+
+    // Gate queue: gates accumulate here and are flushed in a single GPU submission
+    gate_queue: Vec<QueuedGate>,
+    params_staging: Vec<u8>,
+
     // RNG for measurements (Send + Sync for parallel Monte Carlo)
     rng: PecosRng,
 }
@@ -119,11 +187,11 @@ pub struct GpuStateVec {
 /// Maximum workgroups per dimension (wgpu limit is 65535)
 const MAX_WORKGROUPS_PER_DIM: u32 = 65535;
 
-impl GpuStateVec {
+impl GpuStateVec32 {
     /// Compute the number of workgroups needed for a given number of elements.
     /// Uses 256 threads per workgroup (standard for GPU compute).
     /// Returns (x, y) dimensions for dispatch, using 2D dispatch when count exceeds limit.
-    fn compute_workgroups(num_elements: u64) -> (u32, u32) {
+    fn compute_workgroups(num_elements: usize) -> (u32, u32) {
         // Safe truncation: with max 30 qubits, max elements is 2^30 = ~1B
         // div_ceil(2^30, 256) = ~4M, well within u32 range
         #[allow(clippy::cast_possible_truncation)]
@@ -165,27 +233,20 @@ impl GpuStateVec {
             });
         }
 
-        let num_amplitudes = 1u64 << num_qubits;
+        let num_amplitudes = 1usize << num_qubits;
 
-        // Initialize wgpu
-        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::new_without_display_handle());
+        let ctx = gpu_context()?;
+        let device = ctx.device;
+        let queue = ctx.queue;
 
-        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
-            power_preference: wgpu::PowerPreference::HighPerformance,
-            compatible_surface: None,
-            force_fallback_adapter: false,
-        }))
-        .map_err(|_| GpuError::NoAdapter)?;
-
-        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-            label: Some("PECOS wgpu simulator"),
-            required_features: wgpu::Features::empty(),
-            required_limits: adapter.limits(),
-            memory_hints: wgpu::MemoryHints::Performance,
-            trace: wgpu::Trace::Off,
-            experimental_features: wgpu::ExperimentalFeatures::default(),
-        }))
-        .map_err(GpuError::DeviceCreation)?;
+        // Determine max qubits for persistent kernel based on available shared memory.
+        // Each amplitude is vec2<f32> = 8 bytes. State of n qubits = 2^n * 8 bytes.
+        let shared_mem_bytes = device.limits().max_compute_workgroup_storage_size;
+        let persistent_max_qubits = if shared_mem_bytes >= 8 {
+            (shared_mem_bytes / 8).ilog2()
+        } else {
+            0
+        };
 
         // Create shader module
         let shader: wgpu::ShaderModule =
@@ -195,7 +256,7 @@ impl GpuStateVec {
             });
 
         // Create buffers
-        let state_buffer_size = num_amplitudes * 8; // 2 * f32 per amplitude
+        let state_buffer_size = (num_amplitudes * 8) as u64; // 2 * f32 per amplitude
         let state_buffer = device.create_buffer(&wgpu::BufferDescriptor {
             label: Some("State vector"),
             size: state_buffer_size,
@@ -207,7 +268,7 @@ impl GpuStateVec {
 
         let params_buffer = device.create_buffer(&wgpu::BufferDescriptor {
             label: Some("Gate parameters"),
-            size: ALIGNED_GATE_PARAMS_SIZE * MAX_BATCH_SIZE,
+            size: (ALIGNED_GATE_PARAMS_SIZE * MAX_BATCH_SIZE) as u64,
             usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
             mapped_at_creation: false,
         });
@@ -221,7 +282,7 @@ impl GpuStateVec {
 
         let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
             label: Some("Staging buffer"),
-            size: num_amplitudes * 8, // For reading state vector (2 * f32 per amplitude)
+            size: (num_amplitudes * 8) as u64, // For reading state vector (2 * f32 per amplitude)
             usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
             mapped_at_creation: false,
         });
@@ -321,6 +382,16 @@ impl GpuStateVec {
                 cache: None,
             });
 
+        let diagonal_gate_pipeline =
+            device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: Some("Diagonal gate pipeline"),
+                layout: Some(&gate_pipeline_layout),
+                module: &shader,
+                entry_point: Some("apply_diagonal_gate"),
+                compilation_options: wgpu::PipelineCompilationOptions::default(),
+                cache: None,
+            });
+
         let cx_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
             label: Some("CX pipeline"),
             layout: Some(&gate_pipeline_layout),
@@ -330,6 +401,15 @@ impl GpuStateVec {
             cache: None,
         });
 
+        let cy_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("CY pipeline"),
+            layout: Some(&gate_pipeline_layout),
+            module: &shader,
+            entry_point: Some("apply_cy"),
+            compilation_options: wgpu::PipelineCompilationOptions::default(),
+            cache: None,
+        });
+
         let cz_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
             label: Some("CZ pipeline"),
             layout: Some(&gate_pipeline_layout),
@@ -339,6 +419,33 @@ impl GpuStateVec {
             cache: None,
         });
 
+        let swap_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("SWAP pipeline"),
+            layout: Some(&gate_pipeline_layout),
+            module: &shader,
+            entry_point: Some("apply_swap"),
+            compilation_options: wgpu::PipelineCompilationOptions::default(),
+            cache: None,
+        });
+
+        let rxx_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("RXX pipeline"),
+            layout: Some(&gate_pipeline_layout),
+            module: &shader,
+            entry_point: Some("apply_rxx"),
+            compilation_options: wgpu::PipelineCompilationOptions::default(),
+            cache: None,
+        });
+
+        let ryy_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: Some("RYY pipeline"),
+            layout: Some(&gate_pipeline_layout),
+            module: &shader,
+            entry_point: Some("apply_ryy"),
+            compilation_options: wgpu::PipelineCompilationOptions::default(),
+            cache: None,
+        });
+
         let rzz_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
             label: Some("RZZ pipeline"),
             layout: Some(&gate_pipeline_layout),
@@ -480,6 +587,84 @@ impl GpuStateVec {
             ],
         });
 
+        // Persistent kernel: gate queue in a storage buffer
+        // Max gate queue: 256 gates * 12 u32 per gate + 2 u32 header = 3074 u32 = ~12KB
+        let gate_queue_buffer_size = (2 + MAX_BATCH_SIZE * 12) * 4;
+        let gate_queue_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Persistent gate queue"),
+            size: gate_queue_buffer_size as u64,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        let persistent_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Persistent kernel bind group layout"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 5,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: true },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
+        let persistent_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("Persistent kernel pipeline layout"),
+                bind_group_layouts: &[Some(&persistent_bind_group_layout)],
+                immediate_size: 0,
+            });
+
+        // Compile persistent kernel shader with dynamic shared memory size
+        let shared_size = 1u32 << persistent_max_qubits;
+        let persistent_shader_src = include_str!("persistent_kernel_f32.wgsl")
+            .replace("{SHARED_SIZE}", &shared_size.to_string());
+        let persistent_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("Persistent kernel shader (f32)"),
+            source: wgpu::ShaderSource::Wgsl(Cow::Owned(persistent_shader_src)),
+        });
+
+        let persistent_pipeline =
+            device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: Some("Persistent kernel pipeline"),
+                layout: Some(&persistent_pipeline_layout),
+                module: &persistent_shader,
+                entry_point: Some("apply_gate_queue_persistent"),
+                compilation_options: wgpu::PipelineCompilationOptions::default(),
+                cache: None,
+            });
+
+        let persistent_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Persistent kernel bind group"),
+            layout: &persistent_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: state_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 5,
+                    resource: gate_queue_buffer.as_entire_binding(),
+                },
+            ],
+        });
+
         let mut sim = Self {
             device,
             queue,
@@ -490,19 +675,31 @@ impl GpuStateVec {
             measure_params_buffer,
             staging_buffer,
             single_gate_pipeline,
+            diagonal_gate_pipeline,
             cx_pipeline,
+            cy_pipeline,
             cz_pipeline,
+            swap_pipeline,
+            rxx_pipeline,
+            ryy_pipeline,
             rzz_pipeline,
             collapse_pipeline,
-            gate_bind_group_layout,
-            collapse_bind_group_layout,
+            _gate_bind_group_layout: gate_bind_group_layout,
+            _collapse_bind_group_layout: collapse_bind_group_layout,
             gate_bind_group,
             collapse_bind_group,
             marginal_bind_group,
             partial_sums_buffer,
-            marginal_bind_group_layout,
+            _marginal_bind_group_layout: marginal_bind_group_layout,
             marginal_pipeline,
             num_partial_sums,
+            persistent_pipeline,
+            _persistent_bind_group_layout: persistent_bind_group_layout,
+            persistent_bind_group,
+            gate_queue_buffer,
+            persistent_max_qubits,
+            gate_queue: Vec::with_capacity(256),
+            params_staging: vec![0u8; ALIGNED_GATE_PARAMS_SIZE * MAX_BATCH_SIZE],
             rng: rand::make_rng(),
         };
 
@@ -530,322 +727,551 @@ impl GpuStateVec {
 
     /// Reset state to |0...0>
     pub fn reset(&mut self) {
+        self.gate_queue.clear();
+
         // Create initial state: |0...0> = [1+0i, 0+0i, 0+0i, ...]
-        // Safe: with max 30 qubits, num_amplitudes is at most 2^30 which fits in usize on 64-bit.
-        // This crate requires 64-bit for practical use (32-bit can't address enough memory anyway).
-        #[allow(clippy::cast_possible_truncation)]
-        let mut initial_state = vec![[0.0f32, 0.0f32]; self.num_amplitudes as usize];
+        let mut initial_state = vec![[0.0f32, 0.0f32]; self.num_amplitudes];
         initial_state[0] = [1.0, 0.0];
 
         self.queue
             .write_buffer(&self.state_buffer, 0, bytemuck::cast_slice(&initial_state));
     }
 
-    /// Apply an arbitrary single-qubit gate
-    pub fn apply_single_gate(&mut self, qubit: u32, matrix: [f32; 8]) {
-        let params = GateParams {
-            target_qubit: qubit,
-            control_qubit: 0,
-            num_qubits: self.num_qubits,
-            _padding: 0,
-            matrix_row0: [matrix[0], matrix[1], matrix[2], matrix[3]],
-            matrix_row1: [matrix[4], matrix[5], matrix[6], matrix[7]],
+    /// Multiply two 2x2 complex matrices in [`a_re`, `a_im`, `b_re`, `b_im`, `c_re`, `c_im`, `d_re`, `d_im`] format.
+    fn matrix_mul_f32(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
+        #[inline]
+        fn cmul(xr: f32, xi: f32, yr: f32, yi: f32) -> (f32, f32) {
+            (xr * yr - xi * yi, xr * yi + xi * yr)
+        }
+
+        let (c0r, c0i) = {
+            let (t1r, t1i) = cmul(a[0], a[1], b[0], b[1]);
+            let (t2r, t2i) = cmul(a[2], a[3], b[4], b[5]);
+            (t1r + t2r, t1i + t2i)
+        };
+        let (c1r, c1i) = {
+            let (t1r, t1i) = cmul(a[0], a[1], b[2], b[3]);
+            let (t2r, t2i) = cmul(a[2], a[3], b[6], b[7]);
+            (t1r + t2r, t1i + t2i)
+        };
+        let (c2r, c2i) = {
+            let (t1r, t1i) = cmul(a[4], a[5], b[0], b[1]);
+            let (t2r, t2i) = cmul(a[6], a[7], b[4], b[5]);
+            (t1r + t2r, t1i + t2i)
+        };
+        let (c3r, c3i) = {
+            let (t1r, t1i) = cmul(a[4], a[5], b[2], b[3]);
+            let (t2r, t2i) = cmul(a[6], a[7], b[6], b[7]);
+            (t1r + t2r, t1i + t2i)
         };
 
-        self.queue
-            .write_buffer(&self.params_buffer, 0, bytemuck::bytes_of(&params));
+        [c0r, c0i, c1r, c1i, c2r, c2i, c3r, c3i]
+    }
 
-        let mut encoder = self
-            .device
-            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                label: Some("Gate encoder"),
-            });
+    /// Reorder single-qubit gates to group same-qubit gates together for fusion.
+    ///
+    /// Single-qubit gates on different qubits commute, so they can be freely
+    /// reordered. Two-qubit gates act as barriers and are not moved.
+    fn reorder_for_fusion(queue: &mut [QueuedGate]) {
+        let mut start = 0;
+        while start < queue.len() {
+            if !matches!(
+                queue[start].pipeline,
+                GatePipeline::Single | GatePipeline::Diagonal
+            ) {
+                start += 1;
+                continue;
+            }
 
-        {
-            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                label: Some("Single gate pass"),
-                timestamp_writes: None,
+            let mut end = start + 1;
+            while end < queue.len()
+                && matches!(
+                    queue[end].pipeline,
+                    GatePipeline::Single | GatePipeline::Diagonal
+                )
+            {
+                end += 1;
+            }
+
+            queue[start..end].sort_by_key(|g| g.params.target_qubit);
+            start = end;
+        }
+    }
+
+    /// Fuse consecutive single-qubit gates on the same qubit by multiplying matrices.
+    fn fuse_gate_queue(queue: &mut [QueuedGate]) -> Vec<QueuedGate> {
+        Self::reorder_for_fusion(queue);
+        if queue.len() <= 1 {
+            return queue.to_vec();
+        }
+
+        let mut fused = Vec::with_capacity(queue.len());
+        let mut i = 0;
+
+        while i < queue.len() {
+            let gate = &queue[i];
+            let is_1q = matches!(gate.pipeline, GatePipeline::Single | GatePipeline::Diagonal);
+            if !is_1q {
+                fused.push(queue[i].clone());
+                i += 1;
+                continue;
+            }
+
+            let target = gate.params.target_qubit;
+            let mut matrix = [
+                gate.params.matrix_row0[0],
+                gate.params.matrix_row0[1],
+                gate.params.matrix_row0[2],
+                gate.params.matrix_row0[3],
+                gate.params.matrix_row1[0],
+                gate.params.matrix_row1[1],
+                gate.params.matrix_row1[2],
+                gate.params.matrix_row1[3],
+            ];
+            let mut j = i + 1;
+
+            while j < queue.len() {
+                let next = &queue[j];
+                let next_is_1q =
+                    matches!(next.pipeline, GatePipeline::Single | GatePipeline::Diagonal);
+                if !next_is_1q || next.params.target_qubit != target {
+                    break;
+                }
+                let next_matrix = [
+                    next.params.matrix_row0[0],
+                    next.params.matrix_row0[1],
+                    next.params.matrix_row0[2],
+                    next.params.matrix_row0[3],
+                    next.params.matrix_row1[0],
+                    next.params.matrix_row1[1],
+                    next.params.matrix_row1[2],
+                    next.params.matrix_row1[3],
+                ];
+                matrix = Self::matrix_mul_f32(&next_matrix, &matrix);
+                j += 1;
+            }
+
+            let is_diagonal =
+                matrix[2] == 0.0 && matrix[3] == 0.0 && matrix[4] == 0.0 && matrix[5] == 0.0;
+
+            fused.push(QueuedGate {
+                pipeline: if is_diagonal {
+                    GatePipeline::Diagonal
+                } else {
+                    GatePipeline::Single
+                },
+                params: GateParams {
+                    target_qubit: target,
+                    control_qubit: 0,
+                    num_qubits: gate.params.num_qubits,
+                    _padding: 0,
+                    matrix_row0: [matrix[0], matrix[1], matrix[2], matrix[3]],
+                    matrix_row1: [matrix[4], matrix[5], matrix[6], matrix[7]],
+                },
             });
-            pass.set_pipeline(&self.single_gate_pipeline);
-            // Use persistent bind group with dynamic offset (offset 0 for single gate)
-            pass.set_bind_group(0, &self.gate_bind_group, &[0]);
 
-            // Dispatch: one thread per pair of amplitudes
-            let num_pairs = self.num_amplitudes / 2;
-            let (wg_x, wg_y) = Self::compute_workgroups(num_pairs);
-            pass.dispatch_workgroups(wg_x, wg_y, 1);
+            i = j;
         }
 
-        self.queue.submit(std::iter::once(encoder.finish()));
+        fused
     }
 
-    /// Apply the same single-qubit gate to multiple qubits in a single GPU submission.
+    /// Pack a fused gate queue into the storage-buffer layout read by the persistent kernel.
     ///
-    /// This is more efficient than calling `apply_single_gate` multiple times
-    /// because it batches all operations into a single command buffer submission,
-    /// uses a single buffer write for all parameters, and uses dynamic uniform
-    /// buffer offsets to avoid per-gate bind group creation.
+    /// Gates are accumulated by trait methods (h, cx, rz, etc.) and later flushed together
+    /// by `flush_gates` in a single command buffer submission, amortizing encoder creation
+    /// and `queue.submit()` overhead; this helper only serializes them into `staging`.
     #[allow(clippy::cast_possible_truncation)]
-    fn apply_single_gate_batch_qubits(&mut self, qubits: &[QubitId], matrix: [f32; 8]) {
-        if qubits.is_empty() {
-            return;
+    /// The encoded words occupy the front of `staging`, which is grown if it is too small.
+    /// Returns the number of bytes encoded (the length of the prefix to upload).
+    fn encode_persistent_queue(
+        fused: &[QueuedGate],
+        num_qubits: u32,
+        staging: &mut Vec<u8>,
+    ) -> usize {
+        // Header: [num_gates, num_qubits]
+        // Each gate: 12 x u32 [type, target, control, pad, matrix(8 x f32 as u32)]
+        let num_gates = fused.len();
+        let total_u32 = 2 + num_gates * 12;
+        let total_bytes = total_u32 * 4;
+
+        if staging.len() < total_bytes {
+            staging.resize(total_bytes, 0);
         }
 
-        // Build all gate params on CPU first, then write in a single buffer operation.
-        // Each GateParams is 64 bytes but we need UNIFORM_ALIGNMENT (256) bytes per entry.
-        // We'll write each params at its aligned offset.
-        let num_gates = qubits.len();
-        let total_size = num_gates * ALIGNED_GATE_PARAMS_SIZE as usize;
-        let mut params_data = vec![0u8; total_size];
-
-        for (i, &qubit) in qubits.iter().enumerate() {
-            let params = GateParams {
-                target_qubit: qubit.index() as u32,
-                control_qubit: 0,
-                num_qubits: self.num_qubits,
-                _padding: 0,
-                matrix_row0: [matrix[0], matrix[1], matrix[2], matrix[3]],
-                matrix_row1: [matrix[4], matrix[5], matrix[6], matrix[7]],
+        let buf: &mut [u32] = bytemuck::cast_slice_mut(&mut staging[..total_bytes]);
+
+        buf[0] = num_gates as u32;
+        buf[1] = num_qubits;
+
+        for (i, gate) in fused.iter().enumerate() {
+            let base = 2 + i * 12;
+            buf[base] = match gate.pipeline {
+                GatePipeline::Single => 0,
+                GatePipeline::Diagonal => 1,
+                GatePipeline::CX => 2,
+                GatePipeline::CY => 3,
+                GatePipeline::CZ => 4,
+                GatePipeline::Swap => 5,
+                GatePipeline::Rxx => 6,
+                GatePipeline::Ryy => 7,
+                GatePipeline::Rzz => 8,
             };
-
-            let offset = i * ALIGNED_GATE_PARAMS_SIZE as usize;
-            let params_bytes = bytemuck::bytes_of(&params);
-            params_data[offset..offset + params_bytes.len()].copy_from_slice(params_bytes);
+            buf[base + 1] = gate.params.target_qubit;
+            buf[base + 2] = gate.params.control_qubit;
+            buf[base + 3] = 0;
+            // Matrix: f32 -> u32 bitcast
+            buf[base + 4] = gate.params.matrix_row0[0].to_bits();
+            buf[base + 5] = gate.params.matrix_row0[1].to_bits();
+            buf[base + 6] = gate.params.matrix_row0[2].to_bits();
+            buf[base + 7] = gate.params.matrix_row0[3].to_bits();
+            buf[base + 8] = gate.params.matrix_row1[0].to_bits();
+            buf[base + 9] = gate.params.matrix_row1[1].to_bits();
+            buf[base + 10] = gate.params.matrix_row1[2].to_bits();
+            buf[base + 11] = gate.params.matrix_row1[3].to_bits();
         }
 
-        // Single buffer write for all gate parameters
-        self.queue
-            .write_buffer(&self.params_buffer, 0, &params_data);
+        total_bytes
+    }
 
-        // Create a single command encoder for all dispatches
+    fn flush_gates(&mut self) {
+        if self.gate_queue.is_empty() {
+            return;
+        }
         let mut encoder = self
             .device
             .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                label: Some("Batched single gate encoder"),
+                label: Some("Flush gates encoder"),
             });
+        self.record_flush_gates(&mut encoder);
+        self.queue.submit(std::iter::once(encoder.finish()));
+    }
+
+    /// Record queued gate dispatches into `encoder` without submitting.
+    /// Callers that follow up with a readback can chain the copy into the
+    /// same encoder, saving a submit round trip.
+    fn record_flush_gates(&mut self, encoder: &mut wgpu::CommandEncoder) {
+        if self.gate_queue.is_empty() {
+            return;
+        }
+
+        // Fuse consecutive single-qubit gates on the same qubit
+        let fused = Self::fuse_gate_queue(&mut self.gate_queue);
+
+        // Use persistent kernel if state fits in shared memory
+        if self.num_qubits <= self.persistent_max_qubits {
+            let total_bytes =
+                Self::encode_persistent_queue(&fused, self.num_qubits, &mut self.params_staging);
+            self.queue.write_buffer(
+                &self.gate_queue_buffer,
+                0,
+                &self.params_staging[..total_bytes],
+            );
+
+            {
+                let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                    label: Some("Persistent kernel pass"),
+                    timestamp_writes: None,
+                });
+                pass.set_pipeline(&self.persistent_pipeline);
+                pass.set_bind_group(0, &self.persistent_bind_group, &[]);
+                pass.dispatch_workgroups(1, 1, 1); // Single workgroup
+            }
+
+            self.gate_queue.clear();
+            return;
+        }
+
+        // Regular path: N dispatches into this encoder
+        let aligned = ALIGNED_GATE_PARAMS_SIZE;
+        let total_size = fused.len() * aligned;
+        for (i, gate) in fused.iter().enumerate() {
+            let offset = i * aligned;
+            let bytes = bytemuck::bytes_of(&gate.params);
+            self.params_staging[offset..offset + bytes.len()].copy_from_slice(bytes);
+        }
+        self.queue
+            .write_buffer(&self.params_buffer, 0, &self.params_staging[..total_size]);
 
         {
             let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                label: Some("Batched single gate pass"),
+                label: Some("Batched gate pass"),
                 timestamp_writes: None,
             });
-            pass.set_pipeline(&self.single_gate_pipeline);
 
             let num_pairs = self.num_amplitudes / 2;
-            let (wg_x, wg_y) = Self::compute_workgroups(num_pairs);
+            let (pair_wg_x, pair_wg_y) = Self::compute_workgroups(num_pairs);
+            let (amp_wg_x, amp_wg_y) = Self::compute_workgroups(self.num_amplitudes);
+
+            let mut current_pipeline = None;
+
+            for (i, gate) in fused.iter().enumerate() {
+                // Only switch pipeline when the gate type changes
+                if current_pipeline != Some(gate.pipeline) {
+                    let pipeline = match gate.pipeline {
+                        GatePipeline::Single => &self.single_gate_pipeline,
+                        GatePipeline::Diagonal => &self.diagonal_gate_pipeline,
+                        GatePipeline::CX => &self.cx_pipeline,
+                        GatePipeline::CY => &self.cy_pipeline,
+                        GatePipeline::CZ => &self.cz_pipeline,
+                        GatePipeline::Swap => &self.swap_pipeline,
+                        GatePipeline::Rxx => &self.rxx_pipeline,
+                        GatePipeline::Ryy => &self.ryy_pipeline,
+                        GatePipeline::Rzz => &self.rzz_pipeline,
+                    };
+                    pass.set_pipeline(pipeline);
+                    current_pipeline = Some(gate.pipeline);
+                }
 
-            // Use dynamic offset with persistent bind group for each gate
-            for i in 0..qubits.len() {
-                let offset = (i as u64 * ALIGNED_GATE_PARAMS_SIZE) as u32;
+                let offset = u32::try_from(i * ALIGNED_GATE_PARAMS_SIZE)
+                    .expect("batch offset always fits in u32 (i < MAX_BATCH_SIZE)");
                 pass.set_bind_group(0, &self.gate_bind_group, &[offset]);
+
+                let (wg_x, wg_y) = match gate.pipeline {
+                    GatePipeline::Single => (pair_wg_x, pair_wg_y),
+                    _ => (amp_wg_x, amp_wg_y),
+                };
                 pass.dispatch_workgroups(wg_x, wg_y, 1);
             }
         }
 
-        // Single submission for all gates
-        self.queue.submit(std::iter::once(encoder.finish()));
+        self.gate_queue.clear();
     }
 
-    /// Apply a single CX (CNOT) gate directly.
+    /// Wait for all submitted GPU work to complete.
     ///
-    /// This bypasses the trait layer and dispatches directly to the GPU.
-    pub fn apply_cx(&mut self, control: u32, target: u32) {
-        let params = GateParams {
-            target_qubit: target,
-            control_qubit: control,
-            num_qubits: self.num_qubits,
-            _padding: 0,
-            matrix_row0: [0.0; 4],
-            matrix_row1: [0.0; 4],
-        };
-
-        self.queue
-            .write_buffer(&self.params_buffer, 0, bytemuck::bytes_of(&params));
-
-        let mut encoder = self
-            .device
-            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                label: Some("CX encoder"),
-            });
+    /// Flushes any queued gates first, then waits for the GPU to finish.
+    /// Call this before timing measurements to ensure all asynchronous GPU
+    /// operations have finished.
+    pub fn sync(&mut self) {
+        self.flush_gates();
+        let _ = self.device.poll(wgpu::PollType::wait_indefinitely());
+    }
 
-        {
-            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                label: Some("CX pass"),
-                timestamp_writes: None,
-            });
-            pass.set_pipeline(&self.cx_pipeline);
-            pass.set_bind_group(0, &self.gate_bind_group, &[0]);
+    /// Queue an arbitrary single-qubit gate for batched dispatch.
+    fn queue_single_gate(&mut self, qubit: u32, matrix: [f32; 8]) {
+        // Diagonal gates have zero off-diagonal elements (b=0, c=0).
+        // Use the specialized diagonal shader: half the arithmetic, fully coalesced.
+        let is_diagonal =
+            matrix[2] == 0.0 && matrix[3] == 0.0 && matrix[4] == 0.0 && matrix[5] == 0.0;
+        let pipeline = if is_diagonal {
+            GatePipeline::Diagonal
+        } else {
+            GatePipeline::Single
+        };
+        self.gate_queue.push(QueuedGate {
+            pipeline,
+            params: GateParams {
+                target_qubit: qubit,
+                control_qubit: 0,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix_row0: [matrix[0], matrix[1], matrix[2], matrix[3]],
+                matrix_row1: [matrix[4], matrix[5], matrix[6], matrix[7]],
+            },
+        });
 
-            let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes);
-            pass.dispatch_workgroups(wg_x, wg_y, 1);
+        // Flush when we hit the buffer capacity
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
         }
-
-        self.queue.submit(std::iter::once(encoder.finish()));
     }
 
-    /// Apply CX gates to multiple qubit pairs in a single GPU submission.
-    #[allow(clippy::cast_possible_truncation)]
-    fn cx_batch_pairs(&mut self, pairs: &[(QubitId, QubitId)]) {
-        if pairs.is_empty() {
-            return;
-        }
-
-        // Build all gate params on CPU first, then write in a single buffer operation
-        let total_size = pairs.len() * ALIGNED_GATE_PARAMS_SIZE as usize;
-        let mut params_data = vec![0u8; total_size];
-
-        for (i, &(q0, q1)) in pairs.iter().enumerate() {
-            let params = GateParams {
-                target_qubit: q1.index() as u32,
-                control_qubit: q0.index() as u32,
+    /// Queue a CX gate for batched dispatch.
+    fn queue_cx(&mut self, control: u32, target: u32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::CX,
+            params: GateParams {
+                target_qubit: target,
+                control_qubit: control,
                 num_qubits: self.num_qubits,
                 _padding: 0,
                 matrix_row0: [0.0; 4],
                 matrix_row1: [0.0; 4],
-            };
+            },
+        });
 
-            let offset = i * ALIGNED_GATE_PARAMS_SIZE as usize;
-            let params_bytes = bytemuck::bytes_of(&params);
-            params_data[offset..offset + params_bytes.len()].copy_from_slice(params_bytes);
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
         }
+    }
 
-        // Single buffer write for all gate parameters
-        self.queue
-            .write_buffer(&self.params_buffer, 0, &params_data);
-
-        let mut encoder = self
-            .device
-            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                label: Some("Batched CX encoder"),
-            });
-
-        {
-            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                label: Some("Batched CX pass"),
-                timestamp_writes: None,
-            });
-            pass.set_pipeline(&self.cx_pipeline);
+    /// Queue a CZ gate for batched dispatch.
+    fn queue_cz(&mut self, control: u32, target: u32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::CZ,
+            params: GateParams {
+                target_qubit: target,
+                control_qubit: control,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix_row0: [0.0; 4],
+                matrix_row1: [0.0; 4],
+            },
+        });
 
-            let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes);
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
+        }
+    }
 
-            // Use dynamic offset with persistent bind group for each gate pair
-            for i in 0..pairs.len() {
-                let offset = (i as u64 * ALIGNED_GATE_PARAMS_SIZE) as u32;
-                pass.set_bind_group(0, &self.gate_bind_group, &[offset]);
-                pass.dispatch_workgroups(wg_x, wg_y, 1);
-            }
+    fn queue_cy(&mut self, control: u32, target: u32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::CY,
+            params: GateParams {
+                target_qubit: target,
+                control_qubit: control,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix_row0: [0.0; 4],
+                matrix_row1: [0.0; 4],
+            },
+        });
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
         }
+    }
 
-        self.queue.submit(std::iter::once(encoder.finish()));
+    fn queue_swap(&mut self, qubit0: u32, qubit1: u32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::Swap,
+            params: GateParams {
+                target_qubit: qubit1,
+                control_qubit: qubit0,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix_row0: [0.0; 4],
+                matrix_row1: [0.0; 4],
+            },
+        });
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
+        }
     }
 
-    /// Apply CZ gates to multiple qubit pairs in a single GPU submission.
-    #[allow(clippy::cast_possible_truncation)]
-    fn cz_batch_pairs(&mut self, pairs: &[(QubitId, QubitId)]) {
-        if pairs.is_empty() {
-            return;
+    fn queue_rxx(&mut self, qubit0: u32, qubit1: u32, theta: f32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::Rxx,
+            params: GateParams {
+                target_qubit: qubit1,
+                control_qubit: qubit0,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix_row0: [theta, 0.0, 0.0, 0.0],
+                matrix_row1: [0.0; 4],
+            },
+        });
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
         }
+    }
 
-        // Build all gate params on CPU first, then write in a single buffer operation
-        let total_size = pairs.len() * ALIGNED_GATE_PARAMS_SIZE as usize;
-        let mut params_data = vec![0u8; total_size];
+    fn queue_ryy(&mut self, qubit0: u32, qubit1: u32, theta: f32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::Ryy,
+            params: GateParams {
+                target_qubit: qubit1,
+                control_qubit: qubit0,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix_row0: [theta, 0.0, 0.0, 0.0],
+                matrix_row1: [0.0; 4],
+            },
+        });
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
+        }
+    }
 
-        for (i, &(q0, q1)) in pairs.iter().enumerate() {
-            let params = GateParams {
-                target_qubit: q1.index() as u32,
-                control_qubit: q0.index() as u32,
+    /// Queue an RZZ gate for batched dispatch.
+    fn queue_rzz(&mut self, qubit0: u32, qubit1: u32, theta: f32) {
+        self.gate_queue.push(QueuedGate {
+            pipeline: GatePipeline::Rzz,
+            params: GateParams {
+                target_qubit: qubit1,
+                control_qubit: qubit0,
                 num_qubits: self.num_qubits,
                 _padding: 0,
-                matrix_row0: [0.0; 4],
+                matrix_row0: [theta, 0.0, 0.0, 0.0],
                 matrix_row1: [0.0; 4],
-            };
+            },
+        });
 
-            let offset = i * ALIGNED_GATE_PARAMS_SIZE as usize;
-            let params_bytes = bytemuck::bytes_of(&params);
-            params_data[offset..offset + params_bytes.len()].copy_from_slice(params_bytes);
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
         }
+    }
+
+    /// Apply an arbitrary single-qubit gate immediately, after flushing any queued gates.
+    pub fn apply_single_gate(&mut self, qubit: u32, matrix: [f32; 8]) {
+        self.flush_gates(); // preserve program order: queued gates must run before this one
+        let params = GateParams {
+            target_qubit: qubit,
+            control_qubit: 0,
+            num_qubits: self.num_qubits,
+            _padding: 0,
+            matrix_row0: [matrix[0], matrix[1], matrix[2], matrix[3]],
+            matrix_row1: [matrix[4], matrix[5], matrix[6], matrix[7]],
+        };
 
-        // Single buffer write for all gate parameters
         self.queue
-            .write_buffer(&self.params_buffer, 0, &params_data);
+            .write_buffer(&self.params_buffer, 0, bytemuck::bytes_of(&params));
 
         let mut encoder = self
             .device
             .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                label: Some("Batched CZ encoder"),
+                label: Some("Gate encoder"),
             });
 
         {
             let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                label: Some("Batched CZ pass"),
+                label: Some("Single gate pass"),
                 timestamp_writes: None,
             });
-            pass.set_pipeline(&self.cz_pipeline);
-
-            let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes);
+            pass.set_pipeline(&self.single_gate_pipeline);
+            // Use persistent bind group with dynamic offset (offset 0 for single gate)
+            pass.set_bind_group(0, &self.gate_bind_group, &[0]);
 
-            // Use dynamic offset with persistent bind group for each gate pair
-            for i in 0..pairs.len() {
-                let offset = (i as u64 * ALIGNED_GATE_PARAMS_SIZE) as u32;
-                pass.set_bind_group(0, &self.gate_bind_group, &[offset]);
-                pass.dispatch_workgroups(wg_x, wg_y, 1);
-            }
+            // Dispatch: one thread per pair of amplitudes
+            let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes / 2);
+            pass.dispatch_workgroups(wg_x, wg_y, 1);
         }
 
         self.queue.submit(std::iter::once(encoder.finish()));
     }
 
-    /// Apply RZZ gates to multiple qubit pairs in a single GPU submission.
-    #[allow(clippy::cast_possible_truncation)]
-    fn rzz_batch_pairs(&mut self, theta: f64, pairs: &[(QubitId, QubitId)]) {
-        if pairs.is_empty() {
-            return;
-        }
-
-        // Build all gate params on CPU first, then write in a single buffer operation
-        let total_size = pairs.len() * ALIGNED_GATE_PARAMS_SIZE as usize;
-        let mut params_data = vec![0u8; total_size];
-
-        for (i, &(q0, q1)) in pairs.iter().enumerate() {
-            let params = GateParams {
-                target_qubit: q1.index() as u32,
-                control_qubit: q0.index() as u32,
-                num_qubits: self.num_qubits,
-                _padding: 0,
-                matrix_row0: [theta as f32, 0.0, 0.0, 0.0],
-                matrix_row1: [0.0; 4],
-            };
-
-            let offset = i * ALIGNED_GATE_PARAMS_SIZE as usize;
-            let params_bytes = bytemuck::bytes_of(&params);
-            params_data[offset..offset + params_bytes.len()].copy_from_slice(params_bytes);
-        }
+    /// Apply a single CX (CNOT) gate directly.
+    ///
+    /// This bypasses the trait layer and dispatches directly to the GPU.
+    pub fn apply_cx(&mut self, control: u32, target: u32) {
+        let params = GateParams {
+            target_qubit: target,
+            control_qubit: control,
+            num_qubits: self.num_qubits,
+            _padding: 0,
+            matrix_row0: [0.0; 4],
+            matrix_row1: [0.0; 4],
+        };
 
-        // Single buffer write for all gate parameters
         self.queue
-            .write_buffer(&self.params_buffer, 0, &params_data);
+            .write_buffer(&self.params_buffer, 0, bytemuck::bytes_of(&params));
 
         let mut encoder = self
             .device
             .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                label: Some("Batched RZZ encoder"),
+                label: Some("CX encoder"),
             });
 
         {
             let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                label: Some("Batched RZZ pass"),
+                label: Some("CX pass"),
                 timestamp_writes: None,
             });
-            pass.set_pipeline(&self.rzz_pipeline);
+            pass.set_pipeline(&self.cx_pipeline);
+            pass.set_bind_group(0, &self.gate_bind_group, &[0]);
 
             let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes);
-
-            // Use dynamic offset with persistent bind group for each gate pair
-            for i in 0..pairs.len() {
-                let offset = (i as u64 * ALIGNED_GATE_PARAMS_SIZE) as u32;
-                pass.set_bind_group(0, &self.gate_bind_group, &[offset]);
-                pass.dispatch_workgroups(wg_x, wg_y, 1);
-            }
+            pass.dispatch_workgroups(wg_x, wg_y, 1);
         }
 
         self.queue.submit(std::iter::once(encoder.finish()));
@@ -867,7 +1293,61 @@ impl GpuStateVec {
     /// Instead of reading back all 2^n probabilities (O(2^n) transfer), this uses
     /// a reduction kernel that produces ~2^n/256 partial sums, reducing the readback
     /// by 256x. The CPU sums the partial sums and samples the outcome.
-    fn mz_gpu(&mut self, qubit: u32) -> u32 {
+    /// CPU-side measurement for small states. Reads the full state, computes
+    /// probability, samples outcome, collapses and writes back. Faster than
+    /// GPU dispatches when the state is small enough. Returns (outcome, `is_deterministic`).
+    fn mz_cpu_path(&mut self, qubit: u32) -> (u32, bool) {
+        const DET_EPS: f32 = 1e-6;
+        // NOTE(review): the doc text before "CPU-side measurement" belongs to `mz_gpu`; move it.
+        let mut state_data = self.state();
+        let target_mask = 1usize << qubit;
+        // P(1): summed |amp|^2 over basis indices whose `qubit` bit is set.
+        let prob_one: f32 = state_data
+            .iter()
+            .enumerate()
+            .filter(|(i, _)| i & target_mask != 0)
+            .map(|(_, [re, im])| re * re + im * im)
+            .sum();
+        // Within DET_EPS of 0 or 1 the outcome is forced, so no RNG sample is drawn.
+        let is_deterministic = !(DET_EPS..=1.0 - DET_EPS).contains(&prob_one);
+        let outcome = if is_deterministic {
+            u32::from(prob_one > 0.5)
+        } else {
+            let random: f32 = self.rng.random();
+            u32::from(random < prob_one)
+        };
+        // Rescale the kept branch by 1/sqrt(its probability).
+        let norm_factor = if outcome == 1 {
+            1.0 / prob_one.sqrt()
+        } else {
+            1.0 / (1.0 - prob_one).sqrt()
+        };
+        // Collapse: scale amplitudes consistent with the outcome, zero the others.
+        for (i, amp) in state_data.iter_mut().enumerate() {
+            let qubit_val = u32::from(i & target_mask != 0);
+            if qubit_val == outcome {
+                amp[0] *= norm_factor;
+                amp[1] *= norm_factor;
+            } else {
+                *amp = [0.0, 0.0];
+            }
+        }
+        // Upload the collapsed state to the GPU state buffer.
+        self.queue
+            .write_buffer(&self.state_buffer, 0, bytemuck::cast_slice(&state_data));
+
+        (outcome, is_deterministic)
+    }
+
+    fn mz_gpu(&mut self, qubit: u32) -> (u32, bool) {
+        const DET_EPS: f32 = 1e-6;
+
+        // Fast path for small states: read entire state, compute probability + collapse on CPU.
+        // Avoids 2 GPU dispatches (reduction + collapse) and 2 buffer writes.
+        if self.num_qubits <= self.persistent_max_qubits {
+            return self.mz_cpu_path(qubit);
+        }
+
         // Write target qubit to params buffer
         let params = GateParams {
             target_qubit: qubit,
@@ -922,11 +1402,14 @@ impl GpuStateVec {
         };
         self.staging_buffer.unmap();
 
-        // Sample outcome
-        let random: f32 = self.rng.random();
-        let outcome = u32::from(random < prob_one);
+        let is_deterministic = !(DET_EPS..=1.0 - DET_EPS).contains(&prob_one);
+        let outcome = if is_deterministic {
+            u32::from(prob_one > 0.5)
+        } else {
+            let random: f32 = self.rng.random();
+            u32::from(random < prob_one)
+        };
 
-        // Collapse the state using persistent bind group
         let norm_factor = if outcome == 1 {
             1.0 / prob_one.sqrt()
         } else {
@@ -962,7 +1445,7 @@ impl GpuStateVec {
         }
         self.queue.submit(std::iter::once(encoder.finish()));
 
-        outcome
+        (outcome, is_deterministic)
     }
 
     /// Get the number of qubits
@@ -986,20 +1469,23 @@ impl GpuStateVec {
     ///
     /// Panics if the GPU device poll fails (indicates a driver or hardware failure).
     #[must_use]
-    pub fn state(&self) -> Vec<[f32; 2]> {
-        // Copy state buffer to staging buffer
+    pub fn state(&mut self) -> Vec<[f32; 2]> {
+        // Combine any pending gate dispatches with the readback copy into a
+        // single encoder/submit -- saves one submit round trip vs separate
+        // flush + copy submissions.
         let mut encoder = self
             .device
             .create_command_encoder(&wgpu::CommandEncoderDescriptor {
                 label: Some("State readback encoder"),
             });
+        self.record_flush_gates(&mut encoder);
 
         encoder.copy_buffer_to_buffer(
             &self.state_buffer,
             0,
             &self.staging_buffer,
             0,
-            self.num_amplitudes * 8,
+            (self.num_amplitudes * 8) as u64,
         );
 
         self.queue.submit(std::iter::once(encoder.finish()));
@@ -1025,11 +1511,28 @@ impl GpuStateVec {
     /// # Arguments
     /// * `basis_state` - The computational basis state index (little-endian)
     #[must_use]
-    pub fn probability(&self, basis_state: usize) -> f32 {
+    pub fn probability(&mut self, basis_state: usize) -> f32 {
         let state = self.state();
         let [re, im] = state[basis_state];
         re * re + im * im
     }
+
+    /// Overwrite the GPU state buffer with `amps`. Length must equal
+    /// `num_amplitudes`; caller is responsible for the state being normalized.
+    /// Pending queued gates are flushed first.
+    ///
+    /// # Panics
+    /// Panics if `amps.len() != num_amplitudes`.
+    pub fn write_state(&mut self, amps: &[[f32; 2]]) {
+        assert_eq!(
+            amps.len(),
+            self.num_amplitudes,
+            "write_state: slice length mismatch"
+        );
+        self.flush_gates(); // queued gates must run before the overwrite, not on top of `amps`
+        self.queue
+            .write_buffer(&self.state_buffer, 0, bytemuck::cast_slice(amps));
+    }
 }
 
 // Trait implementations for PECOS integration
@@ -1039,12 +1542,10 @@ use pecos_simulators::{
     ArbitraryRotationGateable, CliffordGateable, MeasurementResult, QuantumSimulator,
 };
 
-impl QuantumSimulator for GpuStateVec {
+impl QuantumSimulator for GpuStateVec32 {
     fn reset(&mut self) -> &mut Self {
         // Create initial state: |0...0> = [1+0i, 0+0i, 0+0i, ...]
-        // Safe: with max 30 qubits, num_amplitudes fits in usize on 64-bit systems
-        #[allow(clippy::cast_possible_truncation)]
-        let mut initial_state = vec![[0.0f32, 0.0f32]; self.num_amplitudes as usize];
+        let mut initial_state = vec![[0.0f32, 0.0f32]; self.num_amplitudes];
         initial_state[0] = [1.0, 0.0];
 
         self.queue
@@ -1053,105 +1554,352 @@ impl QuantumSimulator for GpuStateVec {
     }
 }
 
-// Trait implementations use internal batch methods directly to avoid allocations.
-impl CliffordGateable for GpuStateVec {
-    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
-        self.apply_single_gate_batch_qubits(qubits, gates::S);
-        self
-    }
-
+// Trait implementations queue gates for batched dispatch.
+#[allow(clippy::cast_possible_truncation)] // Qubit indices from QubitId fit in u32
+impl CliffordGateable for GpuStateVec32 {
     fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
-        self.apply_single_gate_batch_qubits(qubits, gates::H);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::H);
+        }
         self
     }
 
     fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
-        self.apply_single_gate_batch_qubits(qubits, gates::X);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::X);
+        }
         self
     }
 
     fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
-        self.apply_single_gate_batch_qubits(qubits, gates::Y);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::Y);
+        }
         self
     }
 
     fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
-        self.apply_single_gate_batch_qubits(qubits, gates::Z);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::Z);
+        }
+        self
+    }
+
+    fn sx(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::SX);
+        }
+        self
+    }
+
+    fn sxdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::SXDG);
+        }
+        self
+    }
+
+    fn sy(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::SY);
+        }
+        self
+    }
+
+    fn sydg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::SYDG);
+        }
+        self
+    }
+
+    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::S);
+        }
+        self
+    }
+
+    fn szdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::SDG);
+        }
         self
     }
 
     fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        self.cx_batch_pairs(pairs);
+        for &(c, t) in pairs {
+            self.queue_cx(c.index() as u32, t.index() as u32);
+        }
+        self
+    }
+
+    fn cy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        for &(c, t) in pairs {
+            self.queue_cy(c.index() as u32, t.index() as u32);
+        }
         self
     }
 
     fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        self.cz_batch_pairs(pairs);
+        for &(c, t) in pairs {
+            self.queue_cz(c.index() as u32, t.index() as u32);
+        }
+        self
+    }
+
+    fn swap(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        for &(q0, q1) in pairs {
+            self.queue_swap(q0.index() as u32, q1.index() as u32);
+        }
+        self
+    }
+
+    fn szz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // SZZ = RZZ(pi/2) -- reuse the existing RZZ shader
+        let theta = std::f32::consts::FRAC_PI_2;
+        for &(q0, q1) in pairs {
+            self.queue_rzz(q0.index() as u32, q1.index() as u32, theta);
+        }
+        self
+    }
+
+    fn szzdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // SZZdg = RZZ(-pi/2)
+        let theta = -std::f32::consts::FRAC_PI_2;
+        for &(q0, q1) in pairs {
+            self.queue_rzz(q0.index() as u32, q1.index() as u32, theta);
+        }
+        self
+    }
+
+    fn sxx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // SXX = RXX(pi/2) -- 1 dispatch instead of 5
+        let theta = std::f32::consts::FRAC_PI_2;
+        for &(q0, q1) in pairs {
+            self.queue_rxx(q0.index() as u32, q1.index() as u32, theta);
+        }
+        self
+    }
+
+    fn sxxdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let theta = -std::f32::consts::FRAC_PI_2;
+        for &(q0, q1) in pairs {
+            self.queue_rxx(q0.index() as u32, q1.index() as u32, theta);
+        }
+        self
+    }
+
+    fn syy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let theta = std::f32::consts::FRAC_PI_2;
+        for &(q0, q1) in pairs {
+            self.queue_ryy(q0.index() as u32, q1.index() as u32, theta);
+        }
+        self
+    }
+
+    fn syydg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let theta = -std::f32::consts::FRAC_PI_2;
+        for &(q0, q1) in pairs {
+            self.queue_ryy(q0.index() as u32, q1.index() as u32, theta);
+        }
         self
     }
 
-    #[allow(clippy::cast_possible_truncation)] // Qubit indices from QubitId fit in u32
     fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        self.flush_gates();
+        // NOTE(review): `mz_gpu` may still divert to `mz_cpu_path` for small states.
+        // Empirical mz path selection (RTX 4090 / PCIe 4.0, 2026-04-11).
+        // CPU batch wins only when the state fits in ~128KB (N<=14) and at
+        // least 2 qubits are measured. Above N=14, GPU sequential mz beats
+        // readback + CPU loop by 2-13x even at full M=N.
+        // M=1 always takes the GPU path: a single measurement amortizes the
+        // CPU readback poorly (one collapse vs N elements transferred), and
+        // the GPU reduction+collapse fuses into one submit.
+        // Re-run scripts/native_bench/bench_pecos for a different GPU.
+        if qubits.len() >= 2 && self.num_qubits <= 14 {
+            self.mz_cpu_batch(qubits)
+        } else {
+            self.mz_gpu_sequential(qubits)
+        }
+    }
+}
+
+impl GpuStateVec32 {
+    /// Read state, measure `qubits` in order on the CPU, write the collapsed state
+    /// back; returns one result per qubit. Skips path selection -- for benchmarks
+    /// and tests that need to force a specific path. Production code should call `mz()`.
+    pub fn mz_cpu_batch(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        const DET_EPS: f32 = 1e-6;
+
+        self.flush_gates();
+        let mut state_data = self.state();
+        let results: Vec<MeasurementResult> = qubits
+            .iter()
+            .map(|&q| {
+                let target_mask = 1usize << q.index();
+                // P(1): summed |amp|^2 over basis indices with this qubit's bit set.
+                let prob_one: f32 = state_data
+                    .iter()
+                    .enumerate()
+                    .filter(|(i, _)| i & target_mask != 0)
+                    .map(|(_, [re, im])| re * re + im * im)
+                    .sum();
+
+                // prob_one very close to 0 or 1 means the measurement outcome
+                // is forced by the state -- report it as deterministic.
+                let is_deterministic = !(DET_EPS..=1.0 - DET_EPS).contains(&prob_one);
+                // Draw an RNG sample only for a genuinely random outcome, as `mz_gpu` does.
+                let outcome = if is_deterministic {
+                    u32::from(prob_one > 0.5)
+                } else {
+                    let random: f32 = self.rng.random();
+                    u32::from(random < prob_one)
+                };
+                // Rescale the kept branch by 1/sqrt(its probability).
+                let norm_factor = if outcome == 1 {
+                    1.0 / prob_one.sqrt()
+                } else {
+                    1.0 / (1.0 - prob_one).sqrt()
+                };
+                // Collapse in the host copy so the next qubit sees the updated state.
+                for (i, amp) in state_data.iter_mut().enumerate() {
+                    let qubit_val = u32::from(i & target_mask != 0);
+                    if qubit_val == outcome {
+                        amp[0] *= norm_factor;
+                        amp[1] *= norm_factor;
+                    } else {
+                        *amp = [0.0, 0.0];
+                    }
+                }
+
+                MeasurementResult {
+                    outcome: outcome == 1,
+                    is_deterministic,
+                }
+            })
+            .collect();
+        // Single upload of the collapsed state after all qubits are measured.
+        self.queue
+            .write_buffer(&self.state_buffer, 0, bytemuck::cast_slice(&state_data));
+        results
+    }
+
+    /// Sequential per-qubit GPU measurement. Skips path selection -- intended
+    /// for benchmarking and tests that need to force a specific path.
+    /// Production code should call `mz()`.
+    pub fn mz_gpu_sequential(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        self.flush_gates();
         qubits
             .iter()
             .map(|&q| {
-                let outcome = self.mz_gpu(q.index() as u32);
+                #[allow(clippy::cast_possible_truncation)]
+                let (outcome, is_deterministic) = self.mz_gpu(q.index() as u32);
                 MeasurementResult {
                     outcome: outcome == 1,
-                    is_deterministic: false, // State vector sim is never deterministic unless in eigenstate
+                    is_deterministic,
                 }
             })
             .collect()
     }
 }
 
-impl ArbitraryRotationGateable for GpuStateVec {
+#[allow(clippy::cast_possible_truncation)] // Qubit indices from QubitId fit in u32
+impl ArbitraryRotationGateable for GpuStateVec32 {
     fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
         let theta = theta.to_radians_signed();
-        self.apply_single_gate_batch_qubits(qubits, gates::rx(theta));
+        let matrix = gates::rx(theta);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, matrix);
+        }
+        self
+    }
+
+    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        let theta = theta.to_radians_signed();
+        let matrix = gates::ry(theta);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, matrix);
+        }
         self
     }
 
     fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
         let theta = theta.to_radians_signed();
-        self.apply_single_gate_batch_qubits(qubits, gates::rz(theta));
+        let matrix = gates::rz(theta);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, matrix);
+        }
+        self
+    }
+
+    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::T);
+        }
+        self
+    }
+
+    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, gates::TDG);
+        }
+        self
+    }
+
+    #[allow(clippy::cast_possible_truncation)]
+    fn rxx(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let theta = theta.to_radians_signed() as f32;
+        for &(q0, q1) in pairs {
+            self.queue_rxx(q0.index() as u32, q1.index() as u32, theta);
+        }
         self
     }
 
+    #[allow(clippy::cast_possible_truncation)]
+    fn ryy(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let theta = theta.to_radians_signed() as f32;
+        for &(q0, q1) in pairs {
+            self.queue_ryy(q0.index() as u32, q1.index() as u32, theta);
+        }
+        self
+    }
+
+    #[allow(clippy::cast_possible_truncation)]
     fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        self.rzz_batch_pairs(theta, pairs);
+        let theta = theta.to_radians_signed() as f32;
+        for &(q0, q1) in pairs {
+            self.queue_rzz(q0.index() as u32, q1.index() as u32, theta);
+        }
         self
     }
 }
 
-crate::impl_gpu_drop!(GpuStateVec);
-
 #[cfg(test)]
 mod tests {
     use super::*;
     use pecos_core::qid;
     use pecos_simulators::CliffordGateable;
 
-    // Compile-time assertions that GpuStateVec is Send + Sync.
+    // Compile-time assertions that GpuStateVec32 is Send + Sync.
     // This is required for parallel Monte Carlo simulations.
     const _: fn() = || {
         fn assert_send<T: Send>() {}
         fn assert_sync<T: Sync>() {}
-        assert_send::<GpuStateVec>();
-        assert_sync::<GpuStateVec>();
+        assert_send::<GpuStateVec32>();
+        assert_sync::<GpuStateVec32>();
     };
 
     #[test]
     fn test_initial_state() {
         // Just test that we can create a simulator
-        let sim = GpuStateVec::new(2);
+        let sim = GpuStateVec32::new(2);
         assert!(sim.is_ok());
     }
 
     #[test]
     fn test_hadamard_creates_superposition() {
-        let mut sim = GpuStateVec::new(1).unwrap();
+        let mut sim = GpuStateVec32::new(1).unwrap();
         sim.h(&qid(0));
 
         // Measure many times - should get roughly 50/50
@@ -1174,7 +1922,7 @@ mod tests {
 
     #[test]
     fn test_bell_state() {
-        let mut sim = GpuStateVec::new(2).unwrap();
+        let mut sim = GpuStateVec32::new(2).unwrap();
 
         // Create Bell state: H(0), CX(0,1)
         // Should always measure same value on both qubits
@@ -1194,7 +1942,7 @@ mod tests {
     #[test]
     fn test_derived_clifford_gates() {
         // Test that we get derived gates from the CliffordGateable trait
-        let mut sim = GpuStateVec::new(2).unwrap();
+        let mut sim = GpuStateVec32::new(2).unwrap();
 
         // Test X gate (derived from H and Z, which is derived from SZ)
         sim.x(&qid(0)); // Should flip qubit 0 to |1>
@@ -1234,7 +1982,7 @@ mod tests {
     #[test]
     fn test_derived_rotation_gates() {
         // Test that we get derived gates from the ArbitraryRotationGateable trait
-        let mut sim = GpuStateVec::new(2).unwrap();
+        let mut sim = GpuStateVec32::new(2).unwrap();
 
         // Test RY gate (derived from RX and SZ)
         // RY(pi) should flip |0> to |1>
@@ -1258,7 +2006,7 @@ mod tests {
 
     /// Compare GPU and CPU state vectors with tolerance for f32 vs f64 precision.
     /// Returns the maximum absolute difference found.
-    fn compare_states(gpu: &GpuStateVec, cpu: &mut StateVec) -> f64 {
+    fn compare_states(gpu: &mut GpuStateVec32, cpu: &mut StateVec) -> f64 {
         let gpu_state = gpu.state();
         let cpu_state = cpu.state();
 
@@ -1294,10 +2042,10 @@ mod tests {
 
     #[test]
     fn test_compare_initial_state() {
-        let gpu = GpuStateVec::new(3).unwrap();
+        let mut gpu = GpuStateVec32::new(3).unwrap();
         let mut cpu = StateVec::new(3);
 
-        let max_diff = compare_states(&gpu, &mut cpu);
+        let max_diff = compare_states(&mut gpu, &mut cpu);
         assert!(
             max_diff < TOLERANCE,
             "Initial state mismatch: max_diff = {max_diff}"
@@ -1306,19 +2054,19 @@ mod tests {
 
     #[test]
     fn test_compare_hadamard() {
-        let mut gpu = GpuStateVec::new(2).unwrap();
+        let mut gpu = GpuStateVec32::new(2).unwrap();
         let mut cpu = StateVec::new(2);
 
         // H on qubit 0
         gpu.h(&qid(0));
         cpu.h(&qid(0));
-        let max_diff = compare_states(&gpu, &mut cpu);
+        let max_diff = compare_states(&mut gpu, &mut cpu);
         assert!(max_diff < TOLERANCE, "H(0) mismatch: max_diff = {max_diff}");
 
         // H on qubit 1
         gpu.h(&qid(1));
         cpu.h(&qid(1));
-        let max_diff = compare_states(&gpu, &mut cpu);
+        let max_diff = compare_states(&mut gpu, &mut cpu);
         assert!(
             max_diff < TOLERANCE,
             "H(0)H(1) mismatch: max_diff = {max_diff}"
@@ -1329,33 +2077,33 @@ mod tests {
     fn test_compare_pauli_gates() {
         // Test X gate
         {
-            let mut gpu = GpuStateVec::new(2).unwrap();
+            let mut gpu = GpuStateVec32::new(2).unwrap();
             let mut cpu = StateVec::new(2);
             gpu.x(&qid(0));
             cpu.x(&qid(0));
-            let max_diff = compare_states(&gpu, &mut cpu);
+            let max_diff = compare_states(&mut gpu, &mut cpu);
             assert!(max_diff < TOLERANCE, "X(0) mismatch: max_diff = {max_diff}");
         }
 
         // Test Y gate
         {
-            let mut gpu = GpuStateVec::new(2).unwrap();
+            let mut gpu = GpuStateVec32::new(2).unwrap();
             let mut cpu = StateVec::new(2);
             gpu.y(&qid(1));
             cpu.y(&qid(1));
-            let max_diff = compare_states(&gpu, &mut cpu);
+            let max_diff = compare_states(&mut gpu, &mut cpu);
             assert!(max_diff < TOLERANCE, "Y(1) mismatch: max_diff = {max_diff}");
         }
 
         // Test Z gate
         {
-            let mut gpu = GpuStateVec::new(2).unwrap();
+            let mut gpu = GpuStateVec32::new(2).unwrap();
             let mut cpu = StateVec::new(2);
             gpu.h(&qid(0)); // Put in superposition first so Z has an effect
             cpu.h(&qid(0));
             gpu.z(&qid(0));
             cpu.z(&qid(0));
-            let max_diff = compare_states(&gpu, &mut cpu);
+            let max_diff = compare_states(&mut gpu, &mut cpu);
             assert!(
                 max_diff < TOLERANCE,
                 "H(0)Z(0) mismatch: max_diff = {max_diff}"
@@ -1367,13 +2115,13 @@ mod tests {
     fn test_compare_phase_gates() {
         // Test S gate
         {
-            let mut gpu = GpuStateVec::new(1).unwrap();
+            let mut gpu = GpuStateVec32::new(1).unwrap();
             let mut cpu = StateVec::new(1);
             gpu.h(&qid(0));
             cpu.h(&qid(0));
             gpu.sz(&qid(0));
             cpu.sz(&qid(0));
-            let max_diff = compare_states(&gpu, &mut cpu);
+            let max_diff = compare_states(&mut gpu, &mut cpu);
             assert!(
                 max_diff < TOLERANCE,
                 "H(0)S(0) mismatch: max_diff = {max_diff}"
@@ -1382,13 +2130,13 @@ mod tests {
 
         // Test T gate
         {
-            let mut gpu = GpuStateVec::new(1).unwrap();
+            let mut gpu = GpuStateVec32::new(1).unwrap();
             let mut cpu = StateVec::new(1);
             gpu.h(&qid(0));
             cpu.h(&qid(0));
             gpu.t(&qid(0));
             cpu.t(&qid(0));
-            let max_diff = compare_states(&gpu, &mut cpu);
+            let max_diff = compare_states(&mut gpu, &mut cpu);
             assert!(
                 max_diff < TOLERANCE,
                 "H(0)T(0) mismatch: max_diff = {max_diff}"
@@ -1403,11 +2151,11 @@ mod tests {
         for &theta in &angles {
             // Test RX
             {
-                let mut gpu = GpuStateVec::new(1).unwrap();
+                let mut gpu = GpuStateVec32::new(1).unwrap();
                 let mut cpu = StateVec::new(1);
                 gpu.rx(Angle64::from_radians(theta), &qid(0));
                 cpu.rx(Angle64::from_radians(theta), &qid(0));
-                let max_diff = compare_states(&gpu, &mut cpu);
+                let max_diff = compare_states(&mut gpu, &mut cpu);
                 assert!(
                     max_diff < TOLERANCE,
                     "RX({theta}) mismatch: max_diff = {max_diff}"
@@ -1416,11 +2164,11 @@ mod tests {
 
             // Test RY
             {
-                let mut gpu = GpuStateVec::new(1).unwrap();
+                let mut gpu = GpuStateVec32::new(1).unwrap();
                 let mut cpu = StateVec::new(1);
                 gpu.ry(Angle64::from_radians(theta), &qid(0));
                 cpu.ry(Angle64::from_radians(theta), &qid(0));
-                let max_diff = compare_states(&gpu, &mut cpu);
+                let max_diff = compare_states(&mut gpu, &mut cpu);
                 assert!(
                     max_diff < TOLERANCE,
                     "RY({theta}) mismatch: max_diff = {max_diff}"
@@ -1429,13 +2177,13 @@ mod tests {
 
             // Test RZ
             {
-                let mut gpu = GpuStateVec::new(1).unwrap();
+                let mut gpu = GpuStateVec32::new(1).unwrap();
                 let mut cpu = StateVec::new(1);
                 gpu.h(&qid(0)); // Put in superposition so RZ has visible effect
                 cpu.h(&qid(0));
                 gpu.rz(Angle64::from_radians(theta), &qid(0));
                 cpu.rz(Angle64::from_radians(theta), &qid(0));
-                let max_diff = compare_states(&gpu, &mut cpu);
+                let max_diff = compare_states(&mut gpu, &mut cpu);
                 assert!(
                     max_diff < TOLERANCE,
                     "H RZ({theta}) mismatch: max_diff = {max_diff}"
@@ -1453,7 +2201,7 @@ mod tests {
                     continue;
                 }
 
-                let mut gpu = GpuStateVec::new(3).unwrap();
+                let mut gpu = GpuStateVec32::new(3).unwrap();
                 let mut cpu = StateVec::new(3);
 
                 // Create superposition on control
@@ -1464,7 +2212,7 @@ mod tests {
                 gpu.cx(&[(QubitId(control), QubitId(target))]);
                 cpu.cx(&[(QubitId(control), QubitId(target))]);
 
-                let max_diff = compare_states(&gpu, &mut cpu);
+                let max_diff = compare_states(&mut gpu, &mut cpu);
                 assert!(
                     max_diff < TOLERANCE,
                     "CX({control},{target}) mismatch: max_diff = {max_diff}"
@@ -1475,7 +2223,7 @@ mod tests {
 
     #[test]
     fn test_compare_cz_gate() {
-        let mut gpu = GpuStateVec::new(2).unwrap();
+        let mut gpu = GpuStateVec32::new(2).unwrap();
         let mut cpu = StateVec::new(2);
 
         // Create |++> state
@@ -1488,7 +2236,7 @@ mod tests {
         gpu.cz(&[(QubitId(0), QubitId(1))]);
         cpu.cz(&[(QubitId(0), QubitId(1))]);
 
-        let max_diff = compare_states(&gpu, &mut cpu);
+        let max_diff = compare_states(&mut gpu, &mut cpu);
         assert!(
             max_diff < TOLERANCE,
             "H(0)H(1)CZ(0,1) mismatch: max_diff = {max_diff}"
@@ -1500,7 +2248,7 @@ mod tests {
         let angles = [0.1, 0.5, 1.0, std::f64::consts::PI];
 
         for &theta in &angles {
-            let mut gpu = GpuStateVec::new(2).unwrap();
+            let mut gpu = GpuStateVec32::new(2).unwrap();
             let mut cpu = StateVec::new(2);
 
             // Create superposition
@@ -1513,7 +2261,7 @@ mod tests {
             gpu.rzz(Angle64::from_radians(theta), &[(QubitId(0), QubitId(1))]);
             cpu.rzz(Angle64::from_radians(theta), &[(QubitId(0), QubitId(1))]);
 
-            let max_diff = compare_states(&gpu, &mut cpu);
+            let max_diff = compare_states(&mut gpu, &mut cpu);
             assert!(
                 max_diff < TOLERANCE,
                 "RZZ({theta}) mismatch: max_diff = {max_diff}"
@@ -1524,7 +2272,7 @@ mod tests {
     #[test]
     fn test_compare_complex_circuit() {
         // Test a more complex circuit with multiple gates
-        let mut gpu = GpuStateVec::new(4).unwrap();
+        let mut gpu = GpuStateVec32::new(4).unwrap();
         let mut cpu = StateVec::new(4);
 
         // Layer 1: Hadamards
@@ -1559,7 +2307,7 @@ mod tests {
         gpu.cx(&[(QubitId(1), QubitId(2))]);
         cpu.cx(&[(QubitId(1), QubitId(2))]);
 
-        let max_diff = compare_states(&gpu, &mut cpu);
+        let max_diff = compare_states(&mut gpu, &mut cpu);
         assert!(
             max_diff < TOLERANCE,
             "Complex circuit mismatch: max_diff = {max_diff}"
@@ -1568,7 +2316,7 @@ mod tests {
 
     #[test]
     fn test_compare_reset() {
-        let mut gpu = GpuStateVec::new(2).unwrap();
+        let mut gpu = GpuStateVec32::new(2).unwrap();
         let mut cpu = StateVec::new(2);
 
         // Apply some gates
@@ -1581,7 +2329,7 @@ mod tests {
         gpu.reset();
         cpu.reset();
 
-        let max_diff = compare_states(&gpu, &mut cpu);
+        let max_diff = compare_states(&mut gpu, &mut cpu);
         assert!(
             max_diff < TOLERANCE,
             "Reset state mismatch: max_diff = {max_diff}"
diff --git a/crates/pecos-gpu-sims/src/gpu64.rs b/crates/pecos-gpu-sims/src/gpu64.rs
new file mode 100644
index 000000000..81387dd0d
--- /dev/null
+++ b/crates/pecos-gpu-sims/src/gpu64.rs
@@ -0,0 +1,1570 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under
+// the License.
+
+//! wgpu-based state vector simulator with f64 (double) precision.
+//!
+//! This is the f64 counterpart of [`GpuStateVec`](crate::GpuStateVec). It uses the
+//! `SHADER_F64` wgpu feature (Vulkan `shaderFloat64`) for full double-precision
+//! computation on the GPU.
+//!
+//! Note: f64 throughput on consumer GPUs is typically 1/64th of f32 throughput
+//! (NVIDIA disables most FP64 units on `GeForce` cards). This simulator is intended
+//! for precision-critical workloads and for benchmarking against cuStateVec (which
+//! also uses f64).
+
+use bytemuck::{Pod, Zeroable};
+use pecos_random::PecosRng;
+use rand::RngExt;
+use std::borrow::Cow;
+
+use crate::gates;
+use crate::gpu::{GpuError, RequiredFeature};
+use crate::gpu_probe::gpu_context;
+
+const UNIFORM_ALIGNMENT: usize = 256;
+const MAX_BATCH_SIZE: usize = 256;
+const ALIGNED_GATE_PARAMS_SIZE: usize = UNIFORM_ALIGNMENT;
+const MAX_WORKGROUPS_PER_DIM: u32 = 65535;
+
+/// Per-gate parameters for the f64 shaders (matches WGSL struct in `shaders_f64.wgsl`).
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct GateParams64 {
+    target_qubit: u32,
+    control_qubit: u32,
+    num_qubits: u32,
+    _padding: u32, // explicit pad: three u32s end at byte 12, `matrix` needs 8-byte alignment
+    // 2x2 complex matrix [[a, b], [c, d]] as a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im
+    matrix: [f64; 8],
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Pod, Zeroable)]
+struct MeasureParams64 {
+    target_qubit: u32,
+    outcome: u32,
+    norm_factor: f64,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq)]
+enum GatePipeline {
+    Single,
+    Diagonal,
+    CX,
+    CY,
+    CZ,
+    Swap,
+    Rxx,
+    Ryy,
+    Rzz,
+}
+
+#[derive(Clone)]
+struct QueuedGate {
+    pipeline: GatePipeline,
+    params: GateParams64,
+}
+
+/// Cross-platform GPU state vector simulator with f64 (double) precision.
+///
+/// Requires a GPU that supports Vulkan `shaderFloat64`.
+pub struct GpuStateVec64 {
+    device: wgpu::Device,
+    queue: wgpu::Queue,
+
+    num_qubits: u32,
+    num_amplitudes: usize, // 2^num_qubits
+
+    state_buffer: wgpu::Buffer, // 16 bytes per amplitude (re, im as f64)
+    params_buffer: wgpu::Buffer, // MAX_BATCH_SIZE slots of ALIGNED_GATE_PARAMS_SIZE bytes
+    measure_params_buffer: wgpu::Buffer,
+    staging_buffer: wgpu::Buffer, // MAP_READ | COPY_DST, same size as `state_buffer`
+
+    single_gate_pipeline: wgpu::ComputePipeline,
+    diagonal_gate_pipeline: wgpu::ComputePipeline,
+    cx_pipeline: wgpu::ComputePipeline,
+    cy_pipeline: wgpu::ComputePipeline,
+    cz_pipeline: wgpu::ComputePipeline,
+    swap_pipeline: wgpu::ComputePipeline,
+    rxx_pipeline: wgpu::ComputePipeline,
+    ryy_pipeline: wgpu::ComputePipeline,
+    rzz_pipeline: wgpu::ComputePipeline,
+    collapse_pipeline: wgpu::ComputePipeline,
+
+    // Bind group layouts: held to outlive the bind groups built from them
+    // (wgpu may keep only weak references). Underscore-prefixed = intentionally
+    // unread; their job is RAII lifetime, not direct use.
+    _gate_bind_group_layout: wgpu::BindGroupLayout,
+    _collapse_bind_group_layout: wgpu::BindGroupLayout,
+
+    gate_bind_group: wgpu::BindGroup,
+    collapse_bind_group: wgpu::BindGroup,
+    marginal_bind_group: wgpu::BindGroup,
+
+    partial_sums_buffer: wgpu::Buffer, // one f64 (8 bytes) per entry of `num_partial_sums`
+    _marginal_bind_group_layout: wgpu::BindGroupLayout,
+    marginal_pipeline: wgpu::ComputePipeline,
+    num_partial_sums: u64, // x * y of compute_workgroups(num_amplitudes)
+
+    // Persistent kernel, selected when num_qubits <= persistent_max_qubits
+    persistent_pipeline: wgpu::ComputePipeline,
+    _persistent_bind_group_layout: wgpu::BindGroupLayout,
+    persistent_bind_group: wgpu::BindGroup,
+    gate_queue_buffer: wgpu::Buffer, // (2 + MAX_BATCH_SIZE * 12) f64 values
+    persistent_max_qubits: u32, // ilog2(max_compute_workgroup_storage_size / 16), or 0
+
+    gate_queue: Vec<QueuedGate>, // gates waiting for the next flush
+    params_staging: Vec<u8>, // allocated as ALIGNED_GATE_PARAMS_SIZE * MAX_BATCH_SIZE bytes
+    rng: PecosRng,
+}
+
+impl GpuStateVec64 {
+    /// Split `ceil(num_elements / 256)` workgroups into an (x, y) grid within the per-dim cap.
+    fn compute_workgroups(num_elements: usize) -> (u32, u32) {
+        #[allow(clippy::cast_possible_truncation)]
+        let needed = num_elements.div_ceil(256) as u32;
+        if needed > MAX_WORKGROUPS_PER_DIM {
+            // Too many for one dimension: use the fewest rows, then the narrowest width.
+            let rows = needed.div_ceil(MAX_WORKGROUPS_PER_DIM);
+            return (needed.div_ceil(rows), rows);
+        }
+        (needed, 1)
+    }
+
+    /// Create a new f64 GPU state vector simulator.
+    ///
+    /// # Errors
+    /// Returns [`GpuError`] if `num_qubits > 29`, GPU init fails, or the GPU lacks f64 support.
+    #[allow(clippy::too_many_lines, clippy::similar_names)]
+    pub fn new(num_qubits: u32) -> Result<Self, GpuError> {
+        // 29 qubits = 2^29 * 16 bytes = 8 GiB (f64 complex is 16 bytes vs f32's 8)
+        if num_qubits > 29 {
+            return Err(GpuError::TooManyQubits {
+                requested: num_qubits,
+                max: 29,
+            });
+        }
+
+        let num_amplitudes = 1usize << num_qubits;
+
+        let ctx = gpu_context()?;
+        if !ctx.supports_f64 {
+            return Err(GpuError::UnsupportedFeature(RequiredFeature::ShaderF64));
+        }
+        let device = ctx.device;
+        let queue = ctx.queue;
+
+        // Largest qubit count whose state fits in workgroup storage (vec2<f64> = 16 bytes each).
+        let shared_mem_bytes = device.limits().max_compute_workgroup_storage_size;
+        let persistent_max_qubits = if shared_mem_bytes >= 16 {
+            (shared_mem_bytes / 16).ilog2()
+        } else {
+            0
+        };
+
+        let shader: wgpu::ShaderModule =
+            device.create_shader_module(wgpu::ShaderModuleDescriptor {
+                label: Some("Quantum simulation shaders (f64)"),
+                source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(include_str!("shaders_f64.wgsl"))),
+            });
+
+        // State buffer: 16 bytes per amplitude (2 x f64). Widen first: 32-bit usize would overflow.
+        let state_buffer_size = num_amplitudes as u64 * 16;
+        let state_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("State vector (f64)"),
+            size: state_buffer_size,
+            usage: wgpu::BufferUsages::STORAGE
+                | wgpu::BufferUsages::COPY_DST
+                | wgpu::BufferUsages::COPY_SRC,
+            mapped_at_creation: false,
+        });
+
+        let params_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Gate parameters (f64)"),
+            size: (ALIGNED_GATE_PARAMS_SIZE * MAX_BATCH_SIZE) as u64,
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        let measure_params_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Measure parameters (f64)"),
+            size: std::mem::size_of::<MeasureParams64>() as u64,
+            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Staging buffer (f64)"),
+            size: state_buffer_size,
+            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        // Bind group layouts (same structure as f32 version)
+        let gate_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Gate bind group layout (f64)"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: true,
+                            min_binding_size: std::num::NonZeroU64::new(std::mem::size_of::<
+                                GateParams64,
+                            >(
+                            )
+                                as u64),
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
+        let collapse_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Collapse bind group layout (f64)"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 3,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
+        let gate_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: Some("Gate pipeline layout (f64)"),
+            bind_group_layouts: &[Some(&gate_bind_group_layout)],
+            immediate_size: 0,
+        });
+
+        let collapse_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("Collapse pipeline layout (f64)"),
+                bind_group_layouts: &[Some(&collapse_bind_group_layout)],
+                immediate_size: 0,
+            });
+
+        let make_pipeline = |label, entry_point, layout: &wgpu::PipelineLayout| {
+            device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: Some(label),
+                layout: Some(layout),
+                module: &shader,
+                entry_point: Some(entry_point),
+                compilation_options: wgpu::PipelineCompilationOptions::default(),
+                cache: None,
+            })
+        };
+
+        let single_gate_pipeline = make_pipeline(
+            "Single gate (f64)",
+            "apply_single_gate",
+            &gate_pipeline_layout,
+        );
+        let diagonal_gate_pipeline = make_pipeline(
+            "Diagonal gate (f64)",
+            "apply_diagonal_gate",
+            &gate_pipeline_layout,
+        );
+        let cx_pipeline = make_pipeline("CX (f64)", "apply_cx", &gate_pipeline_layout);
+        let cy_pipeline = make_pipeline("CY (f64)", "apply_cy", &gate_pipeline_layout);
+        let cz_pipeline = make_pipeline("CZ (f64)", "apply_cz", &gate_pipeline_layout);
+        let swap_pipeline = make_pipeline("SWAP (f64)", "apply_swap", &gate_pipeline_layout);
+        let rxx_pipeline = make_pipeline("RXX (f64)", "apply_rxx", &gate_pipeline_layout);
+        let ryy_pipeline = make_pipeline("RYY (f64)", "apply_ryy", &gate_pipeline_layout);
+        let rzz_pipeline = make_pipeline("RZZ (f64)", "apply_rzz", &gate_pipeline_layout);
+        let collapse_pipeline = make_pipeline(
+            "Collapse (f64)",
+            "collapse_state",
+            &collapse_pipeline_layout,
+        );
+
+        let gate_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Gate bind group (f64)"),
+            layout: &gate_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: state_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: wgpu::BindingResource::Buffer(wgpu::BufferBinding {
+                        buffer: &params_buffer,
+                        offset: 0,
+                        size: std::num::NonZeroU64::new(std::mem::size_of::<GateParams64>() as u64),
+                    }),
+                },
+            ],
+        });
+
+        let collapse_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Collapse bind group (f64)"),
+            layout: &collapse_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: state_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: params_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 3,
+                    resource: measure_params_buffer.as_entire_binding(),
+                },
+            ],
+        });
+
+        // Marginal probability reduction (partial sums are f64)
+        let (meas_wg_x, meas_wg_y) = Self::compute_workgroups(num_amplitudes);
+        let num_partial_sums = u64::from(meas_wg_x) * u64::from(meas_wg_y);
+
+        let partial_sums_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Marginal partial sums (f64)"),
+            size: num_partial_sums * 8, // f64 = 8 bytes
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+            mapped_at_creation: false,
+        });
+
+        let marginal_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Marginal bind group layout (f64)"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 4,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
+        let marginal_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("Marginal pipeline layout (f64)"),
+                bind_group_layouts: &[Some(&marginal_bind_group_layout)],
+                immediate_size: 0,
+            });
+
+        let marginal_pipeline = make_pipeline(
+            "Marginal probability (f64)",
+            "reduce_marginal_probability",
+            &marginal_pipeline_layout,
+        );
+
+        let marginal_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Marginal bind group (f64)"),
+            layout: &marginal_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: state_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 1,
+                    resource: params_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 4,
+                    resource: partial_sums_buffer.as_entire_binding(),
+                },
+            ],
+        });
+
+        // Persistent kernel: gate queue as array<f64>
+        // 12 f64 per gate (type + tgt + ctrl + pad + 8 matrix elements) + 2 f64 header
+        let gate_queue_buffer_size = (2 + MAX_BATCH_SIZE * 12) * 8;
+        let gate_queue_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Persistent gate queue (f64)"),
+            size: gate_queue_buffer_size as u64,
+            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        let persistent_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: Some("Persistent bind group layout (f64)"),
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: false },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 5,
+                        visibility: wgpu::ShaderStages::COMPUTE,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: true },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+
+        let persistent_pipeline_layout =
+            device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: Some("Persistent pipeline layout (f64)"),
+                bind_group_layouts: &[Some(&persistent_bind_group_layout)],
+                immediate_size: 0,
+            });
+
+        // Compile persistent kernel shader with dynamic shared memory size
+        let shared_size = 1u32 << persistent_max_qubits;
+        let persistent_shader_src = include_str!("persistent_kernel_f64.wgsl")
+            .replace("{SHARED_SIZE}", &shared_size.to_string());
+        let persistent_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: Some("Persistent kernel shader (f64)"),
+            source: wgpu::ShaderSource::Wgsl(Cow::Owned(persistent_shader_src)),
+        });
+
+        let persistent_pipeline =
+            device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: Some("Persistent kernel pipeline (f64)"),
+                layout: Some(&persistent_pipeline_layout),
+                module: &persistent_shader,
+                entry_point: Some("apply_gate_queue_persistent"),
+                compilation_options: wgpu::PipelineCompilationOptions::default(),
+                cache: None,
+            });
+
+        let persistent_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: Some("Persistent bind group (f64)"),
+            layout: &persistent_bind_group_layout,
+            entries: &[
+                wgpu::BindGroupEntry {
+                    binding: 0,
+                    resource: state_buffer.as_entire_binding(),
+                },
+                wgpu::BindGroupEntry {
+                    binding: 5,
+                    resource: gate_queue_buffer.as_entire_binding(),
+                },
+            ],
+        });
+
+        let mut sim = Self {
+            device,
+            queue,
+            num_qubits,
+            num_amplitudes,
+            state_buffer,
+            params_buffer,
+            measure_params_buffer,
+            staging_buffer,
+            single_gate_pipeline,
+            diagonal_gate_pipeline,
+            cx_pipeline,
+            cy_pipeline,
+            cz_pipeline,
+            swap_pipeline,
+            rxx_pipeline,
+            ryy_pipeline,
+            rzz_pipeline,
+            collapse_pipeline,
+            _gate_bind_group_layout: gate_bind_group_layout,
+            _collapse_bind_group_layout: collapse_bind_group_layout,
+            gate_bind_group,
+            collapse_bind_group,
+            marginal_bind_group,
+            partial_sums_buffer,
+            _marginal_bind_group_layout: marginal_bind_group_layout,
+            marginal_pipeline,
+            num_partial_sums,
+            persistent_pipeline,
+            _persistent_bind_group_layout: persistent_bind_group_layout,
+            persistent_bind_group,
+            gate_queue_buffer,
+            persistent_max_qubits,
+            gate_queue: Vec::with_capacity(MAX_BATCH_SIZE),
+            params_staging: vec![0u8; ALIGNED_GATE_PARAMS_SIZE * MAX_BATCH_SIZE],
+            rng: rand::make_rng(),
+        };
+
+        sim.reset();
+        Ok(sim)
+    }
+
+    /// Like [`Self::new`], but replaces the RNG with one seeded from `seed` for reproducibility.
+    ///
+    /// # Errors
+    /// Returns [`GpuError`] if GPU init fails (no adapter, no `SHADER_F64`, or too many qubits).
+    pub fn with_seed(num_qubits: u32, seed: u64) -> Result<Self, GpuError> {
+        Self::new(num_qubits).map(|mut sim| {
+            sim.rng = PecosRng::seed_from_u64(seed);
+            sim
+        })
+    }
+
+    /// Clear the gate queue and upload the basis state |0...0> (amplitude 1 at index 0).
+    pub fn reset(&mut self) {
+        self.gate_queue.clear();
+        let mut amps = vec![[0.0f64; 2]; self.num_amplitudes];
+        amps[0] = [1.0, 0.0];
+        self.queue.write_buffer(&self.state_buffer, 0, bytemuck::cast_slice(&amps));
+    }
+
+    // -- Gate fusion --
+
+    /// Multiply two 2x2 complex matrices (`a * b`).
+    ///
+    /// Both operands and the result use the flattened layout
+    /// [`a_re`, `a_im`, `b_re`, `b_im`, `c_re`, `c_im`, `d_re`, `d_im`], i.e. row-major
+    /// entries with interleaved real and imaginary parts.
+    fn matrix_mul(a: &[f64; 8], b: &[f64; 8]) -> [f64; 8] {
+        // Complex product (xr + xi*i) * (yr + yi*i) on (re, im) pairs.
+        #[inline]
+        fn cmul(x: (f64, f64), y: (f64, f64)) -> (f64, f64) {
+            (x.0 * y.0 - x.1 * y.1, x.0 * y.1 + x.1 * y.0)
+        }
+
+        // Entry `k` (0..4, row-major) of a flattened matrix as a (re, im) pair.
+        #[inline]
+        fn at(m: &[f64; 8], k: usize) -> (f64, f64) {
+            (m[2 * k], m[2 * k + 1])
+        }
+
+        let mut out = [0.0f64; 8];
+        for row in 0..2 {
+            for col in 0..2 {
+                // out[row][col] = a[row][0] * b[0][col] + a[row][1] * b[1][col]
+                let first = cmul(at(a, 2 * row), at(b, col));
+                let second = cmul(at(a, 2 * row + 1), at(b, 2 + col));
+                let k = 2 * row + col;
+                out[2 * k] = first.0 + second.0;
+                out[2 * k + 1] = first.1 + second.1;
+            }
+        }
+
+        out
+    }
+
+    /// Cluster single-qubit gates by target so that fusible gates become adjacent.
+    ///
+    /// Single-qubit gates acting on different qubits commute, so inside a maximal
+    /// run of single-qubit gates their relative order across qubits may change
+    /// freely. Two-qubit gates are barriers: they keep their position and no gate
+    /// is moved past them.
+    ///
+    /// Example: `H(0), H(1), RZ(0), RZ(1)` becomes `H(0), RZ(0), H(1), RZ(1)`.
+    fn reorder_for_fusion(queue: &mut [QueuedGate]) {
+        #[inline]
+        fn is_single_qubit(gate: &QueuedGate) -> bool {
+            matches!(gate.pipeline, GatePipeline::Single | GatePipeline::Diagonal)
+        }
+
+        let mut run_start = 0;
+        while run_start < queue.len() {
+            // Length of the maximal single-qubit run that begins at `run_start`;
+            // zero means the gate there is a two-qubit barrier.
+            let run_len = queue[run_start..]
+                .iter()
+                .take_while(|gate| is_single_qubit(gate))
+                .count();
+            if run_len == 0 {
+                run_start += 1;
+                continue;
+            }
+
+            // `sort_by_key` is stable, so gates on the same qubit keep their order.
+            let run_end = run_start + run_len;
+            queue[run_start..run_end].sort_by_key(|gate| gate.params.target_qubit);
+
+            // Resume after the run; the element at `run_end` (if any) is a barrier.
+            run_start = run_end;
+        }
+    }
+
+    /// Collapse runs of adjacent single-qubit gates on one qubit into a single gate.
+    ///
+    /// The queue is first reordered in place by [`Self::reorder_for_fusion`]. Each run's
+    /// matrices are multiplied so the earliest gate is applied first, and the fused gate
+    /// is tagged `Diagonal` when both off-diagonal entries are exactly zero.
+    /// Two-qubit gates are copied through unchanged. Returns the (possibly shorter) queue.
+    fn fuse_gate_queue(queue: &mut [QueuedGate]) -> Vec<QueuedGate> {
+        #[inline]
+        fn is_1q(gate: &QueuedGate) -> bool {
+            matches!(gate.pipeline, GatePipeline::Single | GatePipeline::Diagonal)
+        }
+
+        Self::reorder_for_fusion(queue);
+        // Zero or one gate: nothing to fuse, hand the queue back as-is.
+        if queue.len() <= 1 {
+            return queue.to_vec();
+        }
+
+        let mut fused = Vec::with_capacity(queue.len());
+        let mut rest: &[QueuedGate] = queue;
+        while let Some((head, tail)) = rest.split_first() {
+            // Two-qubit gates pass through untouched.
+            if !is_1q(head) {
+                fused.push(head.clone());
+                rest = tail;
+                continue;
+            }
+
+            // Count the following gates that are single-qubit and hit the same target.
+            let target = head.params.target_qubit;
+            let run_len = tail
+                .iter()
+                .take_while(|g| is_1q(g) && g.params.target_qubit == target)
+                .count();
+
+            // Later gates multiply from the left: U = G_n * ... * G_1.
+            let matrix = tail[..run_len]
+                .iter()
+                .fold(head.params.matrix, |acc, g| Self::matrix_mul(&g.params.matrix, &acc));
+
+            // Indices 2..6 hold the off-diagonal entries b and c (re, im each).
+            let off_diagonal_is_zero = matrix[2..6].iter().all(|&x| x == 0.0);
+            fused.push(QueuedGate {
+                pipeline: if off_diagonal_is_zero {
+                    GatePipeline::Diagonal
+                } else {
+                    GatePipeline::Single
+                },
+                params: GateParams64 {
+                    target_qubit: target,
+                    control_qubit: 0,
+                    num_qubits: head.params.num_qubits,
+                    _padding: 0,
+                    matrix,
+                },
+            });
+
+            rest = &tail[run_len..];
+        }
+
+        fused
+    }
+
+    // -- Gate queue methods --
+
+    /// Encode fused gates into the persistent kernel's storage buffer format.
+    /// Buffer is array<f64>. Metadata fields stored as f64-encoded u32 values.
+    /// Header: [`num_gates`, `num_qubits`]; each gate: 12 f64 [type, tgt, ctrl, pad, matrix(8)].
+    ///
+    /// Values are written byte-wise (native endian): `staging` is a `Vec<u8>` with no
+    /// 8-byte alignment guarantee, so viewing it as `&mut [f64]` via bytemuck could panic.
+    fn encode_persistent_queue_f64(
+        fused: &[QueuedGate],
+        num_qubits: u32,
+        staging: &mut Vec<u8>,
+    ) -> usize {
+        const F64_BYTES: usize = std::mem::size_of::<f64>();
+        let total_bytes = (2 + fused.len() * 12) * F64_BYTES;
+        if staging.len() < total_bytes {
+            staging.resize(total_bytes, 0);
+        }
+
+        let mut put = |index: usize, value: f64| {
+            staging[index * F64_BYTES..][..F64_BYTES].copy_from_slice(&value.to_ne_bytes());
+        };
+
+        #[allow(clippy::cast_precision_loss)] // num_gates <= MAX_BATCH_SIZE (256), safe for f64
+        let gate_count = fused.len() as f64;
+        put(0, gate_count);
+        put(1, f64::from(num_qubits));
+        for (i, gate) in fused.iter().enumerate() {
+            let kind = match gate.pipeline {
+                GatePipeline::Single => 0.0,
+                GatePipeline::Diagonal => 1.0,
+                GatePipeline::CX => 2.0,
+                GatePipeline::CY => 3.0,
+                GatePipeline::CZ => 4.0,
+                GatePipeline::Swap => 5.0,
+                GatePipeline::Rxx => 6.0,
+                GatePipeline::Ryy => 7.0,
+                GatePipeline::Rzz => 8.0,
+            };
+            let (tgt, ctrl) = (gate.params.target_qubit, gate.params.control_qubit);
+            let head = [kind, f64::from(tgt), f64::from(ctrl), 0.0];
+            for (k, &value) in head.iter().chain(&gate.params.matrix).enumerate() {
+                put(2 + i * 12 + k, value);
+            }
+        }
+
+        total_bytes
+    }
+
+    /// Record every queued gate into a dedicated command encoder and submit it.
+    ///
+    /// Returns immediately when the queue is empty, so no encoder is created or
+    /// submitted in that case. The device is not polled here.
+    fn flush_gates(&mut self) {
+        if !self.gate_queue.is_empty() {
+            let descriptor = wgpu::CommandEncoderDescriptor {
+                label: Some("Flush gates encoder (f64)"),
+            };
+            let mut encoder = self.device.create_command_encoder(&descriptor);
+            self.record_flush_gates(&mut encoder);
+            self.queue.submit(std::iter::once(encoder.finish()));
+        }
+    }
+
+    /// Record the pending gate queue into `encoder` and clear the queue.
+    ///
+    /// The queue is first fused, then recorded in one of two ways:
+    ///
+    /// * persistent path (`num_qubits <= persistent_max_qubits`): the fused gates
+    ///   are encoded into `gate_queue_buffer` and a single workgroup is dispatched;
+    /// * regular path: each fused gate's params are uploaded at an
+    ///   `ALIGNED_GATE_PARAMS_SIZE` stride and one dispatch is recorded per gate.
+    ///
+    /// Parameter uploads go through `queue.write_buffer` rather than the encoder.
+    /// The caller is responsible for finishing and submitting `encoder`.
+    /// Does nothing when the queue is empty.
+    fn record_flush_gates(&mut self, encoder: &mut wgpu::CommandEncoder) {
+        if self.gate_queue.is_empty() {
+            return;
+        }
+
+        // Fuse consecutive single-qubit gates on the same qubit
+        // NOTE(review): whether `fuse_gate_queue` drains the queue is not visible
+        // here; both paths below clear it explicitly either way.
+        let fused = Self::fuse_gate_queue(&mut self.gate_queue);
+
+        // Use persistent kernel if state fits in shared memory
+        if self.num_qubits <= self.persistent_max_qubits {
+            let total_bytes = Self::encode_persistent_queue_f64(
+                &fused,
+                self.num_qubits,
+                &mut self.params_staging,
+            );
+            self.queue.write_buffer(
+                &self.gate_queue_buffer,
+                0,
+                &self.params_staging[..total_bytes],
+            );
+
+            {
+                let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                    label: Some("Persistent kernel pass (f64)"),
+                    timestamp_writes: None,
+                });
+                pass.set_pipeline(&self.persistent_pipeline);
+                pass.set_bind_group(0, &self.persistent_bind_group, &[]);
+                // One workgroup processes the whole encoded gate list.
+                pass.dispatch_workgroups(1, 1, 1);
+            }
+
+            self.gate_queue.clear();
+            return;
+        }
+
+        // Regular path: N dispatches into this encoder
+        // NOTE(review): `params_staging` is not resized on this path; this assumes
+        // it already holds at least `fused.len() * ALIGNED_GATE_PARAMS_SIZE` bytes
+        // -- confirm against its allocation.
+        let aligned = ALIGNED_GATE_PARAMS_SIZE;
+        let total_size = fused.len() * aligned;
+        for (i, gate) in fused.iter().enumerate() {
+            let offset = i * aligned;
+            let bytes = bytemuck::bytes_of(&gate.params);
+            self.params_staging[offset..offset + bytes.len()].copy_from_slice(bytes);
+        }
+        self.queue
+            .write_buffer(&self.params_buffer, 0, &self.params_staging[..total_size]);
+
+        {
+            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: Some("Batched gate pass (f64)"),
+                timestamp_writes: None,
+            });
+
+            // The `Single` pipeline is dispatched over amplitude pairs; every
+            // other pipeline is dispatched over all amplitudes (see the match
+            // on `gate.pipeline` below).
+            let num_pairs = self.num_amplitudes / 2;
+            let (pair_wg_x, pair_wg_y) = Self::compute_workgroups(num_pairs);
+            let (amp_wg_x, amp_wg_y) = Self::compute_workgroups(self.num_amplitudes);
+
+            // Tracks the pipeline currently bound so it is only re-set on change.
+            let mut current_pipeline = None;
+
+            for (i, gate) in fused.iter().enumerate() {
+                if current_pipeline != Some(gate.pipeline) {
+                    let pipeline = match gate.pipeline {
+                        GatePipeline::Single => &self.single_gate_pipeline,
+                        GatePipeline::Diagonal => &self.diagonal_gate_pipeline,
+                        GatePipeline::CX => &self.cx_pipeline,
+                        GatePipeline::CY => &self.cy_pipeline,
+                        GatePipeline::CZ => &self.cz_pipeline,
+                        GatePipeline::Swap => &self.swap_pipeline,
+                        GatePipeline::Rxx => &self.rxx_pipeline,
+                        GatePipeline::Ryy => &self.ryy_pipeline,
+                        GatePipeline::Rzz => &self.rzz_pipeline,
+                    };
+                    pass.set_pipeline(pipeline);
+                    current_pipeline = Some(gate.pipeline);
+                }
+
+                // Select this gate's params via a dynamic offset into the shared
+                // params buffer (same stride used for the upload above).
+                let offset = u32::try_from(i * ALIGNED_GATE_PARAMS_SIZE)
+                    .expect("batch offset always fits in u32 (i < MAX_BATCH_SIZE)");
+                pass.set_bind_group(0, &self.gate_bind_group, &[offset]);
+
+                let (wg_x, wg_y) = match gate.pipeline {
+                    GatePipeline::Single => (pair_wg_x, pair_wg_y),
+                    _ => (amp_wg_x, amp_wg_y),
+                };
+                pass.dispatch_workgroups(wg_x, wg_y, 1);
+            }
+        }
+
+        self.gate_queue.clear();
+    }
+
+    /// Flush all queued gates, then poll the device with
+    /// `PollType::wait_indefinitely()`.
+    ///
+    /// The poll result is discarded, so a failed poll is not reported to the
+    /// caller (unlike the readback paths, which `expect` on it).
+    pub fn sync(&mut self) {
+        self.flush_gates();
+        let _ = self.device.poll(wgpu::PollType::wait_indefinitely());
+    }
+
+    /// Widen an f32 gate matrix to the f64 layout used by the params struct.
+    ///
+    /// Every entry is converted exactly, so the result carries only f32 accuracy
+    /// even though it is stored as f64.
+    fn matrix_f32_to_f64(m: [f32; 8]) -> [f64; 8] {
+        m.map(f64::from)
+    }
+
+    /// Queue a single-qubit gate on `qubit`, flushing once the batch is full.
+    ///
+    /// The pipeline is chosen from the matrix itself: a gate whose off-diagonal
+    /// entries are all exactly zero is routed to the diagonal pipeline.
+    fn queue_single_gate(&mut self, qubit: u32, matrix: [f64; 8]) {
+        // Slots 2..=5 hold the off-diagonal entries (b and c). When they are all
+        // zero the gate is diagonal and can use the specialized diagonal shader:
+        // half the arithmetic, fully coalesced.
+        let pipeline = if matrix[2..6].iter().all(|&entry| entry == 0.0) {
+            GatePipeline::Diagonal
+        } else {
+            GatePipeline::Single
+        };
+
+        let params = GateParams64 {
+            target_qubit: qubit,
+            control_qubit: 0,
+            num_qubits: self.num_qubits,
+            _padding: 0,
+            matrix,
+        };
+        self.gate_queue.push(QueuedGate { pipeline, params });
+
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
+        }
+    }
+
+    /// Push one two-qubit gate onto the queue, flushing once the batch is full.
+    ///
+    /// Shared by every two-qubit `queue_*` entry point so the params layout and
+    /// the batch-size check live in exactly one place.
+    fn enqueue_two_qubit(
+        &mut self,
+        pipeline: GatePipeline,
+        control_qubit: u32,
+        target_qubit: u32,
+        matrix: [f64; 8],
+    ) {
+        self.gate_queue.push(QueuedGate {
+            pipeline,
+            params: GateParams64 {
+                target_qubit,
+                control_qubit,
+                num_qubits: self.num_qubits,
+                _padding: 0,
+                matrix,
+            },
+        });
+        if self.gate_queue.len() >= MAX_BATCH_SIZE {
+            self.flush_gates();
+        }
+    }
+
+    /// Pack a rotation angle as `[cos(theta/2), sin(theta/2), 0, 0, 0, 0, 0, 0]`.
+    ///
+    /// Precompute cos/sin on the CPU -- wgpu+Vulkan doesn't reliably support
+    /// f64 transcendental functions in the shader. Pass (c, s) as f64 instead.
+    fn half_angle_params(theta: f64) -> [f64; 8] {
+        let half = theta / 2.0;
+        [half.cos(), half.sin(), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+    }
+
+    /// Queue a CX gate; the matrix slots are unused and zeroed.
+    fn queue_cx(&mut self, control: u32, target: u32) {
+        self.enqueue_two_qubit(GatePipeline::CX, control, target, [0.0; 8]);
+    }
+
+    /// Queue a CZ gate; the matrix slots are unused and zeroed.
+    fn queue_cz(&mut self, control: u32, target: u32) {
+        self.enqueue_two_qubit(GatePipeline::CZ, control, target, [0.0; 8]);
+    }
+
+    /// Queue a CY gate; the matrix slots are unused and zeroed.
+    fn queue_cy(&mut self, control: u32, target: u32) {
+        self.enqueue_two_qubit(GatePipeline::CY, control, target, [0.0; 8]);
+    }
+
+    /// Queue a SWAP; `qubit0` goes in the control slot and `qubit1` in the
+    /// target slot of the params struct.
+    fn queue_swap(&mut self, qubit0: u32, qubit1: u32) {
+        self.enqueue_two_qubit(GatePipeline::Swap, qubit0, qubit1, [0.0; 8]);
+    }
+
+    /// Queue an RXX rotation by `theta` on (`qubit0`, `qubit1`).
+    fn queue_rxx(&mut self, qubit0: u32, qubit1: u32, theta: f64) {
+        let matrix = Self::half_angle_params(theta);
+        self.enqueue_two_qubit(GatePipeline::Rxx, qubit0, qubit1, matrix);
+    }
+
+    /// Queue an RYY rotation by `theta` on (`qubit0`, `qubit1`).
+    fn queue_ryy(&mut self, qubit0: u32, qubit1: u32, theta: f64) {
+        let matrix = Self::half_angle_params(theta);
+        self.enqueue_two_qubit(GatePipeline::Ryy, qubit0, qubit1, matrix);
+    }
+
+    /// Queue an RZZ rotation by `theta` on (`qubit0`, `qubit1`).
+    fn queue_rzz(&mut self, qubit0: u32, qubit1: u32, theta: f64) {
+        let matrix = Self::half_angle_params(theta);
+        self.enqueue_two_qubit(GatePipeline::Rzz, qubit0, qubit1, matrix);
+    }
+
+    // -- Measurement --
+
+    /// Measure `qubit` on the CPU: read the full state back, pick an outcome,
+    /// collapse and renormalize in host memory, then upload the result.
+    ///
+    /// Returns `(outcome, is_deterministic)`. The outcome is treated as
+    /// deterministic when the probability of reading 1 lies outside
+    /// `[DET_EPS, 1 - DET_EPS]`; the RNG is only consulted otherwise.
+    #[allow(clippy::too_many_lines, clippy::cast_possible_truncation)]
+    fn mz_cpu_path(&mut self, qubit: u32) -> (u32, bool) {
+        const DET_EPS: f64 = 1e-10;
+
+        // `state()` also records any still-queued gates before the readback.
+        let mut amplitudes = self.state();
+        let target_mask = 1usize << qubit;
+
+        // Total weight of the basis states in which the target bit is set.
+        let prob_one: f64 = amplitudes
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, amp)| {
+                ((idx & target_mask) != 0).then(|| amp[0] * amp[0] + amp[1] * amp[1])
+            })
+            .sum();
+
+        let is_deterministic = !(DET_EPS..=1.0 - DET_EPS).contains(&prob_one);
+        let outcome = if is_deterministic {
+            u32::from(prob_one > 0.5)
+        } else {
+            let draw: f64 = self.rng.random();
+            u32::from(draw < prob_one)
+        };
+
+        // Renormalize by the probability of the branch that survives.
+        let kept_prob = if outcome == 1 {
+            prob_one
+        } else {
+            1.0 - prob_one
+        };
+        let norm_factor = 1.0 / kept_prob.sqrt();
+
+        for (idx, amp) in amplitudes.iter_mut().enumerate() {
+            let bit = u32::from((idx & target_mask) != 0);
+            if bit != outcome {
+                *amp = [0.0, 0.0];
+                continue;
+            }
+            amp[0] *= norm_factor;
+            amp[1] *= norm_factor;
+        }
+
+        let bytes: &[u8] = bytemuck::cast_slice(&amplitudes);
+        self.queue.write_buffer(&self.state_buffer, 0, bytes);
+
+        (outcome, is_deterministic)
+    }
+
+    /// Measure `qubit`, returning `(outcome, is_deterministic)`.
+    ///
+    /// When `num_qubits <= persistent_max_qubits` this delegates to
+    /// `mz_cpu_path`. Otherwise it runs in two GPU submissions:
+    ///
+    /// 1. a marginal-probability pass whose partial sums are copied to the
+    ///    staging buffer, mapped, and summed on the CPU to get P(1);
+    /// 2. after the outcome is chosen on the CPU, a collapse pass driven by
+    ///    `MeasureParams64` (target, outcome, normalization factor).
+    ///
+    /// The second submission is not waited on.
+    ///
+    /// NOTE(review): this function overwrites `params_buffer` at offset 0 and
+    /// does not flush the gate queue itself; the visible caller
+    /// (`mz_gpu_sequential`) flushes first -- confirm for any other caller.
+    ///
+    /// # Panics
+    /// Panics if the device poll for the readback fails.
+    fn mz_gpu(&mut self, qubit: u32) -> (u32, bool) {
+        const DET_EPS: f64 = 1e-10;
+
+        if self.num_qubits <= self.persistent_max_qubits {
+            return self.mz_cpu_path(qubit);
+        }
+        // Only the target qubit and qubit count are set here; the matrix
+        // slots are zeroed.
+        let params = GateParams64 {
+            target_qubit: qubit,
+            control_qubit: 0,
+            num_qubits: self.num_qubits,
+            _padding: 0,
+            matrix: [0.0; 8],
+        };
+        self.queue
+            .write_buffer(&self.params_buffer, 0, bytemuck::bytes_of(&params));
+
+        let mut encoder = self
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                label: Some("Measurement encoder (f64)"),
+            });
+
+        {
+            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: Some("Marginal probability pass (f64)"),
+                timestamp_writes: None,
+            });
+            pass.set_pipeline(&self.marginal_pipeline);
+            pass.set_bind_group(0, &self.marginal_bind_group, &[]);
+            let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes);
+            pass.dispatch_workgroups(wg_x, wg_y, 1);
+        }
+
+        // Copy only the partial sums (not the whole state) back to the host.
+        let readback_size = self.num_partial_sums * 8; // f64
+        encoder.copy_buffer_to_buffer(
+            &self.partial_sums_buffer,
+            0,
+            &self.staging_buffer,
+            0,
+            readback_size,
+        );
+        self.queue.submit(std::iter::once(encoder.finish()));
+
+        // The map callback ignores its result; a failed mapping would surface
+        // when the mapped range is requested below.
+        let buffer_slice = self.staging_buffer.slice(..readback_size);
+        buffer_slice.map_async(wgpu::MapMode::Read, |_| {});
+        self.device
+            .poll(wgpu::PollType::wait_indefinitely())
+            .expect("GPU device poll failed");
+
+        // Final reduction of the per-workgroup partial sums happens on the CPU.
+        let prob_one: f64 = {
+            let data = buffer_slice.get_mapped_range();
+            let partial_sums: &[f64] = bytemuck::cast_slice(&data);
+            partial_sums.iter().sum()
+        };
+        self.staging_buffer.unmap();
+
+        // Outside [DET_EPS, 1 - DET_EPS] the outcome is treated as fixed and the
+        // RNG is not consulted.
+        let is_deterministic = !(DET_EPS..=1.0 - DET_EPS).contains(&prob_one);
+        let outcome: u32 = if is_deterministic {
+            u32::from(prob_one > 0.5)
+        } else {
+            let random: f64 = self.rng.random();
+            u32::from(random < prob_one)
+        };
+
+        // Collapse
+        // Renormalize by the probability of the branch that survives.
+        let norm_factor = if outcome == 1 {
+            1.0 / prob_one.sqrt()
+        } else {
+            1.0 / (1.0 - prob_one).sqrt()
+        };
+
+        let measure_params = MeasureParams64 {
+            target_qubit: qubit,
+            outcome,
+            norm_factor,
+        };
+        self.queue.write_buffer(
+            &self.measure_params_buffer,
+            0,
+            bytemuck::bytes_of(&measure_params),
+        );
+
+        let mut encoder = self
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                label: Some("Collapse encoder (f64)"),
+            });
+
+        {
+            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: Some("Collapse pass (f64)"),
+                timestamp_writes: None,
+            });
+            pass.set_pipeline(&self.collapse_pipeline);
+            pass.set_bind_group(0, &self.collapse_bind_group, &[]);
+            let (wg_x, wg_y) = Self::compute_workgroups(self.num_amplitudes);
+            pass.dispatch_workgroups(wg_x, wg_y, 1);
+        }
+
+        self.queue.submit(std::iter::once(encoder.finish()));
+        (outcome, is_deterministic)
+    }
+
+    /// Read back the full state vector from GPU.
+    ///
+    /// Pending queued gates are recorded into the same submission as the
+    /// readback copy, so the returned amplitudes include them. Only the
+    /// `num_amplitudes * 16` bytes that were copied are mapped, so the result
+    /// always holds exactly `num_amplitudes` `[re, im]` entries regardless of
+    /// how large the staging buffer was allocated.
+    ///
+    /// # Panics
+    /// Panics if the GPU device poll or buffer readback fails.
+    #[must_use]
+    pub fn state(&mut self) -> Vec<[f64; 2]> {
+        // 16 bytes per amplitude: two f64 components.
+        let readback_size = (self.num_amplitudes * 16) as u64;
+
+        let mut encoder = self
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                label: Some("State readback encoder (f64)"),
+            });
+        self.record_flush_gates(&mut encoder);
+
+        encoder.copy_buffer_to_buffer(
+            &self.state_buffer,
+            0,
+            &self.staging_buffer,
+            0,
+            readback_size,
+        );
+
+        self.queue.submit(std::iter::once(encoder.finish()));
+
+        // Map exactly the range that was just written (as `mz_gpu` does for its
+        // partial sums) instead of the whole staging buffer.
+        let buffer_slice = self.staging_buffer.slice(..readback_size);
+        buffer_slice.map_async(wgpu::MapMode::Read, |_| {});
+        self.device
+            .poll(wgpu::PollType::wait_indefinitely())
+            .expect("GPU device poll failed");
+
+        let state: Vec<[f64; 2]> = {
+            let data = buffer_slice.get_mapped_range();
+            bytemuck::cast_slice(&data).to_vec()
+        };
+        self.staging_buffer.unmap();
+        state
+    }
+
+    /// Replace the contents of the GPU state buffer with `amps`.
+    ///
+    /// Any queued gates are flushed before the upload. `amps` must contain
+    /// exactly `num_amplitudes` entries; the caller is responsible for passing
+    /// a normalized state.
+    ///
+    /// # Panics
+    /// Panics if `amps.len() != num_amplitudes`.
+    pub fn write_state(&mut self, amps: &[[f64; 2]]) {
+        let expected = self.num_amplitudes;
+        assert_eq!(amps.len(), expected, "write_state: slice length mismatch");
+
+        self.flush_gates();
+
+        let bytes: &[u8] = bytemuck::cast_slice(amps);
+        self.queue.write_buffer(&self.state_buffer, 0, bytes);
+    }
+}
+
+// -- Trait implementations --
+
+use pecos_core::{Angle64, QubitId};
+use pecos_simulators::{
+    ArbitraryRotationGateable, CliffordGateable, MeasurementResult, QuantumSimulator,
+};
+
+impl QuantumSimulator for GpuStateVec64 {
+    /// Reset the simulator and hand back `self` for chaining.
+    fn reset(&mut self) -> &mut Self {
+        // Resolves to the inherent `reset`, which takes priority over this trait
+        // method. NOTE(review): that inherent method is not visible in this
+        // chunk -- confirm it exists, otherwise this call would recurse.
+        GpuStateVec64::reset(self);
+        self
+    }
+}
+
+// Private helpers so each `CliffordGateable` method below is a one-liner.
+#[allow(clippy::cast_possible_truncation)] // Qubit indices from QubitId fit in u32
+impl GpuStateVec64 {
+    /// Queue the constant single-qubit gate `gate` on every qubit in `qubits`.
+    ///
+    /// The f32 matrix is widened to f64 once and reused for all targets.
+    fn queue_fixed_gate(&mut self, gate: [f32; 8], qubits: &[QubitId]) -> &mut Self {
+        let matrix = Self::matrix_f32_to_f64(gate);
+        for &q in qubits {
+            self.queue_single_gate(q.index() as u32, matrix);
+        }
+        self
+    }
+
+    /// Call `queue_one(self, first, second)` for every pair, in order.
+    fn queue_pairs(
+        &mut self,
+        pairs: &[(QubitId, QubitId)],
+        queue_one: fn(&mut Self, u32, u32),
+    ) -> &mut Self {
+        for &(first, second) in pairs {
+            queue_one(self, first.index() as u32, second.index() as u32);
+        }
+        self
+    }
+
+    /// Call `queue_one(self, first, second, theta)` for every pair, in order.
+    fn queue_pair_rotations(
+        &mut self,
+        theta: f64,
+        pairs: &[(QubitId, QubitId)],
+        queue_one: fn(&mut Self, u32, u32, f64),
+    ) -> &mut Self {
+        for &(first, second) in pairs {
+            queue_one(self, first.index() as u32, second.index() as u32, theta);
+        }
+        self
+    }
+}
+
+impl CliffordGateable for GpuStateVec64 {
+    // -- Single-qubit gates: one queued matrix per listed qubit --
+
+    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::H, qubits)
+    }
+
+    fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::X, qubits)
+    }
+
+    fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::Y, qubits)
+    }
+
+    fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::Z, qubits)
+    }
+
+    fn sx(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::SX, qubits)
+    }
+
+    fn sxdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::SXDG, qubits)
+    }
+
+    fn sy(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::SY, qubits)
+    }
+
+    fn sydg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::SYDG, qubits)
+    }
+
+    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::S, qubits)
+    }
+
+    fn szdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        self.queue_fixed_gate(gates::SDG, qubits)
+    }
+
+    // -- Two-qubit gates with dedicated pipelines --
+
+    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pairs(pairs, Self::queue_cx)
+    }
+
+    fn cy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pairs(pairs, Self::queue_cy)
+    }
+
+    fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pairs(pairs, Self::queue_cz)
+    }
+
+    fn swap(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pairs(pairs, Self::queue_swap)
+    }
+
+    // -- Square-root two-qubit gates: the matching rotation by +/- pi/2 --
+
+    fn szz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pair_rotations(std::f64::consts::FRAC_PI_2, pairs, Self::queue_rzz)
+    }
+
+    fn szzdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pair_rotations(-std::f64::consts::FRAC_PI_2, pairs, Self::queue_rzz)
+    }
+
+    fn sxx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pair_rotations(std::f64::consts::FRAC_PI_2, pairs, Self::queue_rxx)
+    }
+
+    fn sxxdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pair_rotations(-std::f64::consts::FRAC_PI_2, pairs, Self::queue_rxx)
+    }
+
+    fn syy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pair_rotations(std::f64::consts::FRAC_PI_2, pairs, Self::queue_ryy)
+    }
+
+    fn syydg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        self.queue_pair_rotations(-std::f64::consts::FRAC_PI_2, pairs, Self::queue_ryy)
+    }
+
+    /// Measure `qubits`, choosing between the CPU batch path and the
+    /// sequential GPU path from the qubit count and the number of measurements.
+    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        self.flush_gates();
+
+        // Empirical f64 mz path selection (RTX 4090, 2026-04-11).
+        // M=1 always GPU: single readback amortizes poorly vs reduction+collapse.
+        // Larger M covers the readback and the CPU loop wins for small N.
+        // Re-run native_bench's f64 (N,M) probe to recalibrate.
+        let measurements = qubits.len();
+        let use_cpu_batch = match self.num_qubits {
+            0..=13 => measurements >= 2,
+            14 => measurements >= 4,
+            _ => false,
+        };
+
+        if use_cpu_batch {
+            self.mz_cpu_batch(qubits)
+        } else {
+            self.mz_gpu_sequential(qubits)
+        }
+    }
+}
+
+impl GpuStateVec64 {
+    /// Measure every qubit in `qubits` on the CPU with one readback and one
+    /// upload: the state is read once, each qubit is measured and collapsed in
+    /// host memory in order, and the final state is written back.
+    ///
+    /// Skips path selection -- intended for benchmarking and tests that need to
+    /// force a specific path. Production code should call `mz()`.
+    pub fn mz_cpu_batch(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        const DET_EPS: f64 = 1e-10;
+
+        self.flush_gates();
+        let mut amplitudes = self.state();
+        let mut results = Vec::with_capacity(qubits.len());
+
+        for &q in qubits {
+            let target_mask = 1usize << q.index();
+
+            // Total weight of the basis states in which the target bit is set.
+            let prob_one: f64 = amplitudes
+                .iter()
+                .enumerate()
+                .filter_map(|(idx, amp)| {
+                    ((idx & target_mask) != 0).then(|| amp[0] * amp[0] + amp[1] * amp[1])
+                })
+                .sum();
+
+            let is_deterministic = !(DET_EPS..=1.0 - DET_EPS).contains(&prob_one);
+            let outcome = if is_deterministic {
+                u32::from(prob_one > 0.5)
+            } else {
+                let draw: f64 = self.rng.random();
+                u32::from(draw < prob_one)
+            };
+
+            // Renormalize by the probability of the branch that survives.
+            let kept_prob = if outcome == 1 {
+                prob_one
+            } else {
+                1.0 - prob_one
+            };
+            let norm_factor = 1.0 / kept_prob.sqrt();
+
+            for (idx, amp) in amplitudes.iter_mut().enumerate() {
+                let bit = u32::from((idx & target_mask) != 0);
+                if bit != outcome {
+                    *amp = [0.0, 0.0];
+                    continue;
+                }
+                amp[0] *= norm_factor;
+                amp[1] *= norm_factor;
+            }
+
+            results.push(MeasurementResult {
+                outcome: outcome == 1,
+                is_deterministic,
+            });
+        }
+
+        let bytes: &[u8] = bytemuck::cast_slice(&amplitudes);
+        self.queue.write_buffer(&self.state_buffer, 0, bytes);
+        results
+    }
+
+    /// Measure the qubits one at a time through `mz_gpu`, in the given order.
+    ///
+    /// Skips path selection -- intended for benchmarking and tests that need to
+    /// force a specific path. Production code should call `mz()`.
+    pub fn mz_gpu_sequential(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        self.flush_gates();
+
+        let mut results = Vec::with_capacity(qubits.len());
+        for &q in qubits {
+            #[allow(clippy::cast_possible_truncation)]
+            let (outcome, is_deterministic) = self.mz_gpu(q.index() as u32);
+            results.push(MeasurementResult {
+                outcome: outcome == 1,
+                is_deterministic,
+            });
+        }
+        results
+    }
+}
+
+#[allow(clippy::cast_possible_truncation)] // Qubit indices from QubitId fit in u32
+impl ArbitraryRotationGateable for GpuStateVec64 {
+    // Single-qubit rotations: the matrix is built once per call and queued on
+    // every listed qubit.
+
+    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        let matrix = Self::matrix_f32_to_f64(gates::rx(theta.to_radians_signed()));
+        for target in qubits.iter().map(|q| q.index() as u32) {
+            self.queue_single_gate(target, matrix);
+        }
+        self
+    }
+
+    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        let matrix = Self::matrix_f32_to_f64(gates::ry(theta.to_radians_signed()));
+        for target in qubits.iter().map(|q| q.index() as u32) {
+            self.queue_single_gate(target, matrix);
+        }
+        self
+    }
+
+    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        let matrix = Self::matrix_f32_to_f64(gates::rz(theta.to_radians_signed()));
+        for target in qubits.iter().map(|q| q.index() as u32) {
+            self.queue_single_gate(target, matrix);
+        }
+        self
+    }
+
+    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
+        let matrix = Self::matrix_f32_to_f64(gates::T);
+        for target in qubits.iter().map(|q| q.index() as u32) {
+            self.queue_single_gate(target, matrix);
+        }
+        self
+    }
+
+    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        let matrix = Self::matrix_f32_to_f64(gates::TDG);
+        for target in qubits.iter().map(|q| q.index() as u32) {
+            self.queue_single_gate(target, matrix);
+        }
+        self
+    }
+
+    // Two-qubit rotations: the angle is converted to signed radians once and
+    // passed to the matching `queue_*` method for each pair.
+
+    fn rxx(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let radians = theta.to_radians_signed();
+        for (a, b) in pairs.iter().map(|&(a, b)| (a.index() as u32, b.index() as u32)) {
+            self.queue_rxx(a, b, radians);
+        }
+        self
+    }
+
+    fn ryy(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let radians = theta.to_radians_signed();
+        for (a, b) in pairs.iter().map(|&(a, b)| (a.index() as u32, b.index() as u32)) {
+            self.queue_ryy(a, b, radians);
+        }
+        self
+    }
+
+    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        let radians = theta.to_radians_signed();
+        for (a, b) in pairs.iter().map(|&(a, b)| (a.index() as u32, b.index() as u32)) {
+            self.queue_rzz(a, b, radians);
+        }
+        self
+    }
+}
diff --git a/crates/pecos-gpu-sims/src/gpu_auto.rs b/crates/pecos-gpu-sims/src/gpu_auto.rs
new file mode 100644
index 000000000..02ba8ae48
--- /dev/null
+++ b/crates/pecos-gpu-sims/src/gpu_auto.rs
@@ -0,0 +1,123 @@
+//! Precision-auto wrapper that picks f64 when available and falls back to f32.
+//!
+//! Provides [`GpuStateVecAuto`] for cross-platform code that wants *some* GPU
+//! state vector without caring whether the adapter supports `SHADER_F64`.
+//! Explicit users should keep reaching for [`GpuStateVec32`] or [`GpuStateVec64`]
+//! directly -- this wrapper is intentionally opt-in.
+
+use pecos_core::{Angle64, QubitId};
+use pecos_simulators::{
+    ArbitraryRotationGateable, CliffordGateable, MeasurementResult, QuantumSimulator,
+};
+
+use crate::{GpuError, GpuStateVec32, GpuStateVec64, RequiredFeature};
+
+/// GPU state vector simulator that selects precision at runtime.
+///
+/// Construct with [`GpuStateVecAuto::new`]. It first tries f64
+/// ([`GpuStateVec64`]); if the adapter lacks `SHADER_F64` (e.g. Metal on Apple
+/// Silicon) it falls back to f32 ([`GpuStateVec32`]).
+///
+/// Implements the standard gate traits so it can be used interchangeably with
+/// either concrete backend in code that does not depend on precision.
+///
+/// The variants are public, so a caller that already holds a concrete backend
+/// can also wrap it directly instead of going through `new`. Use
+/// [`GpuStateVecAuto::is_f64`] to find out which backend is active.
+pub enum GpuStateVecAuto {
+    /// f64 backend (preferred; selected when `SHADER_F64` is available).
+    F64(GpuStateVec64),
+    /// f32 backend (fallback for adapters without `SHADER_F64`).
+    F32(GpuStateVec32),
+}
+
+impl GpuStateVecAuto {
+    /// Build a GPU state vector simulator with the best precision available.
+    ///
+    /// The f64 backend is tried first. The f32 backend is used only when the
+    /// f64 constructor fails with `UnsupportedFeature(ShaderF64)`; every other
+    /// failure (no adapter, too many qubits, etc.) is returned unchanged, so a
+    /// caller never silently gets a less precise simulator for an unrelated
+    /// reason.
+    ///
+    /// # Errors
+    /// Returns the [`GpuError`] of whichever constructor ran last: the f64
+    /// error when no fallback was attempted, otherwise the f32 error. The
+    /// original f64 error is dropped once the fallback is attempted.
+    pub fn new(num_qubits: u32) -> Result<Self, GpuError> {
+        GpuStateVec64::new(num_qubits)
+            .map(Self::F64)
+            .or_else(|err| match err {
+                // Only a missing f64 shader capability justifies the downgrade.
+                GpuError::UnsupportedFeature(RequiredFeature::ShaderF64) => {
+                    GpuStateVec32::new(num_qubits).map(Self::F32)
+                }
+                other => Err(other),
+            })
+    }
+
+    /// Whether the f64 backend was selected (`false` means f32).
+    #[must_use]
+    pub fn is_f64(&self) -> bool {
+        matches!(self, Self::F64(_))
+    }
+}
+
+/// Dispatch a `&mut self -> &mut Self` method to the inner backend.
+///
+/// Usage: `dispatch_mut!(self, method(arg1, arg2))`. The call is forwarded to
+/// whichever backend the enum currently holds. The backend's own return value
+/// (a reference to the *inner* simulator) is discarded and the macro evaluates
+/// to `$self`, i.e. the wrapper, so chaining stays on `GpuStateVecAuto`.
+macro_rules! dispatch_mut {
+    ($self:ident, $method:ident ( $($arg:expr),* $(,)? )) => {{
+        match $self {
+            Self::F64(s) => { s.$method($($arg),*); }
+            Self::F32(s) => { s.$method($($arg),*); }
+        }
+        $self
+    }};
+}
+
+impl QuantumSimulator for GpuStateVecAuto {
+    /// Reset whichever backend is active and return the wrapper for chaining.
+    fn reset(&mut self) -> &mut Self {
+        match self {
+            Self::F64(sim) => {
+                sim.reset();
+            }
+            Self::F32(sim) => {
+                sim.reset();
+            }
+        }
+        self
+    }
+}
+
+/// Forwards every Clifford gate to the selected backend.
+///
+/// All gates that [`GpuStateVec64`] implements itself are forwarded explicitly
+/// rather than left to the trait's default methods, so the wrapper queues
+/// exactly the same operations as the concrete backend would.
+impl CliffordGateable for GpuStateVecAuto {
+    // -- Single-qubit gates --
+
+    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, h(qubits))
+    }
+    fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, x(qubits))
+    }
+    fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, y(qubits))
+    }
+    fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, z(qubits))
+    }
+    fn sx(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, sx(qubits))
+    }
+    fn sxdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, sxdg(qubits))
+    }
+    fn sy(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, sy(qubits))
+    }
+    fn sydg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, sydg(qubits))
+    }
+    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, sz(qubits))
+    }
+    fn szdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, szdg(qubits))
+    }
+
+    // -- Two-qubit gates --
+
+    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, cx(pairs))
+    }
+    fn cy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, cy(pairs))
+    }
+    fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, cz(pairs))
+    }
+    fn swap(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, swap(pairs))
+    }
+    fn szz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, szz(pairs))
+    }
+    fn szzdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, szzdg(pairs))
+    }
+    fn sxx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, sxx(pairs))
+    }
+    fn sxxdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, sxxdg(pairs))
+    }
+    fn syy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, syy(pairs))
+    }
+    fn syydg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, syydg(pairs))
+    }
+
+    // -- Measurement --
+
+    // Returns a value rather than `&mut Self`, so it cannot use `dispatch_mut!`.
+    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        match self {
+            Self::F64(s) => s.mz(qubits),
+            Self::F32(s) => s.mz(qubits),
+        }
+    }
+}
+
+/// Forwards every rotation gate to the selected backend.
+///
+/// All rotations that [`GpuStateVec64`] implements itself are forwarded
+/// explicitly rather than left to the trait's default methods, so the wrapper
+/// queues exactly the same operations as the concrete backend would.
+impl ArbitraryRotationGateable for GpuStateVecAuto {
+    // -- Single-qubit rotations --
+
+    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, rx(theta, qubits))
+    }
+    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, ry(theta, qubits))
+    }
+    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, rz(theta, qubits))
+    }
+    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, t(qubits))
+    }
+    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        dispatch_mut!(self, tdg(qubits))
+    }
+
+    // -- Two-qubit rotations --
+
+    fn rxx(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, rxx(theta, pairs))
+    }
+    fn ryy(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, ryy(theta, pairs))
+    }
+    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        dispatch_mut!(self, rzz(theta, pairs))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pecos_core::qid;
+
+    /// Smoke test: whichever backend gets selected must produce correlated
+    /// outcomes after H on qubit 0 followed by CX(0, 1).
+    #[test]
+    fn auto_falls_back_or_uses_f64() {
+        // If construction fails (e.g. no GPU available at all) there is
+        // nothing to exercise, so the test passes vacuously.
+        let Ok(mut sim) = GpuStateVecAuto::new(3) else {
+            return;
+        };
+
+        let (q0, q1) = (QubitId(0), QubitId(1));
+        sim.h(&qid(0));
+        sim.cx(&[(q0, q1)]);
+
+        let outcomes = sim.mz(&[q0, q1]);
+        assert_eq!(outcomes.len(), 2);
+        assert_eq!(outcomes[0].outcome, outcomes[1].outcome);
+    }
+}
diff --git a/crates/pecos-gpu-sims/src/gpu_density_matrix.rs b/crates/pecos-gpu-sims/src/gpu_density_matrix.rs
new file mode 100644
index 000000000..a9d28593e
--- /dev/null
+++ b/crates/pecos-gpu-sims/src/gpu_density_matrix.rs
@@ -0,0 +1,1257 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! GPU density matrix simulator via Choi-Jamiolkowski isomorphism.
+//!
+//! Represents an N-qubit density matrix as a 2N-qubit state vector on the GPU.
+//! Each physical-qubit gate G becomes two state-vector gates: G on qubit q
+//! (system) and G-dagger on qubit q+N (environment).
+//!
+//! Generic over the backing GPU state vector (f32 or f64). Use the
+//! [`GpuDensityMatrix64`] alias for f64 precision (canonical) or
+//! [`GpuDensityMatrix32`] to trade precision for a ~2x smaller state.
+
+use crate::{GpuError, GpuStateVec32, GpuStateVec64};
+use num_complex::Complex64;
+use pecos_core::{Angle64, QubitId, RngManageable};
+use pecos_random::{PecosRng, RngExt, SeedableRng};
+use pecos_simulators::arbitrary_rotation_gateable::ArbitraryRotationGateable;
+use pecos_simulators::clifford_gateable::{CliffordGateable, MeasurementResult};
+use pecos_simulators::quantum_simulator::QuantumSimulator;
+
+// =============================================================================
+// Backend trait
+// =============================================================================
+
+/// Abstraction over a GPU state vector simulator that `GpuDensityMatrix` can
+/// be built on top of. The trait exposes state readback and write-back in
+/// f64 (the backend converts as needed) plus the standard gate traits.
+///
+/// Amplitudes cross this interface as `[re, im]` pairs, one per basis state.
+pub trait GpuStateVecBackend:
+    CliffordGateable + ArbitraryRotationGateable + QuantumSimulator + Sized
+{
+    /// Construct a backend with `num_qubits` qubits.
+    ///
+    /// # Errors
+    /// Returns [`GpuError`] if the backend GPU initialization fails.
+    fn new_backend(num_qubits: u32) -> Result<Self, GpuError>;
+
+    /// Readback the state vector as f64 amplitudes `[re, im]`.
+    ///
+    /// NOTE(review): takes `&mut self`, presumably because readback may have
+    /// to flush pending GPU work -- confirm against the backend.
+    fn state_f64(&mut self) -> Vec<[f64; 2]>;
+
+    /// Overwrite the state vector from f64 amplitudes.
+    ///
+    /// Callers in this file always pass one entry per basis state of the
+    /// backing vector; behavior for other lengths is backend-defined.
+    fn write_state_f64(&mut self, amps: &[[f64; 2]]);
+
+    /// Force all pending GPU work to complete (for honest timing).
+    fn sync_backend(&mut self);
+}
+
+// f32 backend: amplitudes are widened on readback and narrowed on write-back.
+impl GpuStateVecBackend for GpuStateVec32 {
+    fn new_backend(num_qubits: u32) -> Result<Self, GpuError> {
+        Self::new(num_qubits)
+    }
+
+    fn state_f64(&mut self) -> Vec<[f64; 2]> {
+        // f32 -> f64 is lossless.
+        let widen = |[re, im]: [f32; 2]| [f64::from(re), f64::from(im)];
+        self.state().into_iter().map(widen).collect()
+    }
+
+    fn write_state_f64(&mut self, amps: &[[f64; 2]]) {
+        // f32 backend: accept f64→f32 precision loss
+        #[allow(clippy::cast_possible_truncation)]
+        let narrowed: Vec<[f32; 2]> = amps
+            .iter()
+            .map(|amp| [amp[0] as f32, amp[1] as f32])
+            .collect();
+        self.write_state(&narrowed);
+    }
+
+    fn sync_backend(&mut self) {
+        self.sync();
+    }
+}
+
+// f64 backend: amplitudes pass through without any conversion.
+impl GpuStateVecBackend for GpuStateVec64 {
+    fn new_backend(num_qubits: u32) -> Result<Self, GpuError> {
+        Self::new(num_qubits)
+    }
+
+    fn state_f64(&mut self) -> Vec<[f64; 2]> {
+        let amplitudes = self.state();
+        amplitudes
+    }
+
+    fn write_state_f64(&mut self, amps: &[[f64; 2]]) {
+        self.write_state(amps);
+    }
+
+    fn sync_backend(&mut self) {
+        self.sync();
+    }
+}
+
+// =============================================================================
+// GpuDensityMatrix
+// =============================================================================
+
+/// GPU-backed density matrix simulator, generic over the backend precision.
+///
+/// An `N`-qubit density matrix is stored in a `2N`-qubit backing state
+/// vector; readouts index it as `(row << N) | col`.
+pub struct GpuDensityMatrix<SV: GpuStateVecBackend> {
+    // Number of physical qubits N; the backing vector has 2N qubits.
+    num_physical_qubits: usize,
+    // Backing GPU state vector holding the 2N-qubit representation.
+    state_vector: SV,
+    // Random source used to sample measurement outcomes in `mz`.
+    rng: PecosRng,
+}
+
+/// f32 GPU density matrix: ~2x smaller state, single-precision amplitudes.
+/// Use when memory is the bottleneck or you need an extra physical qubit.
+/// State written back through this backend is narrowed from f64 to f32.
+pub type GpuDensityMatrix32 = GpuDensityMatrix<GpuStateVec32>;
+
+/// f64 GPU density matrix: canonical precision, matches CPU reference to
+/// ~1e-10 in isolation.
+pub type GpuDensityMatrix64 = GpuDensityMatrix<GpuStateVec64>;
+
+impl<SV: GpuStateVecBackend> GpuDensityMatrix<SV> {
+    /// Create a density matrix for `n` physical qubits, initialized to |0..0><0..0|.
+    ///
+    /// The RNG is seeded with an all-zero seed, so measurement sampling is
+    /// reproducible by default; use [`Self::with_seed`] to pick another seed.
+    ///
+    /// # Errors
+    /// Returns [`GpuError`] if the backend GPU state vector cannot be created,
+    /// or if `2 * num_physical_qubits` does not fit the backend's qubit count.
+    pub fn new(num_physical_qubits: usize) -> Result<Self, GpuError> {
+        Self::build_with_rng(num_physical_qubits, PecosRng::from_seed([0u8; 32]))
+    }
+
+    /// Create a density matrix with a deterministic RNG seed.
+    ///
+    /// # Errors
+    /// Returns [`GpuError`] if the backend GPU state vector cannot be created,
+    /// or if `2 * num_physical_qubits` does not fit the backend's qubit count.
+    pub fn with_seed(num_physical_qubits: usize, seed: u64) -> Result<Self, GpuError> {
+        Self::build_with_rng(num_physical_qubits, PecosRng::seed_from_u64(seed))
+    }
+
+    /// Shared constructor: allocates the `2N`-qubit backing state vector and
+    /// pairs it with the supplied RNG.
+    fn build_with_rng(num_physical_qubits: usize, rng: PecosRng) -> Result<Self, GpuError> {
+        // `checked_mul` keeps an absurdly large request from overflowing
+        // `usize` (panic in debug, silent wrap-around in release) before the
+        // `u32` range check gets a chance to reject it.
+        let sv_qubits = num_physical_qubits
+            .checked_mul(2)
+            .and_then(|doubled| u32::try_from(doubled).ok())
+            .ok_or(GpuError::TooManyQubits {
+                requested: u32::MAX,
+                max: 15,
+            })?;
+        let state_vector = SV::new_backend(sv_qubits)?;
+        Ok(Self {
+            num_physical_qubits,
+            state_vector,
+            rng,
+        })
+    }
+
+    /// Number of physical qubits `N`. The backing state vector holds `2N`.
+    #[must_use]
+    pub fn num_qubits(&self) -> usize {
+        self.num_physical_qubits
+    }
+
+    /// Shared reference to the backing `2N`-qubit GPU state vector.
+    #[must_use]
+    pub fn state_vector(&self) -> &SV {
+        &self.state_vector
+    }
+
+    /// Mutable access to the backing `2N`-qubit GPU state vector. Gates
+    /// applied through this handle act on raw backing qubits and bypass the
+    /// system/environment pairing performed by this type's gate methods.
+    pub fn state_vector_mut(&mut self) -> &mut SV {
+        &mut self.state_vector
+    }
+
+    /// Force all pending GPU work to complete. Call before timing measurements.
+    /// Delegates to the backend's `sync_backend`.
+    pub fn sync(&mut self) {
+        self.state_vector.sync_backend();
+    }
+
+    // -------------------------------------------------------------------------
+    // Helpers: probability / density matrix / purity
+    // -------------------------------------------------------------------------
+
+    /// Probability of observing the computational basis state `basis_state`.
+    ///
+    /// Computed as the diagonal entry `rho_{k,k}`, i.e. the squared norm of
+    /// row `k` of the backing vector: `sum_i |psi[(k << n) | i]|^2`. Reads the
+    /// whole state back from the backend.
+    ///
+    /// # Panics
+    /// Panics if `basis_state >= 2^num_physical_qubits`.
+    #[must_use]
+    pub fn probability(&mut self, basis_state: usize) -> f64 {
+        let n = self.num_physical_qubits;
+        assert!(basis_state < 1 << n);
+        let amplitudes = self.state_vector.state_f64();
+        let row_start = basis_state << n;
+        (0..(1usize << n)).fold(0.0, |acc, col| {
+            let [re, im] = amplitudes[row_start | col];
+            acc + (re * re + im * im)
+        })
+    }
+
+    /// Full `dim x dim` density matrix (`dim = 2^n`) as nested rows:
+    /// `rho[row][col]`.
+    ///
+    /// Each entry is `sum_k psi[(row << n) | k] * conj(psi[(col << n) | k])`,
+    /// so the cost is O(dim^3) on the CPU after one full state readback.
+    #[must_use]
+    pub fn get_density_matrix(&mut self) -> Vec<Vec<Complex64>> {
+        let n = self.num_physical_qubits;
+        let dim = 1usize << n;
+        let sv = self.state_vector.state_f64();
+
+        // Inner product of backing-vector rows `row` and `col`.
+        let entry = |row: usize, col: usize| {
+            let (mut re, mut im) = (0.0f64, 0.0f64);
+            for k in 0..dim {
+                let [ar, ai] = sv[(row << n) | k];
+                let [br, bi] = sv[(col << n) | k];
+                // a * conj(b)
+                re += ar * br + ai * bi;
+                im += ai * br - ar * bi;
+            }
+            Complex64::new(re, im)
+        };
+
+        (0..dim)
+            .map(|row| (0..dim).map(|col| entry(row, col)).collect())
+            .collect()
+    }
+
+    /// Tr(rho^2), computed as the sum of `|rho_ij|^2` over all entries.
+    /// 1 for pure states; 1/2^n for maximally mixed.
+    ///
+    /// # Panics
+    /// Panics if the backing state vector has unexpected size.
+    #[must_use]
+    pub fn purity(&mut self) -> f64 {
+        let rho = self.get_density_matrix();
+        let mut total = 0.0;
+        for row in &rho {
+            for entry in row {
+                total += entry.norm_sqr();
+            }
+        }
+        total
+    }
+
+    /// Returns true when the purity is strictly within `tol` of 1.0.
+    ///
+    /// [`Self::is_pure`] calls this with `1e-5`, reflecting the f32-precision
+    /// gate constants in the backing state vectors. Pass a tighter tolerance
+    /// for stricter checks on known-noise-free states.
+    #[must_use]
+    pub fn is_pure_with_tol(&mut self, tol: f64) -> bool {
+        let deviation = self.purity() - 1.0;
+        deviation.abs() < tol
+    }
+
+    /// Returns true if the purity is within the default tolerance `1e-5` of 1.0.
+    #[must_use]
+    pub fn is_pure(&mut self) -> bool {
+        self.is_pure_with_tol(1e-5)
+    }
+
+    // -------------------------------------------------------------------------
+    // State preparation
+    // -------------------------------------------------------------------------
+
+    /// Prepare |`basis_state`><`basis_state`| by overwriting the backing
+    /// vector with a single unit amplitude at index
+    /// `(basis_state << n) | basis_state`.
+    ///
+    /// # Panics
+    /// Panics if `basis_state >= 2^num_physical_qubits`.
+    pub fn prepare_computational_basis(&mut self, basis_state: usize) -> &mut Self {
+        let n = self.num_physical_qubits;
+        assert!(basis_state < 1 << n);
+        let mut amplitudes = vec![[0.0f64, 0.0f64]; 1usize << (2 * n)];
+        amplitudes[(basis_state << n) | basis_state] = [1.0, 0.0];
+        self.state_vector.write_state_f64(&amplitudes);
+        self
+    }
+
+    /// Prepare |+>^N: reset to the all-zero basis state, then apply H to
+    /// every physical qubit in index order.
+    pub fn prepare_plus_state(&mut self) -> &mut Self {
+        self.prepare_computational_basis(0);
+        let every_qubit: Vec<QubitId> = (0..self.num_physical_qubits).map(QubitId).collect();
+        // `h` walks the slice one qubit at a time, so a single call issues the
+        // same gate sequence as one call per qubit.
+        self.h(&every_qubit);
+        self
+    }
+
+    /// Prepare the maximally mixed state I / 2^n by writing amplitude
+    /// `1 / sqrt(2^n)` at every index `(i << n) | i` and zero elsewhere.
+    pub fn prepare_maximally_mixed(&mut self) -> &mut Self {
+        let n = self.num_physical_qubits;
+        let dim = 1usize << n;
+        #[allow(clippy::cast_precision_loss)] // dim = 2^n with n <= 15, exact in f64
+        let amplitude = 1.0 / (dim as f64).sqrt();
+
+        let mut amplitudes = vec![[0.0f64, 0.0f64]; 1usize << (2 * n)];
+        for diag in 0..dim {
+            amplitudes[(diag << n) | diag] = [amplitude, 0.0];
+        }
+        self.state_vector.write_state_f64(&amplitudes);
+        self
+    }
+
+    // -------------------------------------------------------------------------
+    // Noise channels
+    // -------------------------------------------------------------------------
+
+    /// Amplitude damping on `qubit`: `rho -> E_0 rho E_0^dagger + E_1 rho E_1^dagger`
+    /// with `E_0 = |0><0| + sqrt(1-gamma)|1><1|`, `E_1 = sqrt(gamma)|0><1|`.
+    ///
+    /// `gamma` is clamped to `[0, 1]`; values below `f64::EPSILON` are a no-op.
+    /// The channel is applied to the read-back density matrix, which is then
+    /// re-purified via Cholesky so `probability()` / `purity()` stay consistent.
+    pub fn apply_amplitude_damping(&mut self, qubit: usize, gamma: f64) -> &mut Self {
+        let gamma = gamma.clamp(0.0, 1.0);
+        if gamma < f64::EPSILON {
+            return self;
+        }
+        let dim = 1usize << self.num_physical_qubits;
+        let qubit_mask = 1usize << qubit;
+        let excited = |basis: usize| (basis & qubit_mask) != 0;
+
+        let rho = self.get_density_matrix();
+        let coherence_scale = (1.0 - gamma).sqrt();
+        let damped: Vec<Vec<Complex64>> = (0..dim)
+            .map(|i| {
+                (0..dim)
+                    .map(|j| match (excited(i), excited(j)) {
+                        // |0><0| block gains the population decayed from |1><1|.
+                        (false, false) => {
+                            rho[i][j] + gamma * rho[i | qubit_mask][j | qubit_mask]
+                        }
+                        // |1><1| block loses a fraction gamma.
+                        (true, true) => (1.0 - gamma) * rho[i][j],
+                        // Coherences between the two blocks shrink.
+                        _ => coherence_scale * rho[i][j],
+                    })
+                    .collect()
+            })
+            .collect();
+        self.set_from_density_matrix(&damped);
+        self
+    }
+
+    /// Phase damping (pure dephasing) on `qubit`: entries whose row and column
+    /// agree on the target bit are preserved; the rest are scaled by
+    /// sqrt(1-lambda).
+    ///
+    /// `lambda` is clamped to `[0, 1]`; values below `f64::EPSILON` are a
+    /// no-op. The result is re-purified via Cholesky.
+    pub fn apply_phase_damping(&mut self, qubit: usize, lambda: f64) -> &mut Self {
+        let lambda = lambda.clamp(0.0, 1.0);
+        if lambda < f64::EPSILON {
+            return self;
+        }
+        let qubit_mask = 1usize << qubit;
+        let bit_of = |basis: usize| (basis & qubit_mask) != 0;
+
+        let rho = self.get_density_matrix();
+        let coherence_scale = (1.0 - lambda).sqrt();
+        let dephased: Vec<Vec<Complex64>> = rho
+            .iter()
+            .enumerate()
+            .map(|(i, row)| {
+                row.iter()
+                    .enumerate()
+                    .map(|(j, &entry)| {
+                        if bit_of(i) == bit_of(j) {
+                            entry
+                        } else {
+                            coherence_scale * entry
+                        }
+                    })
+                    .collect()
+            })
+            .collect();
+        self.set_from_density_matrix(&dephased);
+        self
+    }
+
+    /// Depolarizing on `qubit`: `rho -> (1-p) rho + (p/3)(X rho X + Y rho Y + Z rho Z)`.
+    ///
+    /// `probability` is clamped to `[0, 1]`; values below `f64::EPSILON` are a
+    /// no-op. Uses Cholesky re-purification of the transformed density matrix,
+    /// so has a readback + O(dim^3) CPU cost + writeback round trip.
+    pub fn apply_depolarizing_noise(&mut self, qubit: usize, probability: f64) -> &mut Self {
+        let p = probability.clamp(0.0, 1.0);
+        if p < f64::EPSILON {
+            return self;
+        }
+        let dim = 1usize << self.num_physical_qubits;
+        let qubit_mask = 1usize << qubit;
+        let bit_of = |basis: usize| (basis & qubit_mask) != 0;
+
+        let rho = self.get_density_matrix();
+        let depolarized: Vec<Vec<Complex64>> = (0..dim)
+            .map(|i| {
+                (0..dim)
+                    .map(|j| {
+                        if bit_of(i) == bit_of(j) {
+                            // Same-bit block: mixes with the bit-flipped entry.
+                            (1.0 - 2.0 * p / 3.0) * rho[i][j]
+                                + (2.0 * p / 3.0) * rho[i ^ qubit_mask][j ^ qubit_mask]
+                        } else {
+                            // Coherences across the target bit only shrink.
+                            (1.0 - 4.0 * p / 3.0) * rho[i][j]
+                        }
+                    })
+                    .collect()
+            })
+            .collect();
+        self.set_from_density_matrix(&depolarized);
+        self
+    }
+
+    /// Bit flip on `qubit`: `rho -> (1-p) rho + p X rho X`.
+    ///
+    /// `probability` is clamped to `[0, 1]`; values below `f64::EPSILON` are a
+    /// no-op. The result is re-purified via Cholesky.
+    pub fn apply_bit_flip(&mut self, qubit: usize, probability: f64) -> &mut Self {
+        let p = probability.clamp(0.0, 1.0);
+        if p < f64::EPSILON {
+            return self;
+        }
+        let dim = 1usize << self.num_physical_qubits;
+        let qubit_mask = 1usize << qubit;
+
+        let rho = self.get_density_matrix();
+        // `X rho X` is rho with the target bit toggled in both indices.
+        let flipped: Vec<Vec<Complex64>> = (0..dim)
+            .map(|i| {
+                (0..dim)
+                    .map(|j| (1.0 - p) * rho[i][j] + p * rho[i ^ qubit_mask][j ^ qubit_mask])
+                    .collect()
+            })
+            .collect();
+        self.set_from_density_matrix(&flipped);
+        self
+    }
+
+    /// Phase flip on `qubit`: `rho -> (1-p) rho + p Z rho Z`, i.e. entries whose
+    /// row and column differ on the target bit are scaled by `1 - 2p`.
+    ///
+    /// `probability` is clamped to `[0, 1]`; values below `f64::EPSILON` are a
+    /// no-op. The result is re-purified via Cholesky.
+    pub fn apply_phase_flip(&mut self, qubit: usize, probability: f64) -> &mut Self {
+        let p = probability.clamp(0.0, 1.0);
+        if p < f64::EPSILON {
+            return self;
+        }
+        let qubit_mask = 1usize << qubit;
+        let bit_of = |basis: usize| (basis & qubit_mask) != 0;
+
+        let rho = self.get_density_matrix();
+        let dephased: Vec<Vec<Complex64>> = rho
+            .iter()
+            .enumerate()
+            .map(|(i, row)| {
+                row.iter()
+                    .enumerate()
+                    .map(|(j, &entry)| {
+                        if bit_of(i) == bit_of(j) {
+                            entry
+                        } else {
+                            (1.0 - 2.0 * p) * entry
+                        }
+                    })
+                    .collect()
+            })
+            .collect();
+        self.set_from_density_matrix(&dephased);
+        self
+    }
+
+    /// Re-establish the Choi state from a density matrix via Cholesky.
+    /// `rho = L L^dagger`, then `psi[(i<<n)|j] = L[i][j]`.
+    ///
+    /// Only the lower triangle of `rho` is read. `L` is lower triangular, so
+    /// every backing-vector entry with `j > i` is written as zero.
+    ///
+    /// Two numerical guards: the diagonal sqrt clamps slight negatives to zero
+    /// (numerical noise on a true PSD rho), and off-diagonals on a near-zero
+    /// pivot are left at zero. Both fire only on rounding noise for a
+    /// well-formed channel; a buggy channel that sends rho non-PSD will trip
+    /// the `debug_assert` below.
+    #[allow(clippy::needless_range_loop)] // Cholesky: indexed access into multiple matrices
+    fn set_from_density_matrix(&mut self, rho: &[Vec<Complex64>]) {
+        // Tolerance for "legitimate numerical noise" on the diagonal.
+        // For a properly PSD rho with trace 1, diagonals are in [0, 1] and
+        // accumulated f64 rounding error is bounded by ~dim * eps_f64.
+        // 1e-9 is well above that for any practical dim and well below any
+        // physically meaningful negative eigenvalue.
+        const PSD_NEG_TOL: f64 = -1e-9;
+        // Pivot threshold for off-diagonal division. Same scale as the
+        // diagonal noise floor: pivots below this are treated as zero rather
+        // than dividing by them and amplifying noise.
+        // Note: this is compared against `l[j][j]`, i.e. the square root of
+        // the diagonal residual, not the residual itself.
+        const PIVOT_EPS: f64 = 1e-15;
+
+        let n = self.num_physical_qubits;
+        // Assumes `rho` is 2^n x 2^n; every caller in this file builds it with
+        // `dim = 1 << n`.
+        let dim = rho.len();
+
+        let mut l: Vec<Vec<Complex64>> = vec![vec![Complex64::new(0.0, 0.0); dim]; dim];
+        for i in 0..dim {
+            for j in 0..=i {
+                // Residual rho[i][j] - sum_{k<j} L[i][k] * conj(L[j][k]).
+                let mut sum: Complex64 = rho[i][j];
+                for k in 0..j {
+                    sum -= l[i][k] * l[j][k].conj();
+                }
+                if i == j {
+                    debug_assert!(
+                        sum.re > PSD_NEG_TOL,
+                        "set_from_density_matrix: rho not PSD at diag[{i}]: {} (tol {PSD_NEG_TOL:e}); \
+                         a noise channel likely violated trace preservation or positivity",
+                        sum.re
+                    );
+                    // The imaginary part of the diagonal residual is discarded.
+                    let diag = sum.re.max(0.0);
+                    l[i][j] = Complex64::new(diag.sqrt(), 0.0);
+                } else if l[j][j].norm() > PIVOT_EPS {
+                    l[i][j] = sum / l[j][j];
+                }
+                // Otherwise the pivot is (near) zero and l[i][j] stays 0.
+            }
+        }
+
+        // Lay L out as the backing vector: row index in the high n bits,
+        // column index in the low n bits.
+        let sv_size = 1usize << (2 * n);
+        let mut new_state = vec![[0.0f64, 0.0f64]; sv_size];
+        for i in 0..dim {
+            for j in 0..dim {
+                let idx = (i << n) | j;
+                let c: Complex64 = l[i][j];
+                new_state[idx] = [c.re, c.im];
+            }
+        }
+        self.state_vector.write_state_f64(&new_state);
+    }
+
+    // -------------------------------------------------------------------------
+    // Internal gate helpers
+    // -------------------------------------------------------------------------
+
+    /// Applies a single-qubit gate pair, one target at a time: `sys_op` on
+    /// backing qubit `q`, then `env_op` on its partner `q + N`.
+    fn apply_1q_sys_env<F, G>(&mut self, qubits: &[QubitId], sys_op: F, env_op: G)
+    where
+        F: Fn(&mut SV, &[QubitId]),
+        G: Fn(&mut SV, &[QubitId]),
+    {
+        let offset = self.num_physical_qubits;
+        let sv = &mut self.state_vector;
+        for &target in qubits {
+            let wire = target.index();
+            let system = [QubitId(wire)];
+            let environment = [QubitId(wire + offset)];
+            sys_op(sv, &system);
+            env_op(sv, &environment);
+        }
+    }
+
+    /// Applies a two-qubit gate pair, one pair at a time: `sys_op` on backing
+    /// qubits `(c, t)`, then `env_op` on their partners `(c + N, t + N)`.
+    fn apply_2q_sys_env<F, G>(&mut self, pairs: &[(QubitId, QubitId)], sys_op: F, env_op: G)
+    where
+        F: Fn(&mut SV, &[(QubitId, QubitId)]),
+        G: Fn(&mut SV, &[(QubitId, QubitId)]),
+    {
+        let offset = self.num_physical_qubits;
+        let sv = &mut self.state_vector;
+        for &(first, second) in pairs {
+            let (a, b) = (first.index(), second.index());
+            let system = [(QubitId(a), QubitId(b))];
+            let environment = [(QubitId(a + offset), QubitId(b + offset))];
+            sys_op(sv, &system);
+            env_op(sv, &environment);
+        }
+    }
+}
+
+impl<SV: GpuStateVecBackend> QuantumSimulator for GpuDensityMatrix<SV> {
+    // Resets the backing state vector via the backend's own `reset`.
+    // The RNG is left untouched.
+    fn reset(&mut self) -> &mut Self {
+        self.state_vector.reset();
+        self
+    }
+}
+
+// Exposes the RNG that `mz` uses to sample non-deterministic outcomes.
+impl<SV: GpuStateVecBackend> RngManageable for GpuDensityMatrix<SV> {
+    type Rng = PecosRng;
+
+    // Replaces the RNG wholesale; the quantum state is not touched.
+    fn set_rng(&mut self, rng: Self::Rng) {
+        self.rng = rng;
+    }
+
+    fn rng(&self) -> &Self::Rng {
+        &self.rng
+    }
+
+    fn rng_mut(&mut self) -> &mut Self::Rng {
+        &mut self.rng
+    }
+}
+
+impl<SV: GpuStateVecBackend> CliffordGateable for GpuDensityMatrix<SV> {
+    // --- Hermitian 1q: apply identically on system and environment ---
+
+    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // Same gate on the system qubit and on its environment partner.
+        let gate = |sv: &mut SV, targets: &[QubitId]| {
+            sv.h(targets);
+        };
+        self.apply_1q_sys_env(qubits, gate, gate);
+        self
+    }
+
+    fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // Same gate on the system qubit and on its environment partner.
+        let gate = |sv: &mut SV, targets: &[QubitId]| {
+            sv.x(targets);
+        };
+        self.apply_1q_sys_env(qubits, gate, gate);
+        self
+    }
+
+    fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // Same gate on the system qubit and on its environment partner.
+        let gate = |sv: &mut SV, targets: &[QubitId]| {
+            sv.y(targets);
+        };
+        self.apply_1q_sys_env(qubits, gate, gate);
+        self
+    }
+
+    fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // Same gate on the system qubit and on its environment partner.
+        let gate = |sv: &mut SV, targets: &[QubitId]| {
+            sv.z(targets);
+        };
+        self.apply_1q_sys_env(qubits, gate, gate);
+        self
+    }
+
+    // --- Non-Hermitian 1q: env gets the dagger ---
+
+    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets SZ; the environment partner gets SZdg.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sz(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.szdg(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    fn szdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets SZdg; the environment partner gets SZ.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.szdg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sz(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    fn sx(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets SX; the environment partner gets SXdg.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sx(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sxdg(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    fn sxdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets SXdg; the environment partner gets SX.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sxdg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sx(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    fn sy(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets SY; the environment partner gets SYdg.
+        // NOTE(review): `ry` in this file mirrors RY unchanged because RY is
+        // real. If SY equals RY(pi/2) up to a global phase, its complex
+        // conjugate would be SY rather than SYdg -- confirm which convention
+        // the environment half should follow.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sy(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sydg(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    fn sydg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets SYdg; the environment partner gets SY.
+        // NOTE(review): `ry` in this file mirrors RY unchanged because RY is
+        // real. If SYdg equals RY(-pi/2) up to a global phase, its complex
+        // conjugate would be SYdg rather than SY -- confirm which convention
+        // the environment half should follow.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sydg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.sy(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    // --- 2q gates ---
+
+    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // Same gate on the system pair and on the environment pair.
+        let gate = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.cx(targets);
+        };
+        self.apply_2q_sys_env(pairs, gate, gate);
+        self
+    }
+
+    fn cy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // Same gate on the system pair and on the environment pair.
+        // NOTE(review): Y has imaginary entries, so the complex conjugate of
+        // CY differs from CY by a Z on the control -- confirm that mirroring
+        // plain CY on the environment pair is intended.
+        let gate = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.cy(targets);
+        };
+        self.apply_2q_sys_env(pairs, gate, gate);
+        self
+    }
+
+    fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // Same gate on the system pair and on the environment pair.
+        let gate = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.cz(targets);
+        };
+        self.apply_2q_sys_env(pairs, gate, gate);
+        self
+    }
+
+    fn swap(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // Same gate on the system pair and on the environment pair.
+        let gate = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.swap(targets);
+        };
+        self.apply_2q_sys_env(pairs, gate, gate);
+        self
+    }
+
+    // SZZ/SXX/SYY family: non-Hermitian, env gets dagger
+
+    fn szz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // System pair gets SZZ; the environment pair gets SZZdg.
+        let forward = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.szz(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.szzdg(targets);
+        };
+        self.apply_2q_sys_env(pairs, forward, mirrored);
+        self
+    }
+
+    fn szzdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // System pair gets SZZdg; the environment pair gets SZZ.
+        let forward = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.szzdg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.szz(targets);
+        };
+        self.apply_2q_sys_env(pairs, forward, mirrored);
+        self
+    }
+
+    fn sxx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // System pair gets SXX; the environment pair gets SXXdg.
+        let forward = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.sxx(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.sxxdg(targets);
+        };
+        self.apply_2q_sys_env(pairs, forward, mirrored);
+        self
+    }
+
+    fn sxxdg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // System pair gets SXXdg; the environment pair gets SXX.
+        let forward = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.sxxdg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.sxx(targets);
+        };
+        self.apply_2q_sys_env(pairs, forward, mirrored);
+        self
+    }
+
+    fn syy(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // System pair gets SYY; the environment pair gets SYYdg.
+        let forward = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.syy(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.syydg(targets);
+        };
+        self.apply_2q_sys_env(pairs, forward, mirrored);
+        self
+    }
+
+    fn syydg(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // System pair gets SYYdg; the environment pair gets SYY.
+        let forward = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.syydg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[(QubitId, QubitId)]| {
+            sv.syy(targets);
+        };
+        self.apply_2q_sys_env(pairs, forward, mirrored);
+        self
+    }
+
+    /// Z-basis projective measurement. Reads state back, samples + projects on
+    /// the CPU, writes collapsed state back. O(2^(2N)) per measurement.
+    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
+        let n = self.num_physical_qubits;
+        let sv_size = 1usize << (2 * n);
+        let mut results = Vec::with_capacity(qubits.len());
+
+        for &q in qubits {
+            let qubit = q.index();
+            let state = self.state_vector.state_f64();
+
+            // P(qubit = 1) = sum_k rho_{k,k} over k with bit_q(k) = 1.
+            // In the purification convention, rho_{k,k} = sum_i |psi[(k<<n)|i]|^2,
+            // so we sum over every Choi index whose system-row has bit_q set.
+            let qubit_mask = 1usize << qubit;
+            let prob_one: f64 = state
+                .iter()
+                .enumerate()
+                .filter(|(idx, _)| ((idx >> n) & qubit_mask) != 0)
+                .map(|(_, [re, im])| re * re + im * im)
+                .sum();
+
+            let is_deterministic = !(1e-10..=1.0 - 1e-10).contains(&prob_one);
+            let outcome = if is_deterministic {
+                prob_one > 0.5
+            } else {
+                self.rng.random_range(0.0..1.0) < prob_one
+            };
+
+            let target_bit = if outcome { qubit_mask } else { 0 };
+            let mut new_state = vec![[0.0f64, 0.0f64]; sv_size];
+            let mut norm_sq = 0.0;
+
+            for idx in 0..sv_size {
+                let row = idx >> n;
+                let col = idx & ((1 << n) - 1);
+                if (row & qubit_mask) == target_bit && (col & qubit_mask) == target_bit {
+                    new_state[idx] = state[idx];
+                    let [re, im] = state[idx];
+                    norm_sq += re * re + im * im;
+                }
+            }
+
+            if norm_sq > 1e-15 {
+                let norm = norm_sq.sqrt();
+                for amp in &mut new_state {
+                    amp[0] /= norm;
+                    amp[1] /= norm;
+                }
+            }
+
+            self.state_vector.write_state_f64(&new_state);
+
+            results.push(MeasurementResult {
+                outcome,
+                is_deterministic,
+            });
+        }
+
+        results
+    }
+}
+
+impl<SV: GpuStateVecBackend> ArbitraryRotationGateable for GpuDensityMatrix<SV> {
+    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        let offset = self.num_physical_qubits;
+        let sv = &mut self.state_vector;
+        for &q in qubits {
+            let system = [QubitId(q.index())];
+            let environment = [QubitId(q.index() + offset)];
+            sv.rx(theta, &system);
+            // Environment gets RX(-theta), built as Z * RX(theta) * Z.
+            sv.z(&environment);
+            sv.rx(theta, &environment);
+            sv.z(&environment);
+        }
+        self
+    }
+
+    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        // RY is real, so RY* = RY: both halves get the same rotation.
+        let offset = self.num_physical_qubits;
+        for &q in qubits {
+            let base = q.index();
+            for wire in [base, base + offset] {
+                self.state_vector.ry(theta, &[QubitId(wire)]);
+            }
+        }
+        self
+    }
+
+    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
+        let offset = self.num_physical_qubits;
+        let sv = &mut self.state_vector;
+        for &q in qubits {
+            let system = [QubitId(q.index())];
+            let environment = [QubitId(q.index() + offset)];
+            sv.rz(theta, &system);
+            // Environment gets RZ(-theta), built as X * RZ(theta) * X.
+            sv.x(&environment);
+            sv.rz(theta, &environment);
+            sv.x(&environment);
+        }
+        self
+    }
+
+    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // T* = Tdg: system gets T, the environment partner gets Tdg.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.t(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.tdg(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
+        // System gets Tdg; the environment partner gets T.
+        let forward = |sv: &mut SV, targets: &[QubitId]| {
+            sv.tdg(targets);
+        };
+        let mirrored = |sv: &mut SV, targets: &[QubitId]| {
+            sv.t(targets);
+        };
+        self.apply_1q_sys_env(qubits, forward, mirrored);
+        self
+    }
+
+    // NOTE: we deliberately do NOT override rxx/ryy here. The default trait
+    // impls decompose them into H-RZZ-H and SX-RZZ-SXdg sequences, which route
+    // through our overridden h/sx/rzz (correct sys/env handling per gate).
+    // The raw GpuStateVec RXX/RYY shaders have a pre-existing correctness bug
+    // (only half the basis pairs updated) -- keeping this decomposition until
+    // that's fixed.
+
+    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
+        // Environment gets RZZ(-theta) = (X tensor I) RZZ(theta) (X tensor I).
+        // X on just one qubit anticommutes with Z (x) Z; X on both would
+        // commute and leave the angle unflipped.
+        let offset = self.num_physical_qubits;
+        let sv = &mut self.state_vector;
+        for &(c, t) in pairs {
+            let (ci, ti) = (c.index(), t.index());
+            let env_control = [QubitId(ci + offset)];
+            let env_pair = [(QubitId(ci + offset), QubitId(ti + offset))];
+            sv.rzz(theta, &[(QubitId(ci), QubitId(ti))]);
+            sv.x(&env_control);
+            sv.rzz(theta, &env_pair);
+            sv.x(&env_control);
+        }
+        self
+    }
+}
+
+// =============================================================================
+// Tests
+// =============================================================================
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pecos_simulators::DensityMatrix;
+
+    // Primary tests run on the f32 backend (GpuDensityMatrix32), because the
+    // f64 backend has pre-existing shader bugs in RXX/RYY (RZZ: see the sv64
+    // regression test below). TOL ~1e-3 absorbs f32 error vs the f64 CPU sim.
+    const TOL: f64 = 1e-3;
+
+    fn gpu_dm_matrix<SV: GpuStateVecBackend>(sim: &mut GpuDensityMatrix<SV>) -> Vec<Complex64> {
+        let rho = sim.get_density_matrix();
+        rho.into_iter().flatten().collect() // concatenate outer items into one flat Vec
+    }
+
+    fn cpu_dm_matrix(sim: &mut DensityMatrix) -> Vec<Complex64> {
+        sim.get_density_matrix().into_iter().flatten().collect() // same layout as GPU helper
+    }
+
+    fn assert_dm_close(gpu: &[Complex64], cpu: &[Complex64], tol: f64, label: &str) {
+        assert_eq!(gpu.len(), cpu.len(), "{label}: dim mismatch");
+        for (i, (g, c)) in gpu.iter().zip(cpu.iter()).enumerate() {
+            let d = (g - c).norm(); // per-element |gpu - cpu|
+            assert!(d < tol, "{label}: idx {i} gpu={g} cpu={c} diff={d}");
+        }
+    }
+
+    // --- Regression: RZZ on |0000> for sv64 (was broken, now fixed) ---
+
+    #[test]
+    fn regression_sv64_rzz_on_zero() {
+        use crate::GpuStateVec64;
+        let Ok(mut sv) = GpuStateVec64::new(4) else {
+            return; // init failed (presumably no GPU): skip; same guard in all tests
+        };
+        let t = Angle64::from_radians(0.37);
+        sv.rzz(t, &[(QubitId(0), QubitId(1))]);
+        let state = sv.state();
+        let t_rad = t.to_radians_signed();
+        let (c, s) = ((t_rad / 2.0).cos(), (t_rad / 2.0).sin()); // expect amp[0] = c - i*s
+        let [re, im] = state[0];
+        assert!(
+            (re - c).abs() < 1e-5 && (im + s).abs() < 1e-5,
+            "sv64 rzz: ({re}, {im}) vs expected ({c}, {})",
+            -s
+        );
+    }
+
+    // --- f32 backend tests (primary) ---
+
+    #[test]
+    fn test_bell_state() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(2);
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        cpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "Bell",
+        );
+    }
+
+    #[test]
+    fn test_rzz_only() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(2);
+        let t = Angle64::from_radians(0.37);
+        gpu.h(&[QubitId(0), QubitId(1)])
+            .rzz(t, &[(QubitId(0), QubitId(1))]);
+        cpu.h(&[QubitId(0), QubitId(1)])
+            .rzz(t, &[(QubitId(0), QubitId(1))]);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "rzz",
+        );
+    }
+
+    #[test]
+    fn test_rotations_and_2q() {
+        // Exercises rz + rxx (default decomposition) + ryy (default
+        // decomposition) + cz + t in one circuit.
+        let Ok(mut gpu) = GpuDensityMatrix32::new(3) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(3);
+        let t = Angle64::from_radians(0.37);
+        gpu.h(&[QubitId(0), QubitId(1), QubitId(2)])
+            .rz(t, &[QubitId(0)])
+            .rxx(t, &[(QubitId(0), QubitId(1))])
+            .ryy(t, &[(QubitId(1), QubitId(2))])
+            .cz(&[(QubitId(0), QubitId(2))])
+            .t(&[QubitId(1)]);
+        cpu.h(&[QubitId(0), QubitId(1), QubitId(2)])
+            .rz(t, &[QubitId(0)])
+            .rxx(t, &[(QubitId(0), QubitId(1))])
+            .ryy(t, &[(QubitId(1), QubitId(2))])
+            .cz(&[(QubitId(0), QubitId(2))])
+            .t(&[QubitId(1)]);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "rot+2q",
+        );
+    }
+
+    #[test]
+    fn test_probability_and_purity() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        let p00 = gpu.probability(0);
+        let p11 = gpu.probability(3); // basis index 3 <-> |11>
+        assert!((p00 - 0.5).abs() < TOL, "P(00)={p00}");
+        assert!((p11 - 0.5).abs() < TOL, "P(11)={p11}");
+        assert!((gpu.purity() - 1.0).abs() < TOL);
+        assert!(gpu.is_pure());
+    }
+
+    #[test]
+    fn test_prepare_maximally_mixed() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        gpu.prepare_maximally_mixed();
+        assert!((gpu.purity() - 0.25).abs() < TOL); // I/4 has purity 1/4
+        for k in 0..4 {
+            assert!((gpu.probability(k) - 0.25).abs() < TOL);
+        }
+    }
+
+    #[test]
+    fn test_prepare_computational_basis() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        gpu.prepare_computational_basis(2);
+        assert!((gpu.probability(2) - 1.0).abs() < TOL);
+        assert!(gpu.probability(0) < TOL);
+        assert!(gpu.is_pure());
+    }
+
+    #[test]
+    fn test_phase_damping_matches_cpu() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(1) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(1);
+        gpu.h(&[QubitId(0)]);
+        cpu.h(&[QubitId(0)]);
+        gpu.apply_phase_damping(0, 0.5);
+        cpu.apply_phase_damping(0, 0.5);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "phase damp",
+        );
+    }
+
+    #[test]
+    fn test_phase_damping_preserves_diagonal() {
+        // Regression: after full dephasing of |+>, rho = I/2, so P(0)=P(1)=0.5.
+        // Pre-Cholesky implementations broke this identity.
+        let Ok(mut gpu) = GpuDensityMatrix32::new(1) else {
+            return;
+        };
+        gpu.h(&[QubitId(0)]);
+        gpu.apply_phase_damping(0, 1.0);
+        let p0 = gpu.probability(0);
+        let p1 = gpu.probability(1);
+        assert!((p0 - 0.5).abs() < TOL, "P(0)={p0}");
+        assert!((p1 - 0.5).abs() < TOL, "P(1)={p1}");
+        assert!((p0 + p1 - 1.0).abs() < TOL, "probabilities don't sum to 1");
+    }
+
+    #[test]
+    fn test_amplitude_damping_preserves_trace() {
+        // Regression: partial amp damping on |+><+| should keep tr(rho) = 1.
+        let Ok(mut gpu) = GpuDensityMatrix32::new(1) else {
+            return;
+        };
+        gpu.h(&[QubitId(0)]);
+        gpu.apply_amplitude_damping(0, 0.3);
+        let p0 = gpu.probability(0);
+        let p1 = gpu.probability(1);
+        assert!((p0 + p1 - 1.0).abs() < TOL, "tr(rho) = {} != 1", p0 + p1);
+        // Expected: rho_{00} = 0.5 + 0.5*0.3 = 0.65, rho_{11} = 0.5*0.7 = 0.35
+        assert!((p0 - 0.65).abs() < TOL, "P(0)={p0} expected 0.65");
+        assert!((p1 - 0.35).abs() < TOL, "P(1)={p1} expected 0.35");
+    }
+
+    #[test]
+    fn test_amplitude_damping_matches_cpu() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(2);
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        cpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        gpu.apply_amplitude_damping(0, 0.3);
+        cpu.apply_amplitude_damping(0, 0.3);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "amp damp",
+        );
+    }
+
+    #[test]
+    fn test_depolarizing_matches_cpu() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(2);
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        cpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        gpu.apply_depolarizing_noise(0, 0.2);
+        cpu.apply_depolarizing_noise(0, 0.2);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "depol",
+        );
+    }
+
+    #[test]
+    fn test_bit_flip_matches_cpu() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(2);
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        cpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        gpu.apply_bit_flip(1, 0.15);
+        cpu.apply_bit_flip(1, 0.15);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "bit flip",
+        );
+    }
+
+    #[test]
+    fn test_phase_flip_matches_cpu() {
+        let Ok(mut gpu) = GpuDensityMatrix32::new(2) else {
+            return;
+        };
+        let mut cpu = DensityMatrix::new(2);
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        cpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        gpu.apply_phase_flip(0, 0.25);
+        cpu.apply_phase_flip(0, 0.25);
+        assert_dm_close(
+            &gpu_dm_matrix(&mut gpu),
+            &cpu_dm_matrix(&mut cpu),
+            TOL,
+            "phase flip",
+        );
+    }
+}
diff --git a/crates/pecos-gpu-sims/src/gpu_influence_sampler.rs b/crates/pecos-gpu-sims/src/gpu_influence_sampler.rs
index 99cb808f1..b550c22ad 100644
--- a/crates/pecos-gpu-sims/src/gpu_influence_sampler.rs
+++ b/crates/pecos-gpu-sims/src/gpu_influence_sampler.rs
@@ -3,6 +3,7 @@
 //! Each thread handles ONE shot and ALL locations.
 //! This eliminates atomic contention since each shot has its own output region.
 
+use crate::gpu_probe::gpu_context;
 use bytemuck::{Pod, Zeroable};
 use pecos_random::{PecosRng, time_seed};
 use wgpu::util::DeviceExt;
@@ -177,22 +178,9 @@ impl GpuInfluenceSampler {
     }
 
     fn create_internal(map: &GpuInfluenceMapData, seed: u64) -> Result<Self, String> {
-        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::new_without_display_handle());
-
-        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
-            power_preference: wgpu::PowerPreference::HighPerformance,
-            compatible_surface: None,
-            force_fallback_adapter: false,
-        }))
-        .map_err(|_| "No GPU adapter found")?;
-
-        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-            label: Some("InfluenceSampler Device"),
-            required_features: wgpu::Features::empty(),
-            required_limits: adapter.limits(),
-            ..Default::default()
-        }))
-        .map_err(|e| format!("Failed to create device: {e}"))?;
+        let ctx = gpu_context().map_err(|e| e.to_string())?;
+        let device = ctx.device;
+        let queue = ctx.queue;
 
         // Helper to create buffer from data
         let create_buffer = |data: &[u32], label: &str| -> wgpu::Buffer {
@@ -659,8 +647,6 @@ impl GpuInfluenceSampler {
     }
 }
 
-crate::impl_gpu_drop!(GpuInfluenceSampler);
-
 /// Result from GPU sampling.
 pub struct GpuSamplingResult {
     pub num_shots: usize,
diff --git a/crates/pecos-gpu-sims/src/gpu_pauli_prop.rs b/crates/pecos-gpu-sims/src/gpu_pauli_prop.rs
index ce579b6ab..067fb8b26 100644
--- a/crates/pecos-gpu-sims/src/gpu_pauli_prop.rs
+++ b/crates/pecos-gpu-sims/src/gpu_pauli_prop.rs
@@ -38,6 +38,7 @@
 //! let flips = prop.measure_z_flips(&[0, 1]);
 //! ```
 
+use crate::gpu_probe::gpu_context;
 use pecos_random::{PecosRng, time_seed};
 use wgpu::util::DeviceExt;
 
@@ -132,23 +133,9 @@ impl GpuPauliProp {
     /// Returns an error if no GPU adapter is found or device creation fails.
     #[allow(clippy::cast_possible_truncation)] // GPU params: qubit/shot counts fit in u32
     pub fn with_seed(num_qubits: usize, num_shots: u32, seed: u64) -> Result<Self, String> {
-        // Initialize wgpu
-        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::new_without_display_handle());
-
-        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
-            power_preference: wgpu::PowerPreference::HighPerformance,
-            compatible_surface: None,
-            force_fallback_adapter: false,
-        }))
-        .map_err(|_| "No GPU adapter found")?;
-
-        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-            label: Some("PauliProp Device"),
-            required_features: wgpu::Features::empty(),
-            required_limits: adapter.limits(),
-            ..Default::default()
-        }))
-        .map_err(|e| format!("Failed to create device: {e}"))?;
+        let ctx = gpu_context().map_err(|e| e.to_string())?;
+        let device = ctx.device;
+        let queue = ctx.queue;
 
         // Calculate dimensions
         let shot_words = num_shots.div_ceil(32);
@@ -385,7 +372,9 @@ impl GpuPauliProp {
         }
     }
 
-    /// Apply X gate. Toggles X fault.
+    /// Apply Pauli X gate. For Pauli-fault tracking this is a no-op on the bit
+    /// pattern (X P X = +/- P). To inject an X fault instead, call
+    /// [`Self::inject_x_fault`].
     #[allow(clippy::cast_possible_truncation)] // qubit index fits in u32
     pub fn x(&mut self, qubits: &[usize]) {
         for &q in qubits {
@@ -393,7 +382,7 @@ impl GpuPauliProp {
         }
     }
 
-    /// Apply Y gate. Toggles both X and Z faults.
+    /// Apply Pauli Y gate. No-op on fault bits (Y P Y = +/- P).
     #[allow(clippy::cast_possible_truncation)] // qubit index fits in u32
     pub fn y(&mut self, qubits: &[usize]) {
         for &q in qubits {
@@ -401,7 +390,7 @@ impl GpuPauliProp {
         }
     }
 
-    /// Apply Z gate. Toggles Z fault.
+    /// Apply Pauli Z gate. No-op on fault bits (Z P Z = +/- P).
     #[allow(clippy::cast_possible_truncation)] // qubit index fits in u32
     pub fn z(&mut self, qubits: &[usize]) {
         for &q in qubits {
@@ -520,37 +509,36 @@ impl GpuPauliProp {
             return;
         }
 
-        // Separate 1Q and 2Q operations
-        let mut ops_1q: Vec<u32> = Vec::new();
-        let mut ops_2q: Vec<u32> = Vec::new();
+        // Dispatch each gate in its own shader invocation, in original order.
+        //
+        // This is required for correctness because (a) 1q and 2q gates do not
+        // commute in general so the previous reorder-into-two-batches broke
+        // semantics, and (b) two-qubit gates read the *other* qubit's fault
+        // state from the global buffer -- that read only sees values written
+        // back at the end of a dispatch, so a chain of 2q gates in a single
+        // dispatch would read stale global state for the chained qubits.
+        //
+        // Per-gate dispatch is O(n_gates) submits but keeps the shader simple
+        // and correct. 1q gates on distinct qubits could in principle be
+        // coalesced into one dispatch, but that needs a dependency tracker.
+        //
+        // No inter-gate `poll` is needed: `Queue::write_buffer` and
+        // `Queue::submit` on the same queue are sequenced. Each iteration's
+        // write happens before its submit, and that submit completes before
+        // the next iteration's write/submit run on the GPU. The CPU loop
+        // returns as soon as everything is queued; the actual GPU work is
+        // synced by the caller's `sync()` or readback.
+        let total_work_items = self.num_qubits as u32 * self.shot_words;
+        let workgroups = total_work_items.div_ceil(256);
 
         let mut i = 0;
         while i < self.gate_queue.len() {
             let gate_type = self.gate_queue[i];
-
-            // Check if this is a 2Q gate
-            let is_2q = matches!(gate_type, GATE_CX | GATE_CZ | GATE_SWAP);
-
-            // FAULT_DEPOL2 uses 4 words
             let op_len = if gate_type == FAULT_DEPOL2 { 4 } else { 3 };
 
-            if is_2q {
-                ops_2q.extend_from_slice(&self.gate_queue[i..i + op_len]);
-            } else {
-                ops_1q.extend_from_slice(&self.gate_queue[i..i + op_len]);
-            }
-
-            i += op_len;
-        }
-
-        let total_work_items = self.num_qubits as u32 * self.shot_words;
-        let workgroups = total_work_items.div_ceil(256);
-
-        // Dispatch 1Q operations first
-        if !ops_1q.is_empty() {
-            let mut queue_data = Vec::with_capacity(ops_1q.len() + 1);
-            queue_data.push(ops_1q.len() as u32);
-            queue_data.extend_from_slice(&ops_1q);
+            let mut queue_data = Vec::with_capacity(op_len + 1);
+            queue_data.push(op_len as u32);
+            queue_data.extend_from_slice(&self.gate_queue[i..i + op_len]);
 
             self.queue.write_buffer(
                 &self.gate_queue_buffer,
@@ -561,56 +549,20 @@ impl GpuPauliProp {
             let mut encoder = self
                 .device
                 .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                    label: Some("PauliProp 1Q Encoder"),
+                    label: Some("PauliProp gate encoder"),
                 });
-
             {
                 let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                    label: Some("PauliProp 1Q Pass"),
+                    label: Some("PauliProp gate pass"),
                     timestamp_writes: None,
                 });
                 pass.set_pipeline(&self.pipeline);
                 pass.set_bind_group(0, &self.bind_group, &[]);
                 pass.dispatch_workgroups(workgroups, 1, 1);
             }
-
             self.queue.submit(std::iter::once(encoder.finish()));
 
-            // Wait for 1Q to complete before 2Q
-            if !ops_2q.is_empty() {
-                let _ = self.device.poll(wgpu::PollType::wait_indefinitely());
-            }
-        }
-
-        // Dispatch 2Q operations
-        if !ops_2q.is_empty() {
-            let mut queue_data = Vec::with_capacity(ops_2q.len() + 1);
-            queue_data.push(ops_2q.len() as u32);
-            queue_data.extend_from_slice(&ops_2q);
-
-            self.queue.write_buffer(
-                &self.gate_queue_buffer,
-                0,
-                bytemuck::cast_slice(&queue_data),
-            );
-
-            let mut encoder = self
-                .device
-                .create_command_encoder(&wgpu::CommandEncoderDescriptor {
-                    label: Some("PauliProp 2Q Encoder"),
-                });
-
-            {
-                let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
-                    label: Some("PauliProp 2Q Pass"),
-                    timestamp_writes: None,
-                });
-                pass.set_pipeline(&self.pipeline);
-                pass.set_bind_group(0, &self.bind_group, &[]);
-                pass.dispatch_workgroups(workgroups, 1, 1);
-            }
-
-            self.queue.submit(std::iter::once(encoder.finish()));
+            i += op_len;
         }
 
         self.gate_queue.clear();
@@ -801,8 +753,6 @@ impl GpuPauliProp {
     }
 }
 
-crate::impl_gpu_drop!(GpuPauliProp);
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/pecos-gpu-sims/src/gpu_probe.rs b/crates/pecos-gpu-sims/src/gpu_probe.rs
index 9ed9bbc19..c3cbbbdf2 100644
--- a/crates/pecos-gpu-sims/src/gpu_probe.rs
+++ b/crates/pecos-gpu-sims/src/gpu_probe.rs
@@ -1,4 +1,17 @@
-//! Shared GPU startup probe utilities.
+//! Shared process-wide GPU context.
+//!
+//! All PECOS GPU simulators share a single `wgpu::Instance`, `wgpu::Adapter`,
+//! `wgpu::Device`, and `wgpu::Queue`. Creating one device per simulator used to
+//! trigger driver-level races when simulators ran in parallel (e.g. per-shot
+//! rayon parallelism or cargo's default parallel tests) and could SIGSEGV the
+//! Vulkan/wgpu stack.
+//!
+//! The shared context is initialized lazily on first access via `OnceLock`,
+//! requesting the superset of optional features we care about
+//! (`SHADER_F64`, `SUBGROUP`). Simulators that need a particular feature check
+//! the corresponding `supports_*` flag on the context.
+
+use std::sync::OnceLock;
 
 /// Adapter/device information for the selected default GPU backend.
 #[derive(Clone, Debug)]
@@ -8,12 +21,20 @@ pub struct GpuAdapterInfo {
     pub device_type: wgpu::DeviceType,
 }
 
-/// Device context returned by the default GPU startup probe.
-#[derive(Debug)]
+/// Shared GPU device context.
+///
+/// `wgpu::Device` and `wgpu::Queue` are internally reference-counted handles,
+/// so returning a cloned `GpuDeviceContext` from the process-wide singleton is
+/// cheap and all clones point at the same underlying device.
+#[derive(Clone, Debug)]
 pub struct GpuDeviceContext {
     pub info: GpuAdapterInfo,
     pub device: wgpu::Device,
     pub queue: wgpu::Queue,
+    /// True iff the device was created with `wgpu::Features::SHADER_F64`.
+    pub supports_f64: bool,
+    /// True iff the device was created with `wgpu::Features::SUBGROUP`.
+    pub supports_subgroup: bool,
 }
 
 /// Errors that can occur while creating the default GPU device.
@@ -38,13 +59,25 @@ impl std::fmt::Display for GpuStartupError {
 
 impl std::error::Error for GpuStartupError {}
 
-/// Request the default high-performance GPU adapter and device used by PECOS GPU sims.
+static GPU_CONTEXT: OnceLock<Result<GpuDeviceContext, GpuStartupError>> = OnceLock::new();
+
+/// Return a handle to the shared process-wide GPU context.
+///
+/// On first call, initializes the wgpu instance/adapter/device/queue. Later
+/// calls return cheap clones pointing at the same underlying device.
 ///
 /// # Errors
-/// Returns `GpuStartupError` if no GPU adapter is found or device creation fails.
-pub fn request_default_gpu_device(
-    label: &'static str,
-) -> Result<GpuDeviceContext, GpuStartupError> {
+/// Returns `GpuStartupError` if no suitable GPU adapter is found or device
+/// creation fails. The error is memoized: once initialization fails, every
+/// subsequent call returns a clone of the same error.
+pub fn gpu_context() -> Result<GpuDeviceContext, GpuStartupError> {
+    // `OnceLock` runs `init_gpu_context` at most once; the outcome (success
+    // or failure) is stored, and every caller receives a clone of that
+    // stored `Result`.
+    GPU_CONTEXT.get_or_init(init_gpu_context).clone()
+}
+
+fn init_gpu_context() -> Result<GpuDeviceContext, GpuStartupError> {
     let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::new_without_display_handle());
 
     let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
@@ -54,11 +87,11 @@ pub fn request_default_gpu_device(
     }))
     .map_err(|_| GpuStartupError::NoAdapter)?;
 
-    let info = adapter.get_info();
+    let adapter_raw_info = adapter.get_info();
     let info = GpuAdapterInfo {
-        name: info.name,
-        backend: info.backend,
-        device_type: info.device_type,
+        name: adapter_raw_info.name,
+        backend: adapter_raw_info.backend,
+        device_type: adapter_raw_info.device_type,
     };
 
     // Reject software renderers and unknown device types -- they technically
@@ -87,20 +120,33 @@ pub fn request_default_gpu_device(
         });
     }
 
+    // Opportunistically request optional features that individual simulators
+    // want. Intersecting with adapter.features() makes each optional: if the
+    // adapter cannot provide SHADER_F64, we still get a device without it and
+    // the f64 simulator will bail out with a clear error.
+    let optional = wgpu::Features::SHADER_F64 | wgpu::Features::SUBGROUP;
+    let adapter_features = adapter.features();
+    let required_features = optional & adapter_features;
+
     let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-        label: Some(label),
-        required_features: wgpu::Features::empty(),
-        required_limits: adapter.limits(),
-        ..Default::default()
+        label: Some("PECOS shared GPU device"),
+        required_features,
+        required_limits: limits,
+        memory_hints: wgpu::MemoryHints::Performance,
+        trace: wgpu::Trace::Off,
+        experimental_features: wgpu::ExperimentalFeatures::default(),
     }))
     .map_err(|error| GpuStartupError::DeviceCreation {
         info: info.clone(),
         error: error.to_string(),
     })?;
 
+    let device_features = device.features();
     Ok(GpuDeviceContext {
         info,
         device,
         queue,
+        supports_f64: device_features.contains(wgpu::Features::SHADER_F64),
+        supports_subgroup: device_features.contains(wgpu::Features::SUBGROUP),
     })
 }
diff --git a/crates/pecos-gpu-sims/src/gpu_sampler.rs b/crates/pecos-gpu-sims/src/gpu_sampler.rs
index 1deb8fc7b..0b25f6f5f 100644
--- a/crates/pecos-gpu-sims/src/gpu_sampler.rs
+++ b/crates/pecos-gpu-sims/src/gpu_sampler.rs
@@ -43,6 +43,7 @@
 // Converting probability (f64) to fixed-point threshold (u32) is intentional
 #![allow(clippy::cast_sign_loss)]
 
+use crate::gpu_probe::gpu_context;
 use pecos_random::PecosRng;
 use pecos_simulators::measurement_sampler::MeasurementKind;
 
@@ -281,23 +282,9 @@ impl GpuMeasurementSampler {
             }
         }
 
-        // Initialize wgpu
-        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::new_without_display_handle());
-
-        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
-            power_preference: wgpu::PowerPreference::HighPerformance,
-            compatible_surface: None,
-            force_fallback_adapter: false,
-        }))
-        .map_err(|_| "No GPU adapter found")?;
-
-        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-            label: Some("GpuMeasurementSampler Device"),
-            required_features: wgpu::Features::empty(),
-            required_limits: adapter.limits(),
-            ..Default::default()
-        }))
-        .map_err(|e| format!("Failed to create device: {e}"))?;
+        let ctx = gpu_context().map_err(|e| e.to_string())?;
+        let device = ctx.device;
+        let queue = ctx.queue;
 
         let num_measurements = measurements.len() as u32;
         let num_words = max_shots.div_ceil(32) as u32;
@@ -972,8 +959,6 @@ impl GpuMeasurementSampler {
     }
 }
 
-crate::impl_gpu_drop!(GpuMeasurementSampler);
-
 #[cfg(test)]
 #[allow(clippy::cast_precision_loss)] // Test code computes ratios from counts
 mod tests {
diff --git a/crates/pecos-gpu-sims/src/gpu_stab.rs b/crates/pecos-gpu-sims/src/gpu_stab.rs
index aec01cfc4..6013210d2 100644
--- a/crates/pecos-gpu-sims/src/gpu_stab.rs
+++ b/crates/pecos-gpu-sims/src/gpu_stab.rs
@@ -10,6 +10,7 @@
 
 use crate::circuit_compiler::{CircuitCompiler, Gate as CompiledGate};
 use crate::clifford_fusion::CliffordFuser;
+use crate::gpu_probe::gpu_context;
 use pecos_core::QubitId;
 use pecos_random::{PecosRng, Rng, SeedableRng};
 use pecos_simulators::{CliffordGateable, MeasurementResult, QuantumSimulator};
@@ -286,34 +287,9 @@ impl<R: Rng + SeedableRng + Debug> GpuStab<R> {
         const MAX_BATCH_QUBITS: u64 = 32 * 1024;
         let rng = R::seed_from_u64(seed);
 
-        // Initialize wgpu
-        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::new_without_display_handle());
-
-        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
-            power_preference: wgpu::PowerPreference::HighPerformance,
-            compatible_surface: None,
-            force_fallback_adapter: false,
-        }))
-        .map_err(|_| "No GPU adapter found")?;
-
-        // Check if subgroups are supported for optimized measurement
-        let adapter_features = adapter.features();
-        let has_subgroups = adapter_features.contains(wgpu::Features::SUBGROUP);
-
-        // Request subgroup feature if available
-        let required_features = if has_subgroups {
-            wgpu::Features::SUBGROUP
-        } else {
-            wgpu::Features::empty()
-        };
-
-        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
-            label: Some("GpuStab Device"),
-            required_features,
-            required_limits: adapter.limits(),
-            ..Default::default()
-        }))
-        .map_err(|e| format!("Failed to create device: {e}"))?;
+        let ctx = gpu_context().map_err(|e| e.to_string())?;
+        let device = ctx.device;
+        let queue = ctx.queue;
 
         if num_qubits > 0x3FFF {
             return Err(format!(
@@ -2499,8 +2475,6 @@ impl<R: Rng + SeedableRng + Debug> Debug for GpuStab<R> {
     }
 }
 
-crate::impl_gpu_drop!(GpuStab<R>, R: Rng + SeedableRng);
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/pecos-gpu-sims/src/gpu_stab_multi.rs b/crates/pecos-gpu-sims/src/gpu_stab_multi.rs
index 664b056c4..283ebf8ea 100644
--- a/crates/pecos-gpu-sims/src/gpu_stab_multi.rs
+++ b/crates/pecos-gpu-sims/src/gpu_stab_multi.rs
@@ -4,7 +4,7 @@
 //! All shots process the same circuit, but with independent random outcomes.
 //! This is ideal for Monte Carlo sampling where many shots are needed.
 
-use crate::gpu_probe::request_default_gpu_device;
+use crate::gpu_probe::gpu_context;
 use pecos_core::QubitId;
 use pecos_random::{PecosRng, Rng, SeedableRng};
 use std::fmt::Debug;
@@ -100,8 +100,7 @@ impl<R: Rng + SeedableRng + Debug> GpuStabMulti<R> {
     /// Returns an error if no GPU adapter is found or buffer allocation exceeds device limits.
     #[allow(clippy::cast_possible_truncation)] // GPU params: qubit/shot counts fit in u32
     pub fn with_seed(num_qubits: usize, num_shots: usize, seed: u64) -> Result<Self, String> {
-        let gpu = request_default_gpu_device("GPU Stab Multi Device")
-            .map_err(|error| error.to_string())?;
+        let gpu = gpu_context().map_err(|error| error.to_string())?;
         let device = gpu.device;
         let queue = gpu.queue;
 
@@ -1681,9 +1680,32 @@ impl<R: Rng + SeedableRng + Debug> GpuStabMulti<R> {
     /// // results[shot][measurement_index] = true/false
     /// ```
     pub fn mz_queue(&mut self, qubits: &[QubitId]) {
-        let batch_shots = self.shots_per_batch;
+        // Mid-circuit semantics: each mz_queue call must observe the state at
+        // the TIME of the call. Queued measurements normally wait for mz_fetch
+        // (or flush), but any queued by an EARLIER call must be executed here,
+        // BEFORE applying gates that arrived between then and now -- those
+        // prior measurements expect to see the state at their own call point.
+        //
+        // Order within this call:
+        //   1) execute any prior meas_queue items against the current
+        //      un-flushed state buffer (which still reflects the state at
+        //      their queue-time, modulo intervening non-measurement no-ops),
+        //   2) flush pending gates so the NEW measurements below will see
+        //      the state at this call point when later executed,
+        //   3) append the new qubits to meas_queue.
+        //
+        // Before the fix, mz_queue only appended qubit indices; mz_fetch
+        // later flushed every queued gate first, so every queued measurement
+        // saw the same final state regardless of when it was queued.
+        if !self.meas_queue.is_empty() {
+            let prev_qubits = std::mem::take(&mut self.meas_queue);
+            let prev_random = std::mem::take(&mut self.meas_queue_random_bits);
+            let prev_results = self.mz_gpu_sequential(&prev_qubits, &prev_random);
+            self.meas_pending_results.push(prev_results);
+        }
+        self.flush_gates();
 
-        // Pre-generate random bits for each qubit to measure
+        let batch_shots = self.shots_per_batch;
         for &qubit in qubits {
             let qubit = qubit.index();
             let random_bits: Vec<u32> = (0..batch_shots)
@@ -1763,12 +1785,18 @@ impl<R: Rng + SeedableRng + Debug> GpuStabMulti<R> {
         combined
     }
 
-    /// Check if there are queued measurements waiting to be processed.
+    /// Returns `true` while `meas_queue` holds measurements that have not yet
+    /// been executed against the state. Measurements that were already
+    /// eagerly executed (by a subsequent `mz_queue` or flush) but not yet
+    /// fetched are NOT counted -- the next `mz_fetch` still returns them.
     pub fn has_queued_measurements(&self) -> bool {
         !self.meas_queue.is_empty()
     }
 
-    /// Get the number of queued measurements.
+    /// Returns the length of `meas_queue`: the measurements queued but not yet
+    /// executed. Because each `mz_queue` call eagerly executes everything
+    /// queued before it and only then appends its own qubits, this is the
+    /// size of the most recent batch, not the total awaiting `mz_fetch`.
     pub fn queued_measurement_count(&self) -> usize {
         self.meas_queue.len()
     }
@@ -1938,8 +1966,6 @@ impl<R: Rng + SeedableRng + Debug> GpuStabMulti<R> {
     }
 }
 
-crate::impl_gpu_drop!(GpuStabMulti<R>, R: Rng + SeedableRng);
-
 /// PCG-style hash for deterministic noise (CPU version, matches shader)
 fn hash_noise_cpu(seed: u32, gate_idx: u32, qubit: u32) -> u32 {
     let mut h = seed ^ (gate_idx.wrapping_mul(0x9E37_79B9)) ^ (qubit.wrapping_mul(0x85EB_CA6B));
@@ -3201,19 +3227,21 @@ mod tests {
         let num_shots = 64;
         let mut sim = GpuStabMulti::<PecosRng>::with_seed(5, num_shots, 42).unwrap();
 
-        // Queue measurements in multiple calls
+        // Queue measurements in multiple calls. With mid-circuit eager
+        // semantics, each new mz_queue drains the previous queue, so
+        // queued_measurement_count() reflects only the latest batch.
         sim.x(&qid(0)); // Put qubit 0 in |1>
         sim.mz_queue(&[QubitId(0)]);
+        assert_eq!(sim.queued_measurement_count(), 1);
 
         sim.x(&qid(2)); // Put qubit 2 in |1>
         sim.mz_queue(&[QubitId(1), QubitId(2)]);
+        assert_eq!(sim.queued_measurement_count(), 2);
 
         sim.mz_queue(&[QubitId(3), QubitId(4)]);
+        assert_eq!(sim.queued_measurement_count(), 2);
 
-        // Should have 5 measurements queued
-        assert_eq!(sim.queued_measurement_count(), 5);
-
-        // Fetch all results
+        // Fetch returns the full 5 measurements accumulated across all calls.
         let results = sim.mz_fetch();
 
         assert_eq!(results.len(), num_shots);
diff --git a/crates/pecos-gpu-sims/src/lib.rs b/crates/pecos-gpu-sims/src/lib.rs
index 19f72abe2..b734d4cb3 100644
--- a/crates/pecos-gpu-sims/src/lib.rs
+++ b/crates/pecos-gpu-sims/src/lib.rs
@@ -10,47 +10,36 @@
 //!
 //! # Simulators
 //!
-//! - [`GpuStateVec`]: State vector simulator for universal quantum circuits
+//! - [`GpuStateVec`] / [`GpuStateVec64`]: State vector simulator (f64 precision, default)
+//! - [`GpuStateVec32`]: State vector simulator (f32 precision, faster)
 //! - [`GpuStab`]: Stabilizer tableau simulator for Clifford circuits (experimental)
 //!
 //! # Example
 //!
+//! `GpuStateVec` aliases the f64 backend, which requires `SHADER_F64`. On
+//! adapters without f64 support (e.g. Metal on Apple Silicon) `new()` returns
+//! [`GpuError::UnsupportedFeature`]; the doctest below returns early in that
+//! case and exercises real GPU code wherever f64 is available. Use
+//! [`GpuStateVec32`] for a universally portable f32 backend.
+//!
 //! ```
 //! use pecos_gpu_sims::GpuStateVec;
 //! use pecos_simulators::CliffordGateable;
 //! use pecos_core::{qid, QubitId};
 //!
-//! let mut sim = GpuStateVec::new(4).unwrap(); // 4 qubits
+//! // Skip cleanly on platforms without a GPU or without SHADER_F64.
+//! let Ok(mut sim) = GpuStateVec::new(4) else { return };
 //! sim.h(&qid(0));         // Hadamard on qubit 0
 //! sim.cx(&[(QubitId(0), QubitId(1))]);    // CNOT with control=0, target=1
-//! let result = sim.mz(&[QubitId(0)]);  // Measure qubit 0
+//! let _result = sim.mz(&[QubitId(0)]);  // Measure qubit 0
 //! ```
 
-/// Implement Drop to poll the wgpu device before resources are freed.
-/// All GPU simulator types that own a `device: Arc<wgpu::Device>` need this
-/// to prevent resource cleanup races.
-macro_rules! impl_gpu_drop {
-    ($ty:ty) => {
-        impl Drop for $ty {
-            fn drop(&mut self) {
-                let _ = self.device.poll(wgpu::PollType::wait_indefinitely());
-            }
-        }
-    };
-    ($ty:ty, $($bound:tt)+) => {
-        impl<$($bound)+> Drop for $ty {
-            fn drop(&mut self) {
-                let _ = self.device.poll(wgpu::PollType::wait_indefinitely());
-            }
-        }
-    };
-}
-
-pub(crate) use impl_gpu_drop;
-
 pub mod circuit_compiler;
 mod clifford_fusion;
 mod gpu;
+mod gpu64;
+mod gpu_auto;
+mod gpu_density_matrix;
 mod gpu_influence_sampler;
 mod gpu_noisy_sampler;
 mod gpu_pauli_prop;
@@ -64,7 +53,19 @@ pub mod prelude;
 mod gpu_sampler_validation;
 
 pub use circuit_compiler::{CompiledCircuit, Gate as CompiledGate, GateType};
-pub use gpu::{GpuError, GpuStateVec};
+pub use gpu::{GpuError, GpuStateVec32, RequiredFeature};
+pub use gpu_auto::GpuStateVecAuto;
+pub use gpu_density_matrix::{
+    GpuDensityMatrix, GpuDensityMatrix32, GpuDensityMatrix64, GpuStateVecBackend,
+};
+pub use gpu64::GpuStateVec64;
+
+/// Default GPU state vector simulator: an alias for [`GpuStateVec64`] (f64).
+///
+/// Use [`GpuStateVec32`] for f32 precision (faster but less accurate), or
+/// [`GpuStateVecAuto`] to opt in to runtime precision selection (tries f64
+/// first, falls back to f32 on adapters without `SHADER_F64`).
+pub type GpuStateVec = GpuStateVec64;
 pub use gpu_influence_sampler::{GpuInfluenceMapData, GpuInfluenceSampler, GpuSamplingResult};
 pub use gpu_noisy_sampler::{
     BiasedDepolarizingNoiseSampler, CircuitBuilder, CircuitOp, DepolarizingNoiseSampler,
@@ -145,6 +146,12 @@ pub mod gates {
     /// SX-dagger gate
     pub const SXDG: [f32; 8] = [0.5, -0.5, 0.5, 0.5, 0.5, 0.5, 0.5, -0.5];
 
+    /// SY gate (sqrt(Y)): (1+i)/2 * [[1, -1], [1, 1]], read as row-major (re, im) pairs
+    pub const SY: [f32; 8] = [0.5, 0.5, -0.5, -0.5, 0.5, 0.5, 0.5, 0.5];
+
+    /// SY-dagger gate: (1-i)/2 * [[1, 1], [-1, 1]], read as row-major (re, im) pairs
+    pub const SYDG: [f32; 8] = [0.5, -0.5, 0.5, -0.5, -0.5, 0.5, 0.5, -0.5];
+
     /// Create RX(theta) gate matrix
     #[must_use]
     pub fn rx(theta: f64) -> [f32; 8] {
diff --git a/crates/pecos-gpu-sims/src/pauli_prop_shader.wgsl b/crates/pecos-gpu-sims/src/pauli_prop_shader.wgsl
index cd6fd4e46..b5f1bf0ca 100644
--- a/crates/pecos-gpu-sims/src/pauli_prop_shader.wgsl
+++ b/crates/pecos-gpu-sims/src/pauli_prop_shader.wgsl
@@ -90,17 +90,15 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
             local_z = local_z ^ local_x;
         }
         else if (gate_type == GATE_X && qubit1 == qubit) {
-            // X: toggle X fault
-            local_x = ~local_x;
+            // Pauli X gate conjugation on a Pauli P (tracked-fault): X P X = +/- P
+            // with unchanged X/Z bit pattern. No change to local_x / local_z.
+            // (To inject an X fault, use FAULT_X; distinct from GATE_X.)
         }
         else if (gate_type == GATE_Y && qubit1 == qubit) {
-            // Y: toggle both X and Z
-            local_x = ~local_x;
-            local_z = ~local_z;
+            // Pauli Y conjugation: Y P Y = +/- P, bits unchanged.
         }
         else if (gate_type == GATE_Z && qubit1 == qubit) {
-            // Z: toggle Z fault
-            local_z = ~local_z;
+            // Pauli Z conjugation: Z P Z = +/- P, bits unchanged.
         }
         // Two-qubit gates: need to read from the other qubit
         else if (gate_type == GATE_CX) {
diff --git a/crates/pecos-gpu-sims/src/persistent_kernel_f32.wgsl b/crates/pecos-gpu-sims/src/persistent_kernel_f32.wgsl
new file mode 100644
index 000000000..3736a2414
--- /dev/null
+++ b/crates/pecos-gpu-sims/src/persistent_kernel_f32.wgsl
@@ -0,0 +1,169 @@
+// Persistent kernel for f32 state vectors.
+// Copies the state into workgroup memory, applies every queued gate there, then writes it back.
+// {SHARED_SIZE} is a placeholder substituted at runtime from the GPU's actual shared memory.
+
+@group(0) @binding(0)
+var<storage, read_write> state: array<vec2<f32>>; // one complex amplitude per element: (re, im)
+
+@group(0) @binding(5)
+var<storage, read> gate_queue_buf: array<u32>; // [num_gates, num_qubits], then GATE_STRIDE words per gate
+
+var<workgroup> shared_state: array<vec2<f32>, {SHARED_SIZE}>; // working copy of `state`
+
+fn cmul(a: vec2<f32>, b: vec2<f32>) -> vec2<f32> { // complex product; each vec2 is (re, im)
+    return vec2<f32>(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
+}
+
+const GATE_SINGLE: u32 = 0u; // gate type codes (word 0 of a record); dense 2x2 matrix a, b, c, d in words 4..11
+const GATE_DIAGONAL: u32 = 1u; // diagonal matrix: only a (words 4..5) and d (words 10..11) are read
+const GATE_CX: u32 = 2u;
+const GATE_CY: u32 = 3u;
+const GATE_CZ: u32 = 4u;
+const GATE_SWAP: u32 = 5u;
+const GATE_RXX: u32 = 6u; // angle theta as f32 bits in word 4
+const GATE_RYY: u32 = 7u; // angle theta as f32 bits in word 4
+const GATE_RZZ: u32 = 8u; // angle theta as f32 bits in word 4
+const GATE_STRIDE: u32 = 12u; // u32 words per gate record
+
+@compute @workgroup_size(256)
+fn apply_gate_queue_persistent( // applies the whole gate queue to a workgroup-memory copy of `state`
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    let tid = local_id.x; // NOTE(review): only the local id is used -- presumably a single-workgroup dispatch; confirm in host code
+    let num_gates = gate_queue_buf[0];
+    let num_qubits = gate_queue_buf[1];
+    let num_amplitudes = 1u << num_qubits;
+
+    for (var i = tid; i < num_amplitudes; i += 256u) { // 256 = workgroup size; threads stride over the state
+        shared_state[i] = state[i];
+    }
+    workgroupBarrier(); // the full copy must be visible before any gate reads it
+
+    for (var g = 0u; g < num_gates; g++) {
+        let base = 2u + g * GATE_STRIDE; // skip the 2-word header
+        let gate_type = gate_queue_buf[base];
+        let tgt = gate_queue_buf[base + 1u];
+        let ctrl = gate_queue_buf[base + 2u];
+        let num_pairs = num_amplitudes >> 1u;
+
+        switch (gate_type) {
+            case GATE_SINGLE: {
+                let a = vec2<f32>(bitcast<f32>(gate_queue_buf[base + 4u]), bitcast<f32>(gate_queue_buf[base + 5u]));
+                let b = vec2<f32>(bitcast<f32>(gate_queue_buf[base + 6u]), bitcast<f32>(gate_queue_buf[base + 7u]));
+                let c = vec2<f32>(bitcast<f32>(gate_queue_buf[base + 8u]), bitcast<f32>(gate_queue_buf[base + 9u]));
+                let d = vec2<f32>(bitcast<f32>(gate_queue_buf[base + 10u]), bitcast<f32>(gate_queue_buf[base + 11u]));
+
+                let low_mask = (1u << tgt) - 1u;
+                for (var pair_idx = tid; pair_idx < num_pairs; pair_idx += 256u) {
+                    let high_bits = pair_idx >> tgt;
+                    let low_bits = pair_idx & low_mask;
+                    let idx0 = (high_bits << (tgt + 1u)) | low_bits; // insert a 0 at bit position tgt
+                    let idx1 = idx0 | (1u << tgt);
+                    let amp0 = shared_state[idx0];
+                    let amp1 = shared_state[idx1];
+                    shared_state[idx0] = cmul(a, amp0) + cmul(b, amp1);
+                    shared_state[idx1] = cmul(c, amp0) + cmul(d, amp1);
+                }
+            }
+            case GATE_DIAGONAL: {
+                let a = vec2<f32>(bitcast<f32>(gate_queue_buf[base + 4u]), bitcast<f32>(gate_queue_buf[base + 5u]));
+                let d = vec2<f32>(bitcast<f32>(gate_queue_buf[base + 10u]), bitcast<f32>(gate_queue_buf[base + 11u]));
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let bit = (i >> tgt) & 1u;
+                    let phase = select(a, d, bit == 1u);
+                    shared_state[i] = cmul(phase, shared_state[i]);
+                }
+            }
+            case GATE_CX: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    if ((i & (1u << ctrl)) != 0u && (i & (1u << tgt)) == 0u) { // one thread per pair
+                        let partner = i | (1u << tgt);
+                        let tmp = shared_state[i];
+                        shared_state[i] = shared_state[partner];
+                        shared_state[partner] = tmp;
+                    }
+                }
+            }
+            case GATE_CY: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    if ((i & (1u << ctrl)) != 0u && (i & (1u << tgt)) == 0u) {
+                        let partner = i | (1u << tgt);
+                        let amp0 = shared_state[i];
+                        let amp1 = shared_state[partner];
+                        shared_state[i] = vec2<f32>(amp1.y, -amp1.x); // -i * amp1
+                        shared_state[partner] = vec2<f32>(-amp0.y, amp0.x); // +i * amp0
+                    }
+                }
+            }
+            case GATE_CZ: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    if ((i & (1u << ctrl)) != 0u && (i & (1u << tgt)) != 0u) {
+                        shared_state[i] = -shared_state[i];
+                    }
+                }
+            }
+            case GATE_SWAP: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let bit_a = (i & (1u << ctrl)) != 0u;
+                    let bit_b = (i & (1u << tgt)) != 0u;
+                    if (!bit_a && bit_b) {
+                        let partner = (i & ~(1u << tgt)) | (1u << ctrl);
+                        let tmp = shared_state[i];
+                        shared_state[i] = shared_state[partner];
+                        shared_state[partner] = tmp;
+                    }
+                }
+            }
+            case GATE_RXX: {
+                let theta = bitcast<f32>(gate_queue_buf[base + 4u]);
+                let c_val = cos(theta / 2.0);
+                let s_val = sin(theta / 2.0);
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let partner = i ^ (1u << ctrl) ^ (1u << tgt); // flip both qubit bits
+                    if (i < partner) {
+                        let amp0 = shared_state[i];
+                        let amp1 = shared_state[partner];
+                        shared_state[i] = vec2<f32>(amp0.x * c_val + amp1.y * s_val, amp0.y * c_val - amp1.x * s_val);
+                        shared_state[partner] = vec2<f32>(amp1.x * c_val + amp0.y * s_val, amp1.y * c_val - amp0.x * s_val);
+                    }
+                }
+            }
+            case GATE_RYY: {
+                let theta = bitcast<f32>(gate_queue_buf[base + 4u]);
+                let c_val = cos(theta / 2.0);
+                let s_abs = sin(theta / 2.0);
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let partner = i ^ (1u << ctrl) ^ (1u << tgt);
+                    if (i < partner) {
+                        let bit_a = (i & (1u << ctrl)) != 0u;
+                        let bit_b = (i & (1u << tgt)) != 0u;
+                        let s_val = select(s_abs, -s_abs, bit_a == bit_b); // sign flips when both bits are equal
+                        let amp0 = shared_state[i];
+                        let amp1 = shared_state[partner];
+                        shared_state[i] = vec2<f32>(amp0.x * c_val + amp1.y * s_val, amp0.y * c_val - amp1.x * s_val);
+                        shared_state[partner] = vec2<f32>(amp1.x * c_val + amp0.y * s_val, amp1.y * c_val - amp0.x * s_val);
+                    }
+                }
+            }
+            case GATE_RZZ: {
+                // cos/sin are hoisted out of the loop: only the sign of the sine depends on the bit parity.
+                let theta = bitcast<f32>(gate_queue_buf[base + 4u]);
+                let c_val = cos(theta / 2.0);
+                let s_abs = sin(theta / 2.0);
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let q1_set = (i & (1u << ctrl)) != 0u;
+                    let q2_set = (i & (1u << tgt)) != 0u;
+                    let s_val = select(s_abs, -s_abs, q1_set == q2_set);
+                    let amp = shared_state[i];
+                    shared_state[i] = vec2<f32>(amp.x * c_val - amp.y * s_val, amp.x * s_val + amp.y * c_val);
+                }
+            }
+            default: {}
+        }
+        workgroupBarrier(); // every thread finishes gate g before gate g + 1 reads shared_state
+    }
+
+    for (var i = tid; i < num_amplitudes; i += 256u) {
+        state[i] = shared_state[i];
+    }
+}
diff --git a/crates/pecos-gpu-sims/src/persistent_kernel_f64.wgsl b/crates/pecos-gpu-sims/src/persistent_kernel_f64.wgsl
new file mode 100644
index 000000000..921fe3c75
--- /dev/null
+++ b/crates/pecos-gpu-sims/src/persistent_kernel_f64.wgsl
@@ -0,0 +1,168 @@
+// Persistent kernel for f64 state vectors.
+// {SHARED_SIZE} is a placeholder substituted at runtime from the GPU's actual shared memory.
+
+@group(0) @binding(0)
+var<storage, read_write> state: array<vec2<f64>>; // one complex amplitude per element: (re, im)
+
+// Gate queue stored as array<f64>; integer metadata is stored as f64 and converted back with u32().
+// Layout per gate (12 f64): [type, tgt, ctrl, pad, a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im]
+// (RXX/RYY/RZZ read slots 4 and 5 as a cos/sin pair instead). Header: [num_gates, num_qubits] as f64.
+@group(0) @binding(5)
+var<storage, read> gate_queue_f64: array<f64>;
+
+var<workgroup> shared_state: array<vec2<f64>, {SHARED_SIZE}>; // working copy of `state`
+
+fn cmul(a: vec2<f64>, b: vec2<f64>) -> vec2<f64> { // complex product; each vec2 is (re, im)
+    return vec2<f64>(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
+}
+
+const GATE_SINGLE: u32 = 0u; // gate type codes (slot 0 of a record); dense 2x2 matrix a, b, c, d in slots 4..11
+const GATE_DIAGONAL: u32 = 1u; // diagonal matrix: only a (slots 4..5) and d (slots 10..11) are read
+const GATE_CX: u32 = 2u;
+const GATE_CY: u32 = 3u;
+const GATE_CZ: u32 = 4u;
+const GATE_SWAP: u32 = 5u;
+const GATE_RXX: u32 = 6u; // slots 4, 5 hold a precomputed cos/sin pair
+const GATE_RYY: u32 = 7u; // slots 4, 5 hold a precomputed cos/sin pair
+const GATE_RZZ: u32 = 8u; // slots 4, 5 hold a precomputed cos/sin pair
+const GATE_STRIDE: u32 = 12u; // f64 slots per gate record
+
+@compute @workgroup_size(256)
+fn apply_gate_queue_persistent( // applies the whole gate queue to a workgroup-memory copy of `state`
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    let tid = local_id.x; // NOTE(review): only the local id is used -- presumably a single-workgroup dispatch; confirm in host code
+    let num_gates = u32(gate_queue_f64[0]); // header values are stored as f64
+    let num_qubits = u32(gate_queue_f64[1]);
+    let num_amplitudes = 1u << num_qubits;
+
+    for (var i = tid; i < num_amplitudes; i += 256u) { // 256 = workgroup size; threads stride over the state
+        shared_state[i] = state[i];
+    }
+    workgroupBarrier(); // the full copy must be visible before any gate reads it
+
+    for (var g = 0u; g < num_gates; g++) {
+        let base = 2u + g * GATE_STRIDE; // skip the 2-slot header
+        let gate_type = u32(gate_queue_f64[base]);
+        let tgt = u32(gate_queue_f64[base + 1u]);
+        let ctrl = u32(gate_queue_f64[base + 2u]);
+        let num_pairs = num_amplitudes >> 1u;
+
+        switch (gate_type) {
+            case GATE_SINGLE: {
+                let a = vec2<f64>(gate_queue_f64[base + 4u], gate_queue_f64[base + 5u]);
+                let b = vec2<f64>(gate_queue_f64[base + 6u], gate_queue_f64[base + 7u]);
+                let c = vec2<f64>(gate_queue_f64[base + 8u], gate_queue_f64[base + 9u]);
+                let d = vec2<f64>(gate_queue_f64[base + 10u], gate_queue_f64[base + 11u]);
+
+                let low_mask = (1u << tgt) - 1u;
+                for (var pair_idx = tid; pair_idx < num_pairs; pair_idx += 256u) {
+                    let high_bits = pair_idx >> tgt;
+                    let low_bits = pair_idx & low_mask;
+                    let idx0 = (high_bits << (tgt + 1u)) | low_bits; // insert a 0 at bit position tgt
+                    let idx1 = idx0 | (1u << tgt);
+                    let amp0 = shared_state[idx0];
+                    let amp1 = shared_state[idx1];
+                    shared_state[idx0] = cmul(a, amp0) + cmul(b, amp1);
+                    shared_state[idx1] = cmul(c, amp0) + cmul(d, amp1);
+                }
+            }
+            case GATE_DIAGONAL: {
+                let a = vec2<f64>(gate_queue_f64[base + 4u], gate_queue_f64[base + 5u]);
+                let d = vec2<f64>(gate_queue_f64[base + 10u], gate_queue_f64[base + 11u]);
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let bit = (i >> tgt) & 1u;
+                    let phase = select(a, d, bit == 1u);
+                    shared_state[i] = cmul(phase, shared_state[i]);
+                }
+            }
+            case GATE_CX: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    if ((i & (1u << ctrl)) != 0u && (i & (1u << tgt)) == 0u) { // one thread per pair
+                        let partner = i | (1u << tgt);
+                        let tmp = shared_state[i];
+                        shared_state[i] = shared_state[partner];
+                        shared_state[partner] = tmp;
+                    }
+                }
+            }
+            case GATE_CY: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    if ((i & (1u << ctrl)) != 0u && (i & (1u << tgt)) == 0u) {
+                        let partner = i | (1u << tgt);
+                        let amp0 = shared_state[i];
+                        let amp1 = shared_state[partner];
+                        shared_state[i] = vec2<f64>(amp1.y, -amp1.x); // -i * amp1
+                        shared_state[partner] = vec2<f64>(-amp0.y, amp0.x); // +i * amp0
+                    }
+                }
+            }
+            case GATE_CZ: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    if ((i & (1u << ctrl)) != 0u && (i & (1u << tgt)) != 0u) {
+                        shared_state[i] = -shared_state[i];
+                    }
+                }
+            }
+            case GATE_SWAP: {
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let bit_a = (i & (1u << ctrl)) != 0u;
+                    let bit_b = (i & (1u << tgt)) != 0u;
+                    if (!bit_a && bit_b) {
+                        let partner = (i & ~(1u << tgt)) | (1u << ctrl);
+                        let tmp = shared_state[i];
+                        shared_state[i] = shared_state[partner];
+                        shared_state[partner] = tmp;
+                    }
+                }
+            }
+            case GATE_RXX: {
+                // cos/sin precomputed on CPU: base+4 = cos(t/2), base+5 = sin(t/2).
+                let c_val = gate_queue_f64[base + 4u];
+                let s_val = gate_queue_f64[base + 5u];
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let partner = i ^ (1u << ctrl) ^ (1u << tgt); // flip both qubit bits
+                    if (i < partner) {
+                        let amp0 = shared_state[i];
+                        let amp1 = shared_state[partner];
+                        shared_state[i] = vec2<f64>(amp0.x * c_val + amp1.y * s_val, amp0.y * c_val - amp1.x * s_val);
+                        shared_state[partner] = vec2<f64>(amp1.x * c_val + amp0.y * s_val, amp1.y * c_val - amp0.x * s_val);
+                    }
+                }
+            }
+            case GATE_RYY: {
+                let c_val = gate_queue_f64[base + 4u];
+                let s_abs = gate_queue_f64[base + 5u];
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let partner = i ^ (1u << ctrl) ^ (1u << tgt);
+                    if (i < partner) {
+                        let bit_a = (i & (1u << ctrl)) != 0u;
+                        let bit_b = (i & (1u << tgt)) != 0u;
+                        let s_val = select(s_abs, -s_abs, bit_a == bit_b); // sign flips when both bits are equal
+                        let amp0 = shared_state[i];
+                        let amp1 = shared_state[partner];
+                        shared_state[i] = vec2<f64>(amp0.x * c_val + amp1.y * s_val, amp0.y * c_val - amp1.x * s_val);
+                        shared_state[partner] = vec2<f64>(amp1.x * c_val + amp0.y * s_val, amp1.y * c_val - amp0.x * s_val);
+                    }
+                }
+            }
+            case GATE_RZZ: {
+                let c_val = gate_queue_f64[base + 4u];
+                let s_abs = gate_queue_f64[base + 5u];
+                for (var i = tid; i < num_amplitudes; i += 256u) {
+                    let q1_set = (i & (1u << ctrl)) != 0u;
+                    let q2_set = (i & (1u << tgt)) != 0u;
+                    let s_val = select(s_abs, -s_abs, q1_set == q2_set); // sign of the sine term follows the bit parity
+                    let amp = shared_state[i];
+                    shared_state[i] = vec2<f64>(amp.x * c_val - amp.y * s_val, amp.x * s_val + amp.y * c_val);
+                }
+            }
+            default: {}
+        }
+        workgroupBarrier(); // every thread finishes gate g before gate g + 1 reads shared_state
+    }
+
+    for (var i = tid; i < num_amplitudes; i += 256u) {
+        state[i] = shared_state[i];
+    }
+}
diff --git a/crates/pecos-gpu-sims/src/prelude.rs b/crates/pecos-gpu-sims/src/prelude.rs
index 9431897c8..2978db5fb 100644
--- a/crates/pecos-gpu-sims/src/prelude.rs
+++ b/crates/pecos-gpu-sims/src/prelude.rs
@@ -14,14 +14,18 @@
 //!
 //! # Example
 //!
+//! `GpuStateVec` is the f64 backend; construction returns an error on adapters
+//! without `SHADER_F64`. The doctest skips cleanly in that case. See the
+//! crate-level docs for platform notes.
+//!
 //! ```
 //! use pecos_gpu_sims::prelude::*;
 //!
-//! let mut sim = GpuStateVec::new(4).unwrap();
+//! let Ok(mut sim) = GpuStateVec::new(4) else { return };
 //! sim.h(&qid(0));
 //! sim.cx(&[(QubitId(0), QubitId(1))]);
 //! ```
 
-pub use crate::{GpuError, GpuStateVec};
+pub use crate::{GpuError, GpuStateVec, GpuStateVec32, GpuStateVec64};
 pub use pecos_core::{QubitId, qid};
 pub use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable};
diff --git a/crates/pecos-gpu-sims/src/shaders.wgsl b/crates/pecos-gpu-sims/src/shaders.wgsl
index 0c841130f..96a734d18 100644
--- a/crates/pecos-gpu-sims/src/shaders.wgsl
+++ b/crates/pecos-gpu-sims/src/shaders.wgsl
@@ -47,6 +47,27 @@ fn cadd(a: vec2<f32>, b: vec2<f32>) -> vec2<f32> {
     return a + b;
 }
 
+// Diagonal single-qubit gate diag(a, d), with a = matrix_row0.xy and
+// d = matrix_row1.zw. One thread per amplitude: the target-qubit bit of the
+// thread's index picks which diagonal entry scales that amplitude.
+@compute @workgroup_size(256)
+fn apply_diagonal_gate(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let i = get_linear_idx(global_id, num_workgroups);
+    if (i >= (1u << params.num_qubits)) {
+        return; // surplus thread beyond the last amplitude
+    }
+
+    // select(f, t, cond) yields t when cond is true, so a set bit picks d.
+    let target_bit_set = ((i >> params.target_qubit) & 1u) == 1u;
+    let factor = select(params.matrix_row0.xy, params.matrix_row1.zw, target_bit_set);
+    let current = state[i];
+
+    state[i] = cmul(factor, current);
+}
+
 // Apply arbitrary single-qubit gate
 // Each thread handles one pair of amplitudes that differ in the target qubit bit
 @compute @workgroup_size(256)
@@ -146,6 +167,150 @@ fn apply_cz(
     }
 }
 
+// Controlled-Y: on the control=1 subspace, (t0, t1) -> (-i * t1, i * t0).
+@compute @workgroup_size(256)
+fn apply_cy(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let i = get_linear_idx(global_id, num_workgroups);
+    if (i >= (1u << params.num_qubits)) {
+        return; // surplus thread beyond the last amplitude
+    }
+
+    let ctrl_bit = 1u << params.control_qubit;
+    let tgt_bit = 1u << params.target_qubit;
+
+    // Only the thread at control=1, target=0 acts, so each pair is updated
+    // by exactly one thread.
+    if ((i & ctrl_bit) == 0u || (i & tgt_bit) != 0u) {
+        return;
+    }
+
+    let j = i | tgt_bit;
+    let lower = state[i];
+    let upper = state[j];
+
+    // Multiplying (re, im) by -i gives (im, -re); by +i gives (-im, re).
+    state[i] = vec2<f32>(upper.y, -upper.x);
+    state[j] = vec2<f32>(-lower.y, lower.x);
+}
+
+// SWAP: exchange amplitudes between basis states with bits (a, b) = (0, 1) and (1, 0).
+@compute @workgroup_size(256)
+fn apply_swap(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let i = get_linear_idx(global_id, num_workgroups);
+    if (i >= (1u << params.num_qubits)) {
+        return; // surplus thread beyond the last amplitude
+    }
+
+    let bit_of_a = 1u << params.control_qubit;
+    let bit_of_b = 1u << params.target_qubit;
+
+    // Only the thread at a=0, b=1 acts, so each pair is exchanged once.
+    let a_clear = (i & bit_of_a) == 0u;
+    let b_set = (i & bit_of_b) != 0u;
+    if (!(a_clear && b_set)) {
+        return;
+    }
+
+    // Partner index: clear b, set a.
+    let j = (i & ~bit_of_b) | bit_of_a;
+    let held = state[i];
+    state[i] = state[j];
+    state[j] = held;
+}
+
+// Apply RXX(theta) gate: exp(-i * theta/2 * X⊗X)
+// Pairs amplitudes that differ in BOTH qubit bits.
+// Each pair transforms as [[cos(t/2), -i*sin(t/2)], [-i*sin(t/2), cos(t/2)]]
+// Qubits come from params.control_qubit / params.target_qubit; theta from matrix_row0.x
+@compute @workgroup_size(256)
+fn apply_rxx(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return; // surplus thread beyond the last amplitude
+    }
+
+    let mask_a = 1u << params.control_qubit;
+    let mask_b = 1u << params.target_qubit;
+
+    // RXX couples every (idx, partner) pair with -i*sin. One thread per pair
+    // (idx < partner) covers every pair exactly once and avoids racing writes
+    // to state[idx] / state[partner].
+    let partner = idx ^ mask_a ^ mask_b; // flip both qubit bits
+    if (idx < partner) {
+        let theta = params.matrix_row0.x;
+        let c = cos(theta / 2.0);
+        let s = sin(theta / 2.0);
+
+        let amp0 = state[idx]; // read both amplitudes before either is overwritten
+        let amp1 = state[partner];
+
+        state[idx] = vec2<f32>( // c * amp0 - i * s * amp1
+            amp0.x * c + amp1.y * s,
+            amp0.y * c - amp1.x * s
+        );
+        state[partner] = vec2<f32>( // c * amp1 - i * s * amp0
+            amp1.x * c + amp0.y * s,
+            amp1.y * c - amp0.x * s
+        );
+    }
+}
+
+// Apply RYY(theta) gate: exp(-i * theta/2 * Y⊗Y)
+// Same pairing as RXX (flip both bits), different rotation matrix.
+// Each pair: [[cos(t/2), i*sin(t/2)], [i*sin(t/2), cos(t/2)]] for same-parity,
+//            [[cos(t/2), -i*sin(t/2)], [-i*sin(t/2), cos(t/2)]] for diff-parity.
+// Qubits come from params.control_qubit / params.target_qubit; theta from matrix_row0.x
+@compute @workgroup_size(256)
+fn apply_ryy(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return; // surplus thread beyond the last amplitude
+    }
+
+    let mask_a = 1u << params.control_qubit;
+    let mask_b = 1u << params.target_qubit;
+
+    // RYY couples every pair with +i*sin on same-parity (|00>,|11>) and
+    // -i*sin on diff-parity (|01>,|10>). One thread per pair (idx < partner).
+    let bit_a = (idx & mask_a) != 0u;
+    let bit_b = (idx & mask_b) != 0u;
+    let partner = idx ^ mask_a ^ mask_b; // both bits flip, so partner has the same parity as idx
+    if (idx < partner) {
+        let theta = params.matrix_row0.x;
+        let c = cos(theta / 2.0);
+        let s_abs = sin(theta / 2.0);
+        let s = select(s_abs, -s_abs, bit_a == bit_b); // negated for same-parity pairs
+
+        let amp0 = state[idx]; // read both amplitudes before either is overwritten
+        let amp1 = state[partner];
+
+        state[idx] = vec2<f32>( // c * amp0 - i * s * amp1
+            amp0.x * c + amp1.y * s,
+            amp0.y * c - amp1.x * s
+        );
+        state[partner] = vec2<f32>( // c * amp1 - i * s * amp0
+            amp1.x * c + amp0.y * s,
+            amp1.y * c - amp0.x * s
+        );
+    }
+}
+
 // Apply RZZ(theta) gate: exp(-i * theta/2 * Z⊗Z)
 // Phase depends on parity of the two qubits:
 // |00⟩ → e^{-iθ/2} |00⟩  (same parity: negative phase)
diff --git a/crates/pecos-gpu-sims/src/shaders_f64.wgsl b/crates/pecos-gpu-sims/src/shaders_f64.wgsl
new file mode 100644
index 000000000..6a2d9eb18
--- /dev/null
+++ b/crates/pecos-gpu-sims/src/shaders_f64.wgsl
@@ -0,0 +1,379 @@
+// State vector quantum simulation shaders (f64 precision)
+//
+// State vector layout: array of vec2<f64> where .x = real, .y = imaginary
+// For n qubits, we have 2^n amplitudes.
+//
+// Requires SHADER_F64 feature (Vulkan shaderFloat64 capability).
+
+// Shared state vector buffer (read-write)
+@group(0) @binding(0)
+var<storage, read_write> state: array<vec2<f64>>;
+
+// Gate parameters
+struct GateParams {
+    target_qubit: u32,
+    control_qubit: u32,
+    num_qubits: u32,
+    _padding: u32,
+    // 2x2 gate matrix stored as 8 f64 values:
+    // a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im
+    a_re: f64,
+    a_im: f64,
+    b_re: f64,
+    b_im: f64,
+    c_re: f64,
+    c_im: f64,
+    d_re: f64,
+    d_im: f64,
+}
+
+@group(0) @binding(1)
+var<uniform> params: GateParams;
+
+const WORKGROUP_SIZE: u32 = 256u;
+
+fn get_linear_idx(global_id: vec3<u32>, num_workgroups: vec3<u32>) -> u32 {
+    let threads_per_y_row = num_workgroups.x * WORKGROUP_SIZE;
+    return global_id.y * threads_per_y_row + global_id.x;
+}
+
+// Complex multiplication: (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
+fn cmul(a: vec2<f64>, b: vec2<f64>) -> vec2<f64> {
+    return vec2<f64>(
+        a.x * b.x - a.y * b.y,
+        a.x * b.y + a.y * b.x
+    );
+}
+
+// Apply diagonal single-qubit gate: [[a, 0], [0, d]]
+// Each thread handles ONE amplitude (not a pair), applying the appropriate
+// diagonal element based on the qubit bit. Fully coalesced memory access.
+@compute @workgroup_size(256)
+fn apply_diagonal_gate(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    // Select diagonal element based on target qubit bit
+    let bit = (idx >> params.target_qubit) & 1u;
+    let phase_re = select(params.a_re, params.d_re, bit == 1u);
+    let phase_im = select(params.a_im, params.d_im, bit == 1u);
+    let phase = vec2<f64>(phase_re, phase_im);
+
+    state[idx] = cmul(phase, state[idx]);
+}
+
+// Apply arbitrary single-qubit gate
+@compute @workgroup_size(256)
+fn apply_single_gate(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let pair_idx = get_linear_idx(global_id, num_workgroups);
+    let num_pairs = 1u << (params.num_qubits - 1u);
+
+    if (pair_idx >= num_pairs) {
+        return;
+    }
+
+    let low_mask = (1u << params.target_qubit) - 1u;
+    let high_bits = pair_idx >> params.target_qubit;
+    let low_bits = pair_idx & low_mask;
+
+    let idx0 = (high_bits << (params.target_qubit + 1u)) | low_bits;
+    let idx1 = idx0 | (1u << params.target_qubit);
+
+    let amp0 = state[idx0];
+    let amp1 = state[idx1];
+
+    let a = vec2<f64>(params.a_re, params.a_im);
+    let b = vec2<f64>(params.b_re, params.b_im);
+    let c = vec2<f64>(params.c_re, params.c_im);
+    let d = vec2<f64>(params.d_re, params.d_im);
+
+    let new_amp0 = cmul(a, amp0) + cmul(b, amp1);
+    let new_amp1 = cmul(c, amp0) + cmul(d, amp1);
+
+    state[idx0] = new_amp0;
+    state[idx1] = new_amp1;
+}
+
+// Apply CNOT (CX) gate
+@compute @workgroup_size(256)
+fn apply_cx(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    let control_mask = 1u << params.control_qubit;
+    let target_mask = 1u << params.target_qubit;
+
+    let control_set = (idx & control_mask) != 0u;
+    let target_set = (idx & target_mask) != 0u;
+
+    if (control_set && !target_set) {
+        let partner_idx = idx | target_mask;
+        let amp0 = state[idx];
+        let amp1 = state[partner_idx];
+        state[idx] = amp1;
+        state[partner_idx] = amp0;
+    }
+}
+
+// Apply CZ gate
+@compute @workgroup_size(256)
+fn apply_cz(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    let control_mask = 1u << params.control_qubit;
+    let target_mask = 1u << params.target_qubit;
+
+    if ((idx & control_mask) != 0u && (idx & target_mask) != 0u) {
+        state[idx] = -state[idx];
+    }
+}
+
+// Apply CY gate: controlled-Y
+// When control is |1> and target is |0>, swap and apply phase
+// CY|c,t> = |c> (Y|t>) when c=1, else |c,t>
+// Y = [[0, -i], [i, 0]]
+@compute @workgroup_size(256)
+fn apply_cy(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    let control_mask = 1u << params.control_qubit;
+    let target_mask = 1u << params.target_qubit;
+
+    let control_set = (idx & control_mask) != 0u;
+    let target_set = (idx & target_mask) != 0u;
+
+    // Process pairs once: when control=1 and target=0
+    if (control_set && !target_set) {
+        let partner_idx = idx | target_mask;
+        let amp0 = state[idx];         // |...0...> (target=0)
+        let amp1 = state[partner_idx]; // |...1...> (target=1)
+
+        // Y|0> = i|1>, Y|1> = -i|0>
+        // new amp0 = -i * amp1 = (amp1.y, -amp1.x)
+        // new amp1 = i * amp0 = (-amp0.y, amp0.x)
+        state[idx] = vec2<f64>(amp1.y, -amp1.x);
+        state[partner_idx] = vec2<f64>(-amp0.y, amp0.x);
+    }
+}
+
+// Apply SWAP gate: exchange amplitudes between two qubits
+@compute @workgroup_size(256)
+fn apply_swap(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    let mask_a = 1u << params.control_qubit;
+    let mask_b = 1u << params.target_qubit;
+
+    let bit_a = (idx & mask_a) != 0u;
+    let bit_b = (idx & mask_b) != 0u;
+
+    // Only swap when bits differ: (a=0,b=1) swaps with (a=1,b=0)
+    // Process once: when a=0 and b=1
+    if (!bit_a && bit_b) {
+        let partner = (idx & ~mask_b) | mask_a;
+        let amp0 = state[idx];
+        let amp1 = state[partner];
+        state[idx] = amp1;
+        state[partner] = amp0;
+    }
+}
+
+// Apply RXX(theta) gate: exp(-i * theta/2 * X x X)
+@compute @workgroup_size(256)
+fn apply_rxx(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+    if (idx >= num_amplitudes) { return; }
+
+    let mask_a = 1u << params.control_qubit;
+    let mask_b = 1u << params.target_qubit;
+    // cos(theta/2) and sin(theta/2) are precomputed on the CPU and passed
+    // via (a_re, a_im). wgpu+Vulkan f64 cos/sin is unreliable.
+    let partner = idx ^ mask_a ^ mask_b;
+    if (idx < partner) {
+        let c = params.a_re;
+        let s = params.a_im;
+        let amp0 = state[idx];
+        let amp1 = state[partner];
+        state[idx] = vec2<f64>(amp0.x * c + amp1.y * s, amp0.y * c - amp1.x * s);
+        state[partner] = vec2<f64>(amp1.x * c + amp0.y * s, amp1.y * c - amp0.x * s);
+    }
+}
+
+// Apply RYY(theta) gate: exp(-i * theta/2 * Y x Y)
+@compute @workgroup_size(256)
+fn apply_ryy(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+    if (idx >= num_amplitudes) { return; }
+
+    let mask_a = 1u << params.control_qubit;
+    let mask_b = 1u << params.target_qubit;
+    // RYY acts on all 4 basis states but the coupling sign differs between
+    // the (|00>,|11>) same-parity pair (+i*sin) and the (|01>,|10>)
+    // diff-parity pair (-i*sin).
+    let bit_a = (idx & mask_a) != 0u;
+    let bit_b = (idx & mask_b) != 0u;
+    let partner = idx ^ mask_a ^ mask_b;
+    if (idx < partner) {
+        let c = params.a_re;
+        let s_abs = params.a_im;
+        let s = select(s_abs, -s_abs, bit_a == bit_b);
+        let amp0 = state[idx];
+        let amp1 = state[partner];
+        state[idx] = vec2<f64>(amp0.x * c + amp1.y * s, amp0.y * c - amp1.x * s);
+        state[partner] = vec2<f64>(amp1.x * c + amp0.y * s, amp1.y * c - amp0.x * s);
+    }
+}
+
+// Apply RZZ(theta) gate: exp(-i * theta/2 * Z x Z), a diagonal phase per amplitude.
+// theta itself is not passed: a_re = cos(theta/2), a_im = sin(theta/2).
+@compute @workgroup_size(256)
+fn apply_rzz(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    let q1_mask = 1u << params.control_qubit;
+    let q2_mask = 1u << params.target_qubit;
+
+    let q1_set = (idx & q1_mask) != 0u;
+    let q2_set = (idx & q2_mask) != 0u;
+
+    // Phase angle is -theta/2 when both bits match, +theta/2 otherwise, so only
+    // the sign of sin changes. cos/sin are precomputed on the CPU (see header).
+    let c = params.a_re;
+    let s_abs = params.a_im;
+    let s = select(s_abs, -s_abs, q1_set == q2_set);
+    let amp = state[idx];
+    state[idx] = vec2<f64>(amp.x * c - amp.y * s, amp.x * s + amp.y * c);
+}
+
+// Collapse state after measurement
+struct MeasureParams {
+    target_qubit: u32,
+    outcome: u32,
+    norm_factor: f64,
+}
+
+@group(0) @binding(3)
+var<uniform> measure_params: MeasureParams;
+
+@compute @workgroup_size(256)
+fn collapse_state(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+
+    if (idx >= num_amplitudes) {
+        return;
+    }
+
+    let target_mask = 1u << measure_params.target_qubit;
+    let qubit_value = select(0u, 1u, (idx & target_mask) != 0u);
+
+    if (qubit_value == measure_params.outcome) {
+        state[idx] = state[idx] * measure_params.norm_factor;
+    } else {
+        state[idx] = vec2<f64>(0.0, 0.0);
+    }
+}
+
+// GPU-side workgroup reduction for marginal probability
+@group(0) @binding(4)
+var<storage, read_write> partial_sums: array<f64>;
+
+var<workgroup> shared_prob: array<f64, 256>;
+
+@compute @workgroup_size(256)
+fn reduce_marginal_probability(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) workgroup_id: vec3<u32>,
+    @builtin(num_workgroups) num_workgroups: vec3<u32>
+) {
+    let idx = get_linear_idx(global_id, num_workgroups);
+    let num_amplitudes = 1u << params.num_qubits;
+    let lid = local_id.x;
+
+    if (idx < num_amplitudes) {
+        let target_mask = 1u << params.target_qubit;
+        if ((idx & target_mask) != 0u) {
+            let amp = state[idx];
+            shared_prob[lid] = amp.x * amp.x + amp.y * amp.y;
+        } else {
+            shared_prob[lid] = 0.0;
+        }
+    } else {
+        shared_prob[lid] = 0.0;
+    }
+
+    workgroupBarrier();
+
+    for (var stride = 128u; stride > 0u; stride >>= 1u) {
+        if (lid < stride) {
+            shared_prob[lid] += shared_prob[lid + stride];
+        }
+        workgroupBarrier();
+    }
+
+    if (lid == 0u) {
+        let wg_idx = workgroup_id.y * num_workgroups.x + workgroup_id.x;
+        partial_sums[wg_idx] = shared_prob[0];
+    }
+}
diff --git a/crates/pecos-gpu-sims/tests/concurrent_gpu_test.rs b/crates/pecos-gpu-sims/tests/concurrent_gpu_test.rs
index 7ebc3719f..f4fb411cb 100644
--- a/crates/pecos-gpu-sims/tests/concurrent_gpu_test.rs
+++ b/crates/pecos-gpu-sims/tests/concurrent_gpu_test.rs
@@ -2,12 +2,19 @@
 //!
 //! These tests verify that multiple GPU simulators can be created, used,
 //! and destroyed concurrently without segfaults or resource leaks.
+//!
+//! With the shared process-wide wgpu context (`gpu_probe::gpu_context`),
+//! multiple sim types share one Device and Queue. The mixed-sim stress test
+//! is the real coverage for the shared-context corner cases (interleaved
+//! `queue.write_buffer` / `queue.submit`, parallel state readback, etc.).
 
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::thread;
 
 use pecos_core::QubitId;
-use pecos_gpu_sims::DefaultGpuStab;
+use pecos_gpu_sims::{
+    DefaultGpuStab, GpuPauliProp, GpuStateVec32, GpuStateVecAuto, gpu_probe::gpu_context,
+};
 use pecos_simulators::CliffordGateable;
 
 #[test]
@@ -26,7 +33,7 @@ fn test_concurrent_gpu_stab_creation_and_destruction() {
                         sim.cx(&[(QubitId(0), QubitId(1))]);
                         let _ = sim.mz(&[QubitId(0)]);
                     }
-                    // sim drops here -- Drop should sync device
+                    // sim drops here
                 }
             });
         }
@@ -55,3 +62,132 @@ fn test_rapid_create_destroy() {
         eprintln!("WARNING: no GPU available -- rapid create/destroy test was a no-op");
     }
 }
+
+/// Mixed simulator types under `thread::scope`. Each thread runs small circuits on sim
+/// types chosen deterministically from `(tid + round) % 4`; Bell runs must show perfect
+/// Z parity. This is the corner case the shared-context fix is designed for: interleaved
+/// `queue.write_buffer` and `queue.submit` from many sim types on one Device.
+#[test]
+fn test_mixed_sim_types_concurrent() {
+    // First check whether the shared context comes up at all -- if not, the
+    // whole test should skip cleanly rather than fail on zero recorded runs.
+    if gpu_context().is_err() {
+        eprintln!("WARNING: no GPU available -- mixed-sim concurrent test was a no-op");
+        return;
+    }
+
+    let bell_parity_failures = AtomicUsize::new(0);
+    let total_runs = AtomicUsize::new(0);
+
+    thread::scope(|s| {
+        for tid in 0u32..8 {
+            let bell_parity_failures = &bell_parity_failures;
+            let total_runs = &total_runs;
+            s.spawn(move || {
+                // Two rounds per thread: kinds tid % 4 and (tid + 1) % 4 (each kind 4x overall).
+                for round in 0..2 {
+                    let kind = (tid + round) % 4;
+                    match kind {
+                        0 => run_stab_bell(tid, total_runs, bell_parity_failures),
+                        1 => run_statevec_auto_bell(tid, total_runs, bell_parity_failures),
+                        2 => run_statevec32_bell(tid, total_runs, bell_parity_failures),
+                        _ => run_pauli_prop_no_op(tid, total_runs),
+                    }
+                }
+            });
+        }
+    });
+
+    assert_eq!(
+        bell_parity_failures.load(Ordering::Relaxed),
+        0,
+        "Bell parity violated across threads -- shared GPU context interleaving bug"
+    );
+
+    let runs = total_runs.load(Ordering::Relaxed);
+    assert!(
+        runs > 0,
+        "Mixed-sim test produced zero runs even though gpu_context() reported success"
+    );
+}
+
+fn run_stab_bell(seed: u32, total: &AtomicUsize, fails: &AtomicUsize) {
+    let Ok(mut stab) = DefaultGpuStab::with_seed(2, u64::from(seed)) else {
+        return;
+    };
+    stab.h(&[QubitId(0)]);
+    stab.cx(&[(QubitId(0), QubitId(1))]);
+    let outcomes = stab.mz(&[QubitId(0), QubitId(1)]);
+    total.fetch_add(1, Ordering::Relaxed);
+    // Bell pair: exactly two results, and both qubits must read the same bit.
+    let parity_ok = outcomes.len() == 2 && outcomes[0].outcome == outcomes[1].outcome;
+    fails.fetch_add(usize::from(!parity_ok), Ordering::Relaxed);
+}
+
+fn run_statevec_auto_bell(seed: u32, total: &AtomicUsize, fails: &AtomicUsize) {
+    let _ = seed; // Unused: `GpuStateVecAuto::new` takes no seed; parity needs none.
+    let Ok(mut sv) = GpuStateVecAuto::new(2) else {
+        return;
+    };
+    sv.h(&[QubitId(0)]);
+    sv.cx(&[(QubitId(0), QubitId(1))]);
+    let outcomes = sv.mz(&[QubitId(0), QubitId(1)]);
+    total.fetch_add(1, Ordering::Relaxed);
+    // Bell pair: exactly two results, and both qubits must read the same bit.
+    let parity_ok = outcomes.len() == 2 && outcomes[0].outcome == outcomes[1].outcome;
+    fails.fetch_add(usize::from(!parity_ok), Ordering::Relaxed);
+}
+
+fn run_statevec32_bell(seed: u32, total: &AtomicUsize, fails: &AtomicUsize) {
+    let _ = seed; // Unused: `GpuStateVec32::new` is called without a seed here.
+    let Ok(mut sv) = GpuStateVec32::new(2) else {
+        return;
+    };
+    sv.h(&[QubitId(0)]);
+    sv.cx(&[(QubitId(0), QubitId(1))]);
+    let outcomes = sv.mz(&[QubitId(0), QubitId(1)]);
+    total.fetch_add(1, Ordering::Relaxed);
+    // Bell pair: exactly two results, and both qubits must read the same bit.
+    let parity_ok = outcomes.len() == 2 && outcomes[0].outcome == outcomes[1].outcome;
+    fails.fetch_add(usize::from(!parity_ok), Ordering::Relaxed);
+}
+
+/// `GpuPauliProp` doesn't have a Bell-state notion; just exercise create + flush
+/// on the shared device alongside the other sims to stress the queue.
+fn run_pauli_prop_no_op(seed: u32, total: &AtomicUsize) {
+    let Ok(mut prop) = GpuPauliProp::with_seed(4, 8, u64::from(seed)) else {
+        return;
+    };
+    prop.inject_x_fault(0);
+    prop.h(&[0]);
+    prop.cx(&[(0, 1)]);
+    prop.sync();
+    total.fetch_add(1, Ordering::Relaxed);
+}
+
+/// Memoization smoke test: repeated `gpu_context()` calls must agree (same adapter
+/// info, or the same error). NOTE(review): this cannot by itself detect a silent
+/// re-initialization on the same adapter, e.g. if the `OnceLock` were removed.
+#[test]
+fn test_gpu_context_is_memoized() {
+    let first = gpu_context();
+    let second = gpu_context();
+
+    match (first, second) {
+        (Ok(a), Ok(b)) => {
+            // Ideally we would compare the underlying device handles, but there
+            // is no public "are these the same device" API here, so adapter
+            // info (name, backend, device_type) is used as a proxy. The proxy
+            // is weak: a second, independently created context on the same
+            // adapter would pass these assertions too.
+            assert_eq!(a.info.name, b.info.name);
+            assert_eq!(a.info.backend, b.info.backend);
+            assert_eq!(a.info.device_type, b.info.device_type);
+        }
+        (Err(a), Err(b)) => {
+            // Errors must be memoized too: same Display text on both calls.
+            assert_eq!(format!("{a}"), format!("{b}"), "memoized error must match");
+        }
+        _ => panic!("gpu_context() returned different success/failure on repeat calls"),
+    }
+}
diff --git a/crates/pecos-gpu-sims/tests/extra_audits.rs b/crates/pecos-gpu-sims/tests/extra_audits.rs
new file mode 100644
index 000000000..18583e145
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/extra_audits.rs
@@ -0,0 +1,443 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Supplementary audits covering gaps surfaced during the review:
+//!
+//! - `GpuStabMulti` shot-consistency on deterministic circuits (all shots agree
+//!   and match the CPU `SparseStab` single-shot result).
+//! - `GpuDensityMatrix32` measurement statistics vs the analytically expected
+//!   distribution over many trials.
+//! - Gate fusion replay: per-gate dispatch vs normal fused batching must agree
+//!   within a small numerical tolerance, plus one fused-path check against the
+//!   CPU `StateVecSoA` reference.
+
+use pecos_core::{Angle64, QubitId};
+use pecos_gpu_sims::{GpuDensityMatrix32, GpuStabMulti, GpuStateVec32, GpuStateVec64};
+use pecos_simulators::{
+    ArbitraryRotationGateable, CliffordGateable, QuantumSimulator, SparseStab, StateVecSoA,
+};
+use rand::rngs::StdRng;
+use rand::{RngExt, SeedableRng};
+
+// =============================================================================
+// #2: GpuStabMulti shot-consistency on deterministic Clifford circuits
+// =============================================================================
+
+/// Build a circuit whose Z-basis measurements are deterministic (no
+/// superposition on the measured qubits), then run it through `GpuStabMulti`
+/// with many shots and CPU `SparseStab` once. Every GPU shot must match the
+/// single CPU outcome.
+fn check_deterministic_multi<F>(label: &str, n: usize, shots: usize, build: F)
+where
+    F: Fn(&mut dyn FnMut(usize, GateOp)),
+{
+    // Collect the gate sequence.
+    let mut gates: Vec<(usize, GateOp)> = Vec::new();
+    build(&mut |q, g| gates.push((q, g)));
+
+    // CPU reference.
+    let mut cpu = SparseStab::new(n);
+    for (_idx, op) in &gates {
+        apply_cpu_clifford(&mut cpu, *op);
+    }
+    let cpu_results: Vec<bool> = (0..n).map(|q| cpu.mz(&[QubitId(q)])[0].outcome).collect();
+
+    // GPU multi-shot.
+    let Ok(mut gpu) = GpuStabMulti::<pecos_random::PecosRng>::with_seed(n, shots, 42) else {
+        return;
+    };
+    for (_idx, op) in &gates {
+        apply_multi_clifford(&mut gpu, *op);
+    }
+    let gpu_results: Vec<Vec<bool>> = gpu.mz(&(0..n).map(QubitId).collect::<Vec<_>>());
+
+    assert_eq!(gpu_results.len(), shots, "{label}: shot count");
+    for (shot, row) in gpu_results.iter().enumerate() {
+        assert_eq!(row, &cpu_results, "{label}: shot {shot} disagrees with CPU");
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+enum GateOp {
+    X(usize),
+    H(usize),
+    Cx(usize, usize),
+    Cz(usize, usize),
+}
+
+fn apply_cpu_clifford(sim: &mut SparseStab, op: GateOp) {
+    match op {
+        GateOp::X(q) => {
+            sim.x(&[QubitId(q)]);
+        }
+        GateOp::H(q) => {
+            sim.h(&[QubitId(q)]);
+        }
+        GateOp::Cx(a, b) => {
+            sim.cx(&[(QubitId(a), QubitId(b))]);
+        }
+        GateOp::Cz(a, b) => {
+            sim.cz(&[(QubitId(a), QubitId(b))]);
+        }
+    }
+}
+
+fn apply_multi_clifford(sim: &mut GpuStabMulti, op: GateOp) {
+    match op {
+        GateOp::X(q) => {
+            sim.x(&[QubitId(q)]);
+        }
+        GateOp::H(q) => {
+            sim.h(&[QubitId(q)]);
+        }
+        GateOp::Cx(a, b) => {
+            sim.cx(&[(QubitId(a), QubitId(b))]);
+        }
+        GateOp::Cz(a, b) => {
+            sim.cz(&[(QubitId(a), QubitId(b))]);
+        }
+    }
+}
+
+#[test]
+fn stab_multi_all_zero_basis() {
+    // No gates: all qubits in |0>, every shot must read 0s.
+    check_deterministic_multi("all |0>", 5, 128, |_| {});
+}
+
+#[test]
+fn stab_multi_x_prep() {
+    // Apply X to qubits 1 and 3: expect 01010 reading.
+    check_deterministic_multi("X prep", 5, 128, |emit| {
+        emit(0, GateOp::X(1));
+        emit(1, GateOp::X(3));
+    });
+}
+
+#[test]
+fn stab_multi_cx_chain() {
+    // X(0) then CX(0,1), ..., CX(3,4): the flip propagates, so all five qubits read 1 (|11111>).
+    check_deterministic_multi("CX chain", 5, 128, |emit| {
+        emit(0, GateOp::X(0));
+        for q in 0..4 {
+            emit(q, GateOp::Cx(q, q + 1));
+        }
+    });
+}
+
+#[test]
+fn stab_multi_cz_noop_on_zeros() {
+    // CZ on |00..0> is identity: every shot still reads 0s.
+    check_deterministic_multi("CZ on zeros", 5, 128, |emit| {
+        for q in 0..4 {
+            emit(q, GateOp::Cz(q, q + 1));
+        }
+    });
+}
+
+#[test]
+fn stab_multi_h_h_identity() {
+    // H H is identity: |0> -> |+> -> |0>, deterministic.
+    check_deterministic_multi("H H identity", 4, 64, |emit| {
+        for q in 0..4 {
+            emit(q, GateOp::H(q));
+            emit(q, GateOp::H(q));
+        }
+    });
+}
+
+// =============================================================================
+// #3: Density matrix mz distribution vs analytic expectation
+// =============================================================================
+
+/// For |+> state: P(0) = P(1) = 0.5. Running GPU DM mz over many freshly prepared
+/// trials should produce ~50/50.
+#[test]
+fn dm_mz_plus_state_distribution() {
+    let Ok(mut gpu) = GpuDensityMatrix32::with_seed(1, 123) else {
+        return;
+    };
+    gpu.h(&[QubitId(0)]);
+
+    let trials = 400;
+    let mut ones = 0usize;
+    for _ in 0..trials {
+        // Fresh prep each trial since mz collapses.
+        gpu.reset();
+        gpu.h(&[QubitId(0)]);
+        let res = gpu.mz(&[QubitId(0)]);
+        if res[0].outcome {
+            ones += 1;
+        }
+    }
+
+    // Binomial(400, 0.5): sigma = 10 shots; the 0.1 window is |ones - 200| < 40, i.e. 4 sigma.
+    #[allow(clippy::cast_precision_loss)] // ones <= trials = 400, exact in f64
+    let p = ones as f64 / f64::from(trials);
+    assert!(
+        (p - 0.5).abs() < 0.1,
+        "|+> measurement: got P(1) = {p} from {ones}/{trials}"
+    );
+}
+
+/// Bell state: measurements should be perfectly correlated across shots.
+#[test]
+fn dm_mz_bell_state_correlation() {
+    let Ok(mut gpu) = GpuDensityMatrix32::with_seed(2, 321) else {
+        return;
+    };
+
+    let trials = 200;
+    let mut correlated = 0usize;
+    for _ in 0..trials {
+        gpu.reset();
+        gpu.h(&[QubitId(0)]).cx(&[(QubitId(0), QubitId(1))]);
+        let res = gpu.mz(&[QubitId(0), QubitId(1)]);
+        if res[0].outcome == res[1].outcome {
+            correlated += 1;
+        }
+    }
+
+    assert_eq!(correlated, trials, "Bell state: correlation must be exact");
+}
+
+// =============================================================================
+// #4: Gate fusion replay -- per-gate dispatch vs normal fused batching
+// =============================================================================
+
+/// Apply a long sequence of gates two ways: once normally (fusion groups them),
+/// and once with a forced readback after every gate (prevents fusion across
+/// operations). Both outputs must agree.
+#[test]
+fn fusion_replay_f32() {
+    let n: u32 = 10;
+    let n_us = usize::try_from(n).unwrap();
+    let mut rng = StdRng::seed_from_u64(777);
+    let seq: Vec<Instr> = (0..60).map(|_| gen_instr(&mut rng, n_us)).collect();
+
+    let Ok(mut fused) = GpuStateVec32::new(n) else {
+        return;
+    };
+    let qubits: Vec<QubitId> = (0..n_us).map(QubitId).collect();
+    fused.h(&qubits);
+    for instr in &seq {
+        apply_instr_f32(&mut fused, *instr);
+    }
+    let fused_state: Vec<[f32; 2]> = fused.state();
+
+    let Ok(mut unfused) = GpuStateVec32::new(n) else {
+        return;
+    };
+    unfused.h(&qubits);
+    let _ = unfused.state(); // force H flush
+    for instr in &seq {
+        apply_instr_f32(&mut unfused, *instr);
+        let _ = unfused.state(); // force per-gate flush (no fusion across the boundary)
+    }
+    let unfused_state = unfused.state();
+
+    assert_eq!(fused_state.len(), unfused_state.len());
+    let tol: f32 = 5e-3;
+    let mut max_diff = 0.0f32;
+    for ([fr, fi], [ur, ui]) in fused_state.iter().zip(unfused_state.iter()) {
+        let dr = fr - ur;
+        let di = fi - ui;
+        let d = (dr * dr + di * di).sqrt();
+        if d > max_diff {
+            max_diff = d;
+        }
+    }
+    assert!(
+        max_diff < tol,
+        "f32 fused vs per-gate dispatch diverged: max_diff = {max_diff:.3e}"
+    );
+}
+
+#[test]
+fn fusion_replay_f64() {
+    let n: u32 = 10;
+    let n_us = usize::try_from(n).unwrap();
+    let mut rng = StdRng::seed_from_u64(888);
+    let seq: Vec<Instr> = (0..60).map(|_| gen_instr(&mut rng, n_us)).collect();
+
+    let Ok(mut fused) = GpuStateVec64::new(n) else {
+        return;
+    };
+    let qubits: Vec<QubitId> = (0..n_us).map(QubitId).collect();
+    fused.h(&qubits);
+    for instr in &seq {
+        apply_instr_f64(&mut fused, *instr);
+    }
+    let fused_state = fused.state();
+
+    let Ok(mut unfused) = GpuStateVec64::new(n) else {
+        return;
+    };
+    unfused.h(&qubits);
+    let _ = unfused.state();
+    for instr in &seq {
+        apply_instr_f64(&mut unfused, *instr);
+        let _ = unfused.state();
+    }
+    let unfused_state = unfused.state();
+
+    assert_eq!(fused_state.len(), unfused_state.len());
+    let tol: f64 = 1e-5;
+    let mut max_diff = 0.0f64;
+    for ([fr, fi], [ur, ui]) in fused_state.iter().zip(unfused_state.iter()) {
+        let dr = fr - ur;
+        let di = fi - ui;
+        let d = (dr * dr + di * di).sqrt();
+        if d > max_diff {
+            max_diff = d;
+        }
+    }
+    assert!(
+        max_diff < tol,
+        "f64 fused vs per-gate dispatch diverged: max_diff = {max_diff:.3e}"
+    );
+}
+
+#[derive(Clone, Copy)]
+enum Instr {
+    H(usize),
+    Rz(usize, f64),
+    Cx(usize, usize),
+    Cz(usize, usize),
+    Rzz(usize, usize, f64),
+}
+
+fn gen_instr(rng: &mut StdRng, n: usize) -> Instr {
+    match rng.random_range(0u32..5) {
+        0 => Instr::H(rng.random_range(0..n)),
+        1 => Instr::Rz(rng.random_range(0..n), rng.random_range(-3.0..3.0)),
+        2 => {
+            let a = rng.random_range(0..n);
+            let mut b = rng.random_range(0..n);
+            while b == a {
+                b = rng.random_range(0..n);
+            }
+            Instr::Cx(a, b)
+        }
+        3 => {
+            let a = rng.random_range(0..n);
+            let mut b = rng.random_range(0..n);
+            while b == a {
+                b = rng.random_range(0..n);
+            }
+            Instr::Cz(a, b)
+        }
+        _ => {
+            let a = rng.random_range(0..n);
+            let mut b = rng.random_range(0..n);
+            while b == a {
+                b = rng.random_range(0..n);
+            }
+            Instr::Rzz(a, b, rng.random_range(-3.0..3.0))
+        }
+    }
+}
+
+fn apply_instr_f32(sim: &mut GpuStateVec32, instr: Instr) {
+    match instr {
+        Instr::H(q) => {
+            sim.h(&[QubitId(q)]);
+        }
+        Instr::Rz(q, t) => {
+            sim.rz(Angle64::from_radians(t), &[QubitId(q)]);
+        }
+        Instr::Cx(a, b) => {
+            sim.cx(&[(QubitId(a), QubitId(b))]);
+        }
+        Instr::Cz(a, b) => {
+            sim.cz(&[(QubitId(a), QubitId(b))]);
+        }
+        Instr::Rzz(a, b, t) => {
+            sim.rzz(Angle64::from_radians(t), &[(QubitId(a), QubitId(b))]);
+        }
+    }
+}
+
+fn apply_instr_f64(sim: &mut GpuStateVec64, instr: Instr) {
+    match instr {
+        Instr::H(q) => {
+            sim.h(&[QubitId(q)]);
+        }
+        Instr::Rz(q, t) => {
+            sim.rz(Angle64::from_radians(t), &[QubitId(q)]);
+        }
+        Instr::Cx(a, b) => {
+            sim.cx(&[(QubitId(a), QubitId(b))]);
+        }
+        Instr::Cz(a, b) => {
+            sim.cz(&[(QubitId(a), QubitId(b))]);
+        }
+        Instr::Rzz(a, b, t) => {
+            sim.rzz(Angle64::from_radians(t), &[(QubitId(a), QubitId(b))]);
+        }
+    }
+}
+
+// =============================================================================
+// Sanity: the fusion replay tests compare the per-gate and fused paths only
+// against each other. Check the fused path once against the CPU `StateVecSoA`
+// reference (small N) so the shared result is anchored to ground truth.
+// =============================================================================
+
+#[test]
+fn fusion_replay_matches_cpu() {
+    let n: usize = 6;
+    let mut rng = StdRng::seed_from_u64(999);
+    let seq: Vec<Instr> = (0..40).map(|_| gen_instr(&mut rng, n)).collect();
+
+    let mut cpu = StateVecSoA::new(n);
+    let qubits: Vec<QubitId> = (0..n).map(QubitId).collect();
+    cpu.h(&qubits);
+    for instr in &seq {
+        match *instr {
+            Instr::H(q) => {
+                cpu.h(&[QubitId(q)]);
+            }
+            Instr::Rz(q, t) => {
+                cpu.rz(Angle64::from_radians(t), &[QubitId(q)]);
+            }
+            Instr::Cx(a, b) => {
+                cpu.cx(&[(QubitId(a), QubitId(b))]);
+            }
+            Instr::Cz(a, b) => {
+                cpu.cz(&[(QubitId(a), QubitId(b))]);
+            }
+            Instr::Rzz(a, b, t) => {
+                cpu.rzz(Angle64::from_radians(t), &[(QubitId(a), QubitId(b))]);
+            }
+        }
+    }
+    let cpu_state: Vec<[f64; 2]> = cpu.state().into_iter().map(|c| [c.re, c.im]).collect();
+
+    let Ok(mut gpu) = GpuStateVec64::new(u32::try_from(n).expect("test N fits in u32")) else {
+        return;
+    };
+    gpu.h(&qubits);
+    for instr in &seq {
+        apply_instr_f64(&mut gpu, *instr);
+    }
+    let gpu_state = gpu.state();
+    assert_eq!(gpu_state.len(), cpu_state.len(), "GPU vs CPU state length");
+    let mut max_diff = 0.0f64;
+    for ([gr, gi], [cr, ci]) in gpu_state.iter().zip(cpu_state.iter()) {
+        let dr = gr - cr;
+        let di = gi - ci;
+        let d = (dr * dr + di * di).sqrt();
+        if d > max_diff {
+            max_diff = d;
+        }
+    }
+    assert!(
+        max_diff < 1e-5,
+        "GPU (fused) vs CPU ground truth diverged: max_diff = {max_diff:.3e}"
+    );
+}
diff --git a/crates/pecos-gpu-sims/tests/gate_audit.rs b/crates/pecos-gpu-sims/tests/gate_audit.rs
new file mode 100644
index 000000000..b41f674d6
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/gate_audit.rs
@@ -0,0 +1,264 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Systematic correctness audit: every GPU 2-qubit gate (plus a 1-qubit
+//! regression baseline), on every backend (f32 / f64), on every dispatch path
+//! (persistent / dispatched), cross-checked against the CPU `StateVecSoA` reference.
+//!
+//! Seed state is H^N|0> (uniform |+>^N), which populates every 2-qubit
+//! subspace pair with a nonzero amplitude -- this exposes bugs where a gate
+//! only updates half the basis pairs (e.g. the existing RXX/RYY
+//! bit_a==bit_b-only bug).
+
+use pecos_core::{Angle64, QubitId};
+use pecos_gpu_sims::{GpuStateVec32, GpuStateVec64};
+use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable, StateVecSoA};
+
+const TOL_F32: f64 = 1e-3;
+const TOL_F64: f64 = 1e-5;
+
+// --- CPU reference ---
+
+/// Builds the CPU reference register: `n` qubits with H applied to each one.
+fn cpu_seed(n: usize) -> StateVecSoA {
+    let all_qubits = (0..n).map(QubitId).collect::<Vec<QubitId>>();
+    let mut reference = StateVecSoA::new(n);
+    reference.h(&all_qubits);
+    reference
+}
+
+/// Reads the CPU amplitudes back as `[re, im]` pairs.
+fn cpu_state(sv: &mut StateVecSoA) -> Vec<[f64; 2]> {
+    let mut pairs = Vec::new();
+    for amp in sv.state() {
+        pairs.push([amp.re, amp.im]);
+    }
+    pairs
+}
+
+// --- backend harness ---
+
+/// Runs `apply` on a fresh f32 GPU register seeded with H on every qubit and
+/// returns the amplitudes widened to f64, or `None` when the register cannot
+/// be created.
+fn run_f32<F>(n: u32, apply: F) -> Option<Vec<[f64; 2]>>
+where
+    F: Fn(&mut GpuStateVec32),
+{
+    let Ok(mut sv) = GpuStateVec32::new(n) else {
+        return None;
+    };
+    let width = usize::try_from(n).unwrap();
+    let qubits: Vec<QubitId> = (0..width).map(QubitId).collect();
+    sv.h(&qubits);
+    apply(&mut sv);
+
+    let mut widened = Vec::new();
+    for [re, im] in sv.state() {
+        widened.push([f64::from(re), f64::from(im)]);
+    }
+    Some(widened)
+}
+
+/// Runs `apply` on a fresh f64 GPU register seeded with H on every qubit and
+/// returns its amplitudes, or `None` when the register cannot be created.
+fn run_f64<F>(n: u32, apply: F) -> Option<Vec<[f64; 2]>>
+where
+    F: Fn(&mut GpuStateVec64),
+{
+    let Ok(mut sv) = GpuStateVec64::new(n) else {
+        return None;
+    };
+    let width = usize::try_from(n).unwrap();
+    let qubits: Vec<QubitId> = (0..width).map(QubitId).collect();
+    sv.h(&qubits);
+    apply(&mut sv);
+    let amplitudes = sv.state();
+    Some(amplitudes)
+}
+
+/// Largest per-amplitude Euclidean distance between `gpu` and `cpu`.
+///
+/// Returns `f64::INFINITY` when the comparison cannot be trusted -- the two
+/// slices differ in length, or some distance is NaN -- so that the callers'
+/// `d > tolerance` checks fail instead of passing silently.
+fn diff(gpu: &[[f64; 2]], cpu: &[[f64; 2]]) -> f64 {
+    // `zip` stops at the shorter slice; a truncated read-back must not pass.
+    if gpu.len() != cpu.len() {
+        return f64::INFINITY;
+    }
+    gpu.iter()
+        .zip(cpu.iter())
+        .map(|([gr, gi], [cr, ci])| {
+            let dr = gr - cr;
+            let di = gi - ci;
+            (dr * dr + di * di).sqrt()
+        })
+        // `f64::max` discards a NaN operand, which would hide NaN amplitudes.
+        .fold(0.0_f64, |worst: f64, d: f64| {
+            if d.is_nan() {
+                f64::INFINITY
+            } else {
+                worst.max(d)
+            }
+        })
+}
+
+// --- checks ---
+
+/// One audited gate: a display name plus the same gate application written
+/// once per simulator type (CPU reference, f32 GPU, f64 GPU).
+struct Case {
+    // Gate name used in verdict lines and failure messages.
+    name: &'static str,
+    // Applies the gate to the CPU reference.
+    apply_cpu: fn(&mut StateVecSoA),
+    // Applies the gate to the f32 GPU register.
+    apply_f32: fn(&mut GpuStateVec32),
+    // Applies the gate to the f64 GPU register.
+    apply_f64: fn(&mut GpuStateVec64),
+}
+
+/// Fixed rotation angle (0.37 rad) used by every 2-qubit rotation case.
+fn theta() -> Angle64 {
+    let radians = 0.37;
+    Angle64::from_radians(radians)
+}
+
+// Builds a `Case` that applies the 1-qubit method `$m` to qubit 0 on the CPU
+// reference and on both GPU backends.
+macro_rules! case_1q {
+    ($name:literal, $m:ident) => {{
+        let apply_cpu: fn(&mut StateVecSoA) = |sv| {
+            sv.$m(&[QubitId(0)]);
+        };
+        let apply_f32: fn(&mut GpuStateVec32) = |sv| {
+            sv.$m(&[QubitId(0)]);
+        };
+        let apply_f64: fn(&mut GpuStateVec64) = |sv| {
+            sv.$m(&[QubitId(0)]);
+        };
+        Case {
+            name: $name,
+            apply_cpu,
+            apply_f32,
+            apply_f64,
+        }
+    }};
+}
+
+// Builds a `Case` that applies the 2-qubit method `$m` to the pair (0, 1) on
+// the CPU reference and on both GPU backends.
+macro_rules! case_2q {
+    ($name:literal, $m:ident) => {{
+        let apply_cpu: fn(&mut StateVecSoA) = |sv| {
+            sv.$m(&[(QubitId(0), QubitId(1))]);
+        };
+        let apply_f32: fn(&mut GpuStateVec32) = |sv| {
+            sv.$m(&[(QubitId(0), QubitId(1))]);
+        };
+        let apply_f64: fn(&mut GpuStateVec64) = |sv| {
+            sv.$m(&[(QubitId(0), QubitId(1))]);
+        };
+        Case {
+            name: $name,
+            apply_cpu,
+            apply_f32,
+            apply_f64,
+        }
+    }};
+}
+
+// Builds a `Case` that applies the 2-qubit rotation method `$m` with angle
+// `theta()` to the pair (0, 1) on the CPU reference and on both GPU backends.
+macro_rules! case_2q_rot {
+    ($name:literal, $m:ident) => {{
+        let apply_cpu: fn(&mut StateVecSoA) = |sv| {
+            sv.$m(theta(), &[(QubitId(0), QubitId(1))]);
+        };
+        let apply_f32: fn(&mut GpuStateVec32) = |sv| {
+            sv.$m(theta(), &[(QubitId(0), QubitId(1))]);
+        };
+        let apply_f64: fn(&mut GpuStateVec64) = |sv| {
+            sv.$m(theta(), &[(QubitId(0), QubitId(1))]);
+        };
+        Case {
+            name: $name,
+            apply_cpu,
+            apply_f32,
+            apply_f64,
+        }
+    }};
+}
+
+/// Full audit list, in order: 1-qubit baseline gates, 2-qubit Clifford gates,
+/// then 2-qubit rotations.
+fn all_cases() -> Vec<Case> {
+    // 1q for regression baseline
+    let mut cases = Vec::from([
+        case_1q!("h", h),
+        case_1q!("x", x),
+        case_1q!("y", y),
+        case_1q!("z", z),
+        case_1q!("sx", sx),
+        case_1q!("sxdg", sxdg),
+        case_1q!("sy", sy),
+        case_1q!("sydg", sydg),
+        case_1q!("sz", sz),
+        case_1q!("szdg", szdg),
+    ]);
+    // 2q Clifford
+    cases.extend([
+        case_2q!("cx", cx),
+        case_2q!("cy", cy),
+        case_2q!("cz", cz),
+        case_2q!("swap", swap),
+        case_2q!("szz", szz),
+        case_2q!("szzdg", szzdg),
+        case_2q!("sxx", sxx),
+        case_2q!("sxxdg", sxxdg),
+        case_2q!("syy", syy),
+        case_2q!("syydg", syydg),
+    ]);
+    // 2q rotations
+    cases.extend([
+        case_2q_rot!("rxx", rxx),
+        case_2q_rot!("ryy", ryy),
+        case_2q_rot!("rzz", rzz),
+    ]);
+    cases
+}
+
+/// Prints one verdict line per case and per GPU backend for an `n`-qubit register.
+///
+/// Report-only: an out-of-tolerance case prints a `FAIL` line but nothing is
+/// asserted here, and a backend whose register cannot be created is skipped
+/// without output.
+fn check_backends(label: &str, n: usize, cases: &[Case]) {
+    for case in cases {
+        // CPU reference for this gate.
+        let expected = {
+            let mut reference = cpu_seed(n);
+            (case.apply_cpu)(&mut reference);
+            cpu_state(&mut reference)
+        };
+
+        // f32
+        let f32_diff =
+            run_f32(u32::try_from(n).unwrap(), case.apply_f32).map(|gpu| diff(&gpu, &expected));
+        if let Some(d) = f32_diff {
+            if d > TOL_F32 {
+                println!("FAIL [{label}] f32 {} (N={n}): max_diff={d:.3e}", case.name);
+            } else {
+                println!(" ok  [{label}] f32 {} (N={n}): max_diff={d:.3e}", case.name);
+            }
+        }
+
+        // f64
+        let f64_diff =
+            run_f64(u32::try_from(n).unwrap(), case.apply_f64).map(|gpu| diff(&gpu, &expected));
+        if let Some(d) = f64_diff {
+            if d > TOL_F64 {
+                println!("FAIL [{label}] f64 {} (N={n}): max_diff={d:.3e}", case.name);
+            } else {
+                println!(" ok  [{label}] f64 {} (N={n}): max_diff={d:.3e}", case.name);
+            }
+        }
+    }
+}
+
+// The persistent_max_qubits threshold on typical desktop GPUs is 10..12, so
+// N=4 forces the persistent kernel and N=14 forces the dispatched path.
+
+/// Prints per-gate verdicts for the persistent path (N=4).
+#[test]
+fn audit_persistent_path() {
+    check_backends("persistent", 4, &all_cases());
+}
+
+/// Prints per-gate verdicts for the dispatched path (N=14).
+#[test]
+fn audit_dispatched_path() {
+    check_backends("dispatched", 14, &all_cases());
+}
+
+/// Prints per-gate verdicts for qubit counts straddling the
+/// persistent/dispatched threshold. `persistent_max_qubits` on RTX 4090 is
+/// ~12, so N=11 is still persistent, N=12 sits at the edge, and N=13 is
+/// dispatched.
+#[test]
+fn audit_persistent_dispatched_boundary() {
+    let cases = all_cases();
+    let boundary_sizes = [11usize, 12, 13];
+    for n in boundary_sizes {
+        check_backends("boundary", n, &cases);
+    }
+}
+
+/// Summary assertion over every audited path: collects each case whose GPU
+/// result is out of tolerance against the CPU reference and fails once,
+/// listing all of them. A backend whose register cannot be created is
+/// skipped for that case.
+#[test]
+fn audit_strict() {
+    let sweep = [
+        ("persistent", 4usize),
+        ("boundary-under", 11usize),
+        ("boundary-at", 12usize),
+        ("boundary-over", 13usize),
+        ("dispatched", 14usize),
+    ];
+    let cases = all_cases();
+    let mut failures: Vec<String> = Vec::new();
+
+    for (label, n) in sweep {
+        for case in &cases {
+            let expected = {
+                let mut reference = cpu_seed(n);
+                (case.apply_cpu)(&mut reference);
+                cpu_state(&mut reference)
+            };
+
+            let f32_excess = run_f32(u32::try_from(n).unwrap(), case.apply_f32)
+                .map(|gpu| diff(&gpu, &expected))
+                .filter(|&d| d > TOL_F32);
+            if let Some(d) = f32_excess {
+                failures.push(format!("{label}/f32/{}: diff={d:.3e}", case.name));
+            }
+
+            let f64_excess = run_f64(u32::try_from(n).unwrap(), case.apply_f64)
+                .map(|gpu| diff(&gpu, &expected))
+                .filter(|&d| d > TOL_F64);
+            if let Some(d) = f64_excess {
+                failures.push(format!("{label}/f64/{}: diff={d:.3e}", case.name));
+            }
+        }
+    }
+
+    assert!(
+        failures.is_empty(),
+        "GPU shader correctness failures ({}):\n  {}",
+        failures.len(),
+        failures.join("\n  ")
+    );
+}
diff --git a/crates/pecos-gpu-sims/tests/gate_fuzz.rs b/crates/pecos-gpu-sims/tests/gate_fuzz.rs
new file mode 100644
index 000000000..d6230129c
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/gate_fuzz.rs
@@ -0,0 +1,575 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Randomized correctness fuzz: random circuits of mixed 1q/2q Clifford and
+//! rotation gates on random qubit pairs with random angles, cross-checked
+//! against the CPU `StateVecSoA` reference. Fills coverage gaps that the
+//! single-gate audit doesn't: gate queue fusion/reorder interactions,
+//! different (control, target) pairs, theta edge cases.
+
+use pecos_core::{Angle64, QubitId};
+use pecos_gpu_sims::{GpuStateVec32, GpuStateVec64};
+use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable, StateVecSoA};
+use rand::rngs::StdRng;
+use rand::{RngExt, SeedableRng};
+
+const TOL_F32: f64 = 5e-3;
+const TOL_F64: f64 = 5e-5;
+
+// --- RNG-driven gate emission applied to a generic simulator ---
+
+/// One randomly generated gate. `usize` fields are qubit indices and the
+/// trailing `f64` of the rotation variants is the angle in radians (it is fed
+/// to `Angle64::from_radians` when applied).
+#[derive(Clone, Copy)]
+enum Op {
+    // --- 1-qubit gates ---
+    H(usize),
+    X(usize),
+    Y(usize),
+    Z(usize),
+    // Applied through the simulators' `sz` / `szdg` methods.
+    S(usize),
+    Sdg(usize),
+    Sx(usize),
+    Sxdg(usize),
+    Sy(usize),
+    Sydg(usize),
+    T(usize),
+    Tdg(usize),
+    // --- 1-qubit rotations ---
+    Rx(usize, f64),
+    Ry(usize, f64),
+    Rz(usize, f64),
+    // --- 2-qubit gates ---
+    Cx(usize, usize),
+    Cy(usize, usize),
+    Cz(usize, usize),
+    Swap(usize, usize),
+    Szz(usize, usize),
+    Szzdg(usize, usize),
+    Sxx(usize, usize),
+    Sxxdg(usize, usize),
+    Syy(usize, usize),
+    Syydg(usize, usize),
+    // --- 2-qubit rotations ---
+    Rxx(usize, usize, f64),
+    Ryy(usize, usize, f64),
+    Rzz(usize, usize, f64),
+}
+
+/// Draws two distinct qubit indices from `0..n`; the second index is redrawn
+/// until it differs from the first. Panics if `n < 2`.
+fn pick_two(rng: &mut StdRng, n: usize) -> (usize, usize) {
+    assert!(n >= 2);
+    let first = rng.random_range(0..n);
+    loop {
+        let second = rng.random_range(0..n);
+        if second != first {
+            return (first, second);
+        }
+    }
+}
+
+/// Draws one random `Op` on an `n`-qubit register.
+///
+/// The gate kind is uniform over all 28 `Op` variants. 1-qubit gates pick a
+/// qubit in `0..n`, 2-qubit gates pick a distinct pair via `pick_two`, and
+/// rotation angles are uniform in `-3.3..3.3` radians.
+fn gen_op(rng: &mut StdRng, n: usize) -> Op {
+    // Must equal the number of `Op` variants. The range is half-open, so the
+    // previous bound of 27 only produced kinds 0..=26 and `Rzz` was never
+    // generated.
+    const KIND_COUNT: u32 = 28;
+    let kind = rng.random_range(0u32..KIND_COUNT);
+    match kind {
+        0 => Op::H(rng.random_range(0..n)),
+        1 => Op::X(rng.random_range(0..n)),
+        2 => Op::Y(rng.random_range(0..n)),
+        3 => Op::Z(rng.random_range(0..n)),
+        4 => Op::S(rng.random_range(0..n)),
+        5 => Op::Sdg(rng.random_range(0..n)),
+        6 => Op::Sx(rng.random_range(0..n)),
+        7 => Op::Sxdg(rng.random_range(0..n)),
+        8 => Op::Sy(rng.random_range(0..n)),
+        9 => Op::Sydg(rng.random_range(0..n)),
+        10 => Op::T(rng.random_range(0..n)),
+        11 => Op::Tdg(rng.random_range(0..n)),
+        12 => Op::Rx(rng.random_range(0..n), rng.random_range(-3.3..3.3)),
+        13 => Op::Ry(rng.random_range(0..n), rng.random_range(-3.3..3.3)),
+        14 => Op::Rz(rng.random_range(0..n), rng.random_range(-3.3..3.3)),
+        15 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Cx(a, b)
+        }
+        16 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Cy(a, b)
+        }
+        17 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Cz(a, b)
+        }
+        18 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Swap(a, b)
+        }
+        19 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Szz(a, b)
+        }
+        20 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Szzdg(a, b)
+        }
+        21 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Sxx(a, b)
+        }
+        22 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Sxxdg(a, b)
+        }
+        23 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Syy(a, b)
+        }
+        24 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Syydg(a, b)
+        }
+        25 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Rxx(a, b, rng.random_range(-3.3..3.3))
+        }
+        26 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Ryy(a, b, rng.random_range(-3.3..3.3))
+        }
+        27 => {
+            let (a, b) = pick_two(rng, n);
+            Op::Rzz(a, b, rng.random_range(-3.3..3.3))
+        }
+        // Fails loudly if KIND_COUNT and the arms above ever drift apart.
+        _ => unreachable!("gate kind {kind} is outside 0..{KIND_COUNT}"),
+    }
+}
+
+/// Applies one `Op` to the CPU reference simulator.
+fn apply_cpu(sim: &mut StateVecSoA, op: Op) {
+    // Target builders, so every arm reads as "gate on target(s)".
+    let one = |q: usize| [QubitId(q)];
+    let pair = |a: usize, b: usize| [(QubitId(a), QubitId(b))];
+
+    match op {
+        // 1-qubit gates
+        Op::H(q) => {
+            sim.h(&one(q));
+        }
+        Op::X(q) => {
+            sim.x(&one(q));
+        }
+        Op::Y(q) => {
+            sim.y(&one(q));
+        }
+        Op::Z(q) => {
+            sim.z(&one(q));
+        }
+        Op::S(q) => {
+            sim.sz(&one(q));
+        }
+        Op::Sdg(q) => {
+            sim.szdg(&one(q));
+        }
+        Op::Sx(q) => {
+            sim.sx(&one(q));
+        }
+        Op::Sxdg(q) => {
+            sim.sxdg(&one(q));
+        }
+        Op::Sy(q) => {
+            sim.sy(&one(q));
+        }
+        Op::Sydg(q) => {
+            sim.sydg(&one(q));
+        }
+        Op::T(q) => {
+            sim.t(&one(q));
+        }
+        Op::Tdg(q) => {
+            sim.tdg(&one(q));
+        }
+        // 1-qubit rotations
+        Op::Rx(q, t) => {
+            let angle = Angle64::from_radians(t);
+            sim.rx(angle, &one(q));
+        }
+        Op::Ry(q, t) => {
+            let angle = Angle64::from_radians(t);
+            sim.ry(angle, &one(q));
+        }
+        Op::Rz(q, t) => {
+            let angle = Angle64::from_radians(t);
+            sim.rz(angle, &one(q));
+        }
+        // 2-qubit gates
+        Op::Cx(a, b) => {
+            sim.cx(&pair(a, b));
+        }
+        Op::Cy(a, b) => {
+            sim.cy(&pair(a, b));
+        }
+        Op::Cz(a, b) => {
+            sim.cz(&pair(a, b));
+        }
+        Op::Swap(a, b) => {
+            sim.swap(&pair(a, b));
+        }
+        Op::Szz(a, b) => {
+            sim.szz(&pair(a, b));
+        }
+        Op::Szzdg(a, b) => {
+            sim.szzdg(&pair(a, b));
+        }
+        Op::Sxx(a, b) => {
+            sim.sxx(&pair(a, b));
+        }
+        Op::Sxxdg(a, b) => {
+            sim.sxxdg(&pair(a, b));
+        }
+        Op::Syy(a, b) => {
+            sim.syy(&pair(a, b));
+        }
+        Op::Syydg(a, b) => {
+            sim.syydg(&pair(a, b));
+        }
+        // 2-qubit rotations
+        Op::Rxx(a, b, t) => {
+            let angle = Angle64::from_radians(t);
+            sim.rxx(angle, &pair(a, b));
+        }
+        Op::Ryy(a, b, t) => {
+            let angle = Angle64::from_radians(t);
+            sim.ryy(angle, &pair(a, b));
+        }
+        Op::Rzz(a, b, t) => {
+            let angle = Angle64::from_radians(t);
+            sim.rzz(angle, &pair(a, b));
+        }
+    }
+}
+
+// Generates `fn $fn_name(sim: &mut $sv, op: Op)`, which applies one `Op` to
+// the GPU simulator type `$sv` with the same gate mapping as `apply_cpu`.
+macro_rules! apply_gpu_impl {
+    ($fn_name:ident, $sv:ty) => {
+        fn $fn_name(sim: &mut $sv, op: Op) {
+            // Target builders, so every arm reads as "gate on target(s)".
+            let one = |q: usize| [QubitId(q)];
+            let pair = |a: usize, b: usize| [(QubitId(a), QubitId(b))];
+
+            match op {
+                // 1-qubit gates
+                Op::H(q) => {
+                    sim.h(&one(q));
+                }
+                Op::X(q) => {
+                    sim.x(&one(q));
+                }
+                Op::Y(q) => {
+                    sim.y(&one(q));
+                }
+                Op::Z(q) => {
+                    sim.z(&one(q));
+                }
+                Op::S(q) => {
+                    sim.sz(&one(q));
+                }
+                Op::Sdg(q) => {
+                    sim.szdg(&one(q));
+                }
+                Op::Sx(q) => {
+                    sim.sx(&one(q));
+                }
+                Op::Sxdg(q) => {
+                    sim.sxdg(&one(q));
+                }
+                Op::Sy(q) => {
+                    sim.sy(&one(q));
+                }
+                Op::Sydg(q) => {
+                    sim.sydg(&one(q));
+                }
+                Op::T(q) => {
+                    sim.t(&one(q));
+                }
+                Op::Tdg(q) => {
+                    sim.tdg(&one(q));
+                }
+                // 1-qubit rotations
+                Op::Rx(q, t) => {
+                    let angle = Angle64::from_radians(t);
+                    sim.rx(angle, &one(q));
+                }
+                Op::Ry(q, t) => {
+                    let angle = Angle64::from_radians(t);
+                    sim.ry(angle, &one(q));
+                }
+                Op::Rz(q, t) => {
+                    let angle = Angle64::from_radians(t);
+                    sim.rz(angle, &one(q));
+                }
+                // 2-qubit gates
+                Op::Cx(a, b) => {
+                    sim.cx(&pair(a, b));
+                }
+                Op::Cy(a, b) => {
+                    sim.cy(&pair(a, b));
+                }
+                Op::Cz(a, b) => {
+                    sim.cz(&pair(a, b));
+                }
+                Op::Swap(a, b) => {
+                    sim.swap(&pair(a, b));
+                }
+                Op::Szz(a, b) => {
+                    sim.szz(&pair(a, b));
+                }
+                Op::Szzdg(a, b) => {
+                    sim.szzdg(&pair(a, b));
+                }
+                Op::Sxx(a, b) => {
+                    sim.sxx(&pair(a, b));
+                }
+                Op::Sxxdg(a, b) => {
+                    sim.sxxdg(&pair(a, b));
+                }
+                Op::Syy(a, b) => {
+                    sim.syy(&pair(a, b));
+                }
+                Op::Syydg(a, b) => {
+                    sim.syydg(&pair(a, b));
+                }
+                // 2-qubit rotations
+                Op::Rxx(a, b, t) => {
+                    let angle = Angle64::from_radians(t);
+                    sim.rxx(angle, &pair(a, b));
+                }
+                Op::Ryy(a, b, t) => {
+                    let angle = Angle64::from_radians(t);
+                    sim.ryy(angle, &pair(a, b));
+                }
+                Op::Rzz(a, b, t) => {
+                    let angle = Angle64::from_radians(t);
+                    sim.rzz(angle, &pair(a, b));
+                }
+            }
+        }
+    };
+}
+
+// One applier per GPU precision.
+apply_gpu_impl!(apply_gpu32, GpuStateVec32);
+apply_gpu_impl!(apply_gpu64, GpuStateVec64);
+
+/// Reads the CPU amplitudes back as `[re, im]` pairs.
+fn cpu_state(sim: &mut StateVecSoA) -> Vec<[f64; 2]> {
+    let mut pairs = Vec::new();
+    for amp in sim.state() {
+        pairs.push([amp.re, amp.im]);
+    }
+    pairs
+}
+
+/// Largest per-amplitude Euclidean distance between `gpu` and `cpu`.
+///
+/// Returns `f64::INFINITY` when the comparison cannot be trusted -- the two
+/// slices differ in length, or some distance is NaN -- so that tolerance
+/// checks on the result fail instead of passing silently.
+fn max_diff(gpu: &[[f64; 2]], cpu: &[[f64; 2]]) -> f64 {
+    // `zip` stops at the shorter slice; a truncated read-back must not pass.
+    if gpu.len() != cpu.len() {
+        return f64::INFINITY;
+    }
+    gpu.iter()
+        .zip(cpu.iter())
+        .map(|([gr, gi], [cr, ci])| {
+            let dr = gr - cr;
+            let di = gi - ci;
+            (dr * dr + di * di).sqrt()
+        })
+        // `f64::max` discards a NaN operand, which would hide NaN amplitudes.
+        .fold(0.0_f64, |worst: f64, d: f64| {
+            if d.is_nan() {
+                f64::INFINITY
+            } else {
+                worst.max(d)
+            }
+        })
+}
+
+// --- Fuzz harness ---
+
+/// Generates `gates` random ops from `seed`, runs them on the CPU reference
+/// and on each GPU backend that can be created, and compares final states.
+///
+/// Returns `Err` describing the first backend whose maximum amplitude
+/// difference exceeds its tolerance (f32 is checked before f64).
+fn fuzz_one_seed(seed: u64, n: usize, gates: usize) -> Result<(), String> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    let mut ops: Vec<Op> = Vec::with_capacity(gates);
+    for _ in 0..gates {
+        ops.push(gen_op(&mut rng, n));
+    }
+
+    // CPU reference
+    let mut cpu = StateVecSoA::new(n);
+    ops.iter().for_each(|&op| apply_cpu(&mut cpu, op));
+    let cpu_s = cpu_state(&mut cpu);
+
+    let width = u32::try_from(n).expect("test N fits in u32");
+
+    // f32
+    if let Ok(mut g32) = GpuStateVec32::new(width) {
+        ops.iter().for_each(|&op| apply_gpu32(&mut g32, op));
+        let mut widened: Vec<[f64; 2]> = Vec::new();
+        for [re, im] in g32.state() {
+            widened.push([f64::from(re), f64::from(im)]);
+        }
+        let d = max_diff(&widened, &cpu_s);
+        if d > TOL_F32 {
+            return Err(format!("seed={seed} N={n} G={gates} f32 diff={d:.3e}"));
+        }
+    }
+
+    // f64
+    if let Ok(mut g64) = GpuStateVec64::new(width) {
+        ops.iter().for_each(|&op| apply_gpu64(&mut g64, op));
+        let amplitudes = g64.state();
+        let d = max_diff(&amplitudes, &cpu_s);
+        if d > TOL_F64 {
+            return Err(format!("seed={seed} N={n} G={gates} f64 diff={d:.3e}"));
+        }
+    }
+
+    Ok(())
+}
+
+/// Fuzzes 20 seeds of 40-gate circuits at N=4 and fails listing every seed
+/// that diverged from the CPU reference.
+#[test]
+fn fuzz_persistent_path() {
+    // N=4 is below persistent_max_qubits on any realistic GPU.
+    let fails: Vec<String> = (0..20u64)
+        .filter_map(|seed| fuzz_one_seed(seed, 4, 40).err())
+        .collect();
+    assert!(
+        fails.is_empty(),
+        "{} persistent-path fuzz failures:\n  {}",
+        fails.len(),
+        fails.join("\n  ")
+    );
+}
+
+/// Fuzzes 15 seeds of 30-gate circuits at N=14 and fails listing every seed
+/// that diverged from the CPU reference.
+#[test]
+fn fuzz_dispatched_path() {
+    // N=14 is well above typical persistent_max_qubits (~12) -- forces dispatched path.
+    let fails: Vec<String> = (100..115u64)
+        .filter_map(|seed| fuzz_one_seed(seed, 14, 30).err())
+        .collect();
+    assert!(
+        fails.is_empty(),
+        "{} dispatched-path fuzz failures:\n  {}",
+        fails.len(),
+        fails.join("\n  ")
+    );
+}
+
+/// Fuzzes 10 seeds of 30-gate circuits at each of N=2 and N=3 and fails
+/// listing every (seed, N) that diverged from the CPU reference.
+#[test]
+fn fuzz_small_n_stress() {
+    // Stress 2q-gate qubit-pair mask handling with N=2 and N=3 where
+    // off-by-one / low-stride scalar-vs-SIMD selection would show up.
+    let mut fails: Vec<String> = Vec::new();
+    for n in [2usize, 3] {
+        fails.extend((200..210u64).filter_map(|seed| fuzz_one_seed(seed, n, 30).err()));
+    }
+    assert!(
+        fails.is_empty(),
+        "{} small-N fuzz failures:\n  {}",
+        fails.len(),
+        fails.join("\n  ")
+    );
+}
+
+// --- Angle edge cases ---
+
+/// Checks each rotation gate at a set of edge-case angles on a 5-qubit
+/// register seeded with H on every qubit: the f64 GPU result must match the
+/// CPU reference to 1e-4. Combinations are skipped when the GPU register
+/// cannot be created.
+#[test]
+fn angle_edge_cases() {
+    use std::f64::consts::PI;
+
+    // Applies the rotation named `$gate` with angle `$t` to `$sim`: 1-qubit
+    // gates act on qubit 0, 2-qubit gates on the pair (0, 1). Unknown names
+    // are ignored.
+    macro_rules! rotate {
+        ($sim:expr, $gate:expr, $t:expr) => {
+            match $gate {
+                "rx" => {
+                    $sim.rx($t, &[QubitId(0)]);
+                }
+                "ry" => {
+                    $sim.ry($t, &[QubitId(0)]);
+                }
+                "rz" => {
+                    $sim.rz($t, &[QubitId(0)]);
+                }
+                "rxx" => {
+                    $sim.rxx($t, &[(QubitId(0), QubitId(1))]);
+                }
+                "ryy" => {
+                    $sim.ryy($t, &[(QubitId(0), QubitId(1))]);
+                }
+                "rzz" => {
+                    $sim.rzz($t, &[(QubitId(0), QubitId(1))]);
+                }
+                _ => {}
+            }
+        };
+    }
+
+    // Angles that commonly hit edge behavior in trig implementations.
+    let angles = [0.0, PI, -PI, PI / 2.0, -PI / 2.0, 2.0 * PI, 1e-10, -1e-10];
+    let n: usize = 5;
+    let qubits: Vec<QubitId> = (0..n).map(QubitId).collect();
+
+    let gate_kind = ["rx", "ry", "rz", "rxx", "ryy", "rzz"];
+    for (gi, gate) in gate_kind.iter().enumerate() {
+        for (ai, &theta) in angles.iter().enumerate() {
+            let t = Angle64::from_radians(theta);
+
+            // CPU reference.
+            let mut cpu = StateVecSoA::new(n);
+            cpu.h(&qubits);
+            rotate!(cpu, *gate, t);
+            let cpu_s = cpu_state(&mut cpu);
+
+            // f64 GPU, when available.
+            let created = GpuStateVec64::new(u32::try_from(n).expect("test N fits in u32"));
+            if let Ok(mut gpu) = created {
+                gpu.h(&qubits);
+                rotate!(gpu, *gate, t);
+                let gs = gpu.state();
+                let d = max_diff(&gs, &cpu_s);
+                assert!(
+                    d < 1e-4,
+                    "angle {theta} gate {gate} ({gi},{ai}): f64 diff={d:.3e}"
+                );
+            }
+        }
+    }
+}
+
+// --- Measurement determinism ---
+
+/// Flips qubits 1 and 3 of a fresh 4-qubit register on each GPU backend and
+/// checks that `mz` on all four qubits reports the prepared bits, each flagged
+/// deterministic.
+///
+/// Returns as soon as a backend cannot be created, so an f32 creation failure
+/// also skips the f64 half.
+#[test]
+fn measurement_deterministic_on_basis_states() {
+    let n: usize = 4;
+    // Prepared bit per qubit index; the expected mz outcomes.
+    let prep_bits = [false, true, false, true];
+    let targets = [QubitId(0), QubitId(1), QubitId(2), QubitId(3)];
+
+    // f32 backend
+    let mut g32 = match GpuStateVec32::new(u32::try_from(n).expect("test N fits in u32")) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    for (q, _) in prep_bits.iter().enumerate().filter(|&(_, &set)| set) {
+        g32.x(&[QubitId(q)]);
+    }
+    let results = g32.mz(&targets);
+    for (q, r) in results.iter().enumerate() {
+        assert_eq!(
+            r.outcome,
+            prep_bits[q],
+            "mz on prepared |{}> qubit {q} got {}",
+            to_bits(&prep_bits),
+            r.outcome
+        );
+        assert!(
+            r.is_deterministic,
+            "|basis state> measurement must be deterministic"
+        );
+    }
+
+    // f64 backend
+    let mut g64 = match GpuStateVec64::new(u32::try_from(n).expect("test N fits in u32")) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    for (q, _) in prep_bits.iter().enumerate().filter(|&(_, &set)| set) {
+        g64.x(&[QubitId(q)]);
+    }
+    let results = g64.mz(&targets);
+    for (q, r) in results.iter().enumerate() {
+        assert_eq!(r.outcome, prep_bits[q], "f64 qubit {q}");
+        assert!(r.is_deterministic);
+    }
+}
+
+/// Renders `bits` as a string of '0'/'1' characters in reverse index order
+/// (the last element is printed first).
+fn to_bits(bits: &[bool]) -> String {
+    let mut rendered = String::with_capacity(bits.len());
+    for &bit in bits.iter().rev() {
+        rendered.push(if bit { '1' } else { '0' });
+    }
+    rendered
+}
diff --git a/crates/pecos-gpu-sims/tests/influence_sampler_audit.rs b/crates/pecos-gpu-sims/tests/influence_sampler_audit.rs
new file mode 100644
index 000000000..530d5b58d
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/influence_sampler_audit.rs
@@ -0,0 +1,205 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Targeted audit of `GpuInfluenceSampler`.
+//!
+//! Semantics: for each shot, each location has probability `p_error` of a
+//! fault. If a fault fires, a random Pauli (X/Y/Z, uniformly) is applied.
+//! Each fault toggles a CSR-encoded set of detectors and logicals.
+//!
+//! We don't have a CPU reference implementation to cross-check against.
+//! Instead, we use tight edge-case tests + distributional sanity checks.
+
+use pecos_gpu_sims::{GpuInfluenceMapData, GpuInfluenceSampler};
+
+/// Build an influence map with `n_loc` locations, `n_det` detectors, and
+/// `n_log` logicals, where for every location `k`:
+///   - an X fault toggles detector `k % n_det` and no logical
+///   - a Z fault toggles logical `k % n_log` and no detector
+///   - a Y fault toggles both of the above
+///
+/// Each Pauli gets its own detector table and logical table in CSR form
+/// (`n_loc + 1` row offsets plus the flat target list), passed to
+/// `GpuInfluenceMapData::from_csr`.
+#[allow(clippy::cast_possible_truncation)] // row counts come from small test inputs
+fn simple_diagonal_map(n_loc: u32, n_det: u32, n_log: u32) -> GpuInfluenceMapData {
+    let rows = (n_loc + 1) as usize;
+
+    // Rows holding exactly one entry each: row k spans [k, k + 1).
+    let one_per_row = || (0..=n_loc).collect::<Vec<u32>>();
+    // Rows holding nothing: every offset stays 0.
+    let empty_rows = || vec![0u32; rows];
+    // Flat target lists for the one-entry-per-row tables.
+    let det_targets = || (0..n_loc).map(|k| k % n_det).collect::<Vec<u32>>();
+    let log_targets = || (0..n_loc).map(|k| k % n_log).collect::<Vec<u32>>();
+
+    // Detector tables: X and Y hit one detector per location, Z hits none.
+    let det_off_x = one_per_row();
+    let det_dat_x = det_targets();
+    let det_off_y = one_per_row();
+    let det_dat_y = det_targets();
+    let det_off_z = empty_rows();
+    let det_dat_z = Vec::<u32>::new();
+
+    // Logical tables: Y and Z hit one logical per location, X hits none.
+    let log_off_x = empty_rows();
+    let log_dat_x = Vec::<u32>::new();
+    let log_off_y = one_per_row();
+    let log_dat_y = log_targets();
+    let log_off_z = one_per_row();
+    let log_dat_z = log_targets();
+
+    GpuInfluenceMapData::from_csr(
+        n_loc, n_det, n_log, det_off_x, det_dat_x, det_off_y, det_dat_y, det_off_z, det_dat_z,
+        log_off_x, log_dat_x, log_off_y, log_dat_y, log_off_z, log_dat_z,
+    )
+}
+
+/// True when every word of `flips` is zero (vacuously true for an empty slice).
+fn no_flips(flips: &[u32]) -> bool {
+    !flips.iter().any(|&word| word != 0)
+}
+
+#[test]
+fn zero_prob_no_flips() {
+    let map = simple_diagonal_map(32, 8, 4);
+    let Ok(mut sampler) = GpuInfluenceSampler::new(&map, 42) else {
+        return;
+    };
+    let result = sampler.sample_uniform(200, 0.0);
+
+    assert_eq!(result.count_logical_errors(), 0);
+    for shot in 0..200 {
+        assert!(
+            !result.has_logical_error(shot),
+            "p=0 shot {shot} reports logical error"
+        );
+        let flips = result.detector_flips_for_shot(shot);
+        assert!(
+            no_flips(&flips),
+            "p=0 shot {shot} has detector flips: {flips:?}"
+        );
+    }
+}
+
+/// With an empty influence map nothing may toggle, even at `p_error = 1.0`.
+/// Skipped when the sampler cannot be created.
+#[test]
+fn empty_map_no_flips() {
+    let map = GpuInfluenceMapData::empty();
+    let mut sampler = match GpuInfluenceSampler::new(&map, 42) {
+        Ok(sampler) => sampler,
+        Err(_) => return,
+    };
+    let result = sampler.sample_uniform(64, 1.0);
+
+    assert_eq!(result.count_logical_errors(), 0);
+    for shot in 0..64 {
+        let logical_error = result.has_logical_error(shot);
+        assert!(!logical_error);
+        let flips = result.detector_flips_for_shot(shot);
+        assert!(no_flips(&flips));
+    }
+}
+
+/// Distributional sanity check at `p_error = 1.0`: with 16 locations that all
+/// map to detector 0 and logical 0, the number of shots (out of 256) showing a
+/// detector flip, and the number showing a logical error, must each lie
+/// strictly between 16 and 240. Skipped when the sampler cannot be created.
+#[test]
+fn full_prob_saturates_parity() {
+    // Every location fires in every shot, each with an independent X/Y/Z
+    // draw. Detector 0 is toggled by X and Y faults and logical 0 by Y and Z
+    // faults, so each shot's parity over 16 toggles should come out roughly
+    // 50/50 rather than all-set or all-clear.
+    let map = simple_diagonal_map(16, 1, 1); // all locations -> detector 0, logical 0
+    let mut sampler = match GpuInfluenceSampler::new(&map, 7) {
+        Ok(sampler) => sampler,
+        Err(_) => return,
+    };
+    let result = sampler.sample_uniform(256, 1.0);
+
+    let mut any_detector_flip = 0usize;
+    let mut any_logical_error = 0usize;
+    for shot in 0..256 {
+        let flipped = !no_flips(&result.detector_flips_for_shot(shot));
+        any_detector_flip += usize::from(flipped);
+        any_logical_error += usize::from(result.has_logical_error(shot));
+    }
+
+    // Expect a healthy mix: neither (almost) none nor (almost) all shots.
+    assert!(
+        any_detector_flip > 16,
+        "too few detector flips: {any_detector_flip}/256"
+    );
+    assert!(
+        any_detector_flip < 240,
+        "too many detector flips: {any_detector_flip}/256"
+    );
+    assert!(
+        any_logical_error > 16,
+        "too few logical errors: {any_logical_error}/256"
+    );
+    assert!(
+        any_logical_error < 240,
+        "too many logical errors: {any_logical_error}/256"
+    );
+}
+
+/// Two samplers built from the same map and seed must return identical
+/// results shot by shot. Skipped when either sampler cannot be created.
+#[test]
+fn determinism_with_same_seed() {
+    let map = simple_diagonal_map(32, 8, 4);
+    let mut a = match GpuInfluenceSampler::new(&map, 99) {
+        Ok(sampler) => sampler,
+        Err(_) => return,
+    };
+    let mut b = match GpuInfluenceSampler::new(&map, 99) {
+        Ok(sampler) => sampler,
+        Err(_) => return,
+    };
+
+    let ra = a.sample_uniform(64, 0.1);
+    let rb = b.sample_uniform(64, 0.1);
+
+    let total_a = ra.count_logical_errors();
+    let total_b = rb.count_logical_errors();
+    assert_eq!(total_a, total_b);
+
+    for shot in 0..64 {
+        let logical_a = ra.has_logical_error(shot);
+        let logical_b = rb.has_logical_error(shot);
+        assert_eq!(logical_a, logical_b, "shot {shot} logical mismatch");
+
+        let flips_a = ra.detector_flips_for_shot(shot);
+        let flips_b = rb.detector_flips_for_shot(shot);
+        assert_eq!(flips_a, flips_b, "shot {shot} detector mismatch");
+    }
+}
+
+#[test]
+fn scaling_with_p_error() {
+    // Logical error rate should monotonically increase with p.
+    let map = simple_diagonal_map(32, 8, 4);
+    let Ok(mut sampler) = GpuInfluenceSampler::new(&map, 42) else {
+        return;
+    };
+
+    let r_low = sampler.sample_uniform(512, 0.01);
+    let r_mid = sampler.sample_uniform(512, 0.1);
+    let r_high = sampler.sample_uniform(512, 0.5);
+
+    let cnt_low = r_low.count_logical_errors();
+    let cnt_mid = r_mid.count_logical_errors();
+    let cnt_high = r_high.count_logical_errors();
+
+    assert!(
+        cnt_low < cnt_mid,
+        "low p={cnt_low} should have fewer errors than mid p={cnt_mid}"
+    );
+    assert!(
+        cnt_mid < cnt_high,
+        "mid p={cnt_mid} should have fewer errors than high p={cnt_high}"
+    );
+}
diff --git a/crates/pecos-gpu-sims/tests/large_n_audit.rs b/crates/pecos-gpu-sims/tests/large_n_audit.rs
new file mode 100644
index 000000000..d0e96b693
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/large_n_audit.rs
@@ -0,0 +1,169 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Large-N GPU state-vector correctness at N=18, 20, 22. Previous audits
+//! only cover N<=14 (`gate_audit`) and N<=14 fuzz. This one exercises the
+//! workgroup dispatch / indexing at state sizes 2 MB, 8 MB, and 32 MB where
+//! any 2^N overflow, workgroup-stride, or buffer-layout bug would surface.
+//!
+//! Reference: CPU `StateVecSoA`. Tolerance: f64 1e-5 (f32 gate constants limit
+//! precision regardless of backend), f32 5e-3.
+
+use pecos_core::{Angle64, QubitId};
+use pecos_gpu_sims::{GpuStateVec32, GpuStateVec64};
+use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable, StateVecSoA};
+use rand::rngs::StdRng;
+use rand::{RngExt, SeedableRng};
+
+#[derive(Clone, Copy)]
+enum Op {
+    H(usize), // replayed as `h` on the given qubit index
+    Rz(usize, f64), // replayed as `rz`; the f64 is an angle in radians
+    Cx(usize, usize), // replayed as `cx` on the pair, in variant order
+    Cz(usize, usize), // replayed as `cz` on the pair, in variant order
+}
+
+fn gen_op(rng: &mut StdRng, n: usize) -> Op {
+    // Two-qubit ops draw one index, then redraw the other until it differs.
+    let distinct_pair = |rng: &mut StdRng| {
+        let first = rng.random_range(0..n);
+        loop {
+            let second = rng.random_range(0..n);
+            if second != first {
+                return (first, second);
+            }
+        }
+    };
+    let kind = rng.random_range(0u32..4);
+    if kind == 0 {
+        Op::H(rng.random_range(0..n))
+    } else if kind == 1 {
+        let qubit = rng.random_range(0..n);
+        Op::Rz(qubit, rng.random_range(-3.0..3.0))
+    } else {
+        let (a, b) = distinct_pair(rng);
+        if kind == 2 { Op::Cx(a, b) } else { Op::Cz(a, b) }
+    }
+}
+
+fn apply_cpu(sim: &mut StateVecSoA, op: Op) {
+    // Replays a single `Op` on the CPU reference simulator. Whatever the
+    // gate methods return is discarded.
+    match op {
+        Op::H(qubit) => _ = sim.h(&[QubitId(qubit)]),
+        Op::Rz(qubit, theta) => _ = sim.rz(Angle64::from_radians(theta), &[QubitId(qubit)]),
+        Op::Cx(first, second) => {
+            let pair = (QubitId(first), QubitId(second));
+            sim.cx(&[pair]);
+        }
+        Op::Cz(first, second) => {
+            let pair = (QubitId(first), QubitId(second));
+            sim.cz(&[pair]);
+        }
+    }
+}
+
+fn apply_gpu32(sim: &mut GpuStateVec32, op: Op) {
+    // Replays a single `Op` on the f32 GPU simulator. Whatever the gate
+    // methods return is discarded.
+    match op {
+        Op::H(qubit) => _ = sim.h(&[QubitId(qubit)]),
+        Op::Rz(qubit, theta) => _ = sim.rz(Angle64::from_radians(theta), &[QubitId(qubit)]),
+        Op::Cx(first, second) => {
+            let pair = (QubitId(first), QubitId(second));
+            sim.cx(&[pair]);
+        }
+        Op::Cz(first, second) => {
+            let pair = (QubitId(first), QubitId(second));
+            sim.cz(&[pair]);
+        }
+    }
+}
+
+fn apply_gpu64(sim: &mut GpuStateVec64, op: Op) {
+    // Replays a single `Op` on the f64 GPU simulator. Whatever the gate
+    // methods return is discarded.
+    match op {
+        Op::H(qubit) => _ = sim.h(&[QubitId(qubit)]),
+        Op::Rz(qubit, theta) => _ = sim.rz(Angle64::from_radians(theta), &[QubitId(qubit)]),
+        Op::Cx(first, second) => {
+            let pair = (QubitId(first), QubitId(second));
+            sim.cx(&[pair]);
+        }
+        Op::Cz(first, second) => {
+            let pair = (QubitId(first), QubitId(second));
+            sim.cz(&[pair]);
+        }
+    }
+}
+
+fn run_cross_check(n: usize, gates: usize, seed: u64) {
+    // Replays one seeded random circuit on the CPU reference and on each GPU
+    // backend that can be constructed, then bounds the worst amplitude error.
+    let mut rng = StdRng::seed_from_u64(seed);
+    let mut ops: Vec<Op> = Vec::with_capacity(gates);
+    for _ in 0..gates {
+        ops.push(gen_op(&mut rng, n));
+    }
+
+    // CPU reference amplitudes as [re, im] pairs.
+    let mut cpu = StateVecSoA::new(n);
+    ops.iter().for_each(|&op| apply_cpu(&mut cpu, op));
+    let cpu_state: Vec<[f64; 2]> = cpu.state().into_iter().map(|c| [c.re, c.im]).collect();
+
+    let n_u32 = u32::try_from(n).expect("test N fits in u32");
+
+    // f32 backend: widened to f64 before comparing; skipped if creation fails.
+    if let Ok(mut g32) = GpuStateVec32::new(n_u32) {
+        ops.iter().for_each(|&op| apply_gpu32(&mut g32, op));
+        let widened: Vec<[f64; 2]> = g32
+            .state()
+            .into_iter()
+            .map(|[re, im]| [f64::from(re), f64::from(im)])
+            .collect();
+        let d = max_diff(&widened, &cpu_state);
+        assert!(d < 5e-3, "N={n} G={gates} seed={seed} f32 diff={d:.3e}");
+    }
+
+    // f64 backend: compared directly; skipped if creation fails.
+    if let Ok(mut g64) = GpuStateVec64::new(n_u32) {
+        ops.iter().for_each(|&op| apply_gpu64(&mut g64, op));
+        let gpu_state = g64.state();
+        let d = max_diff(&gpu_state, &cpu_state);
+        assert!(d < 1e-5, "N={n} G={gates} seed={seed} f64 diff={d:.3e}");
+    }
+}
+
+fn max_diff(a: &[[f64; 2]], b: &[[f64; 2]]) -> f64 {
+    // Largest Euclidean distance between paired [re, im] entries, starting
+    // from 0.0. Entries beyond the shorter slice are never compared.
+    let mut worst = 0.0_f64;
+    for (lhs, rhs) in a.iter().zip(b) {
+        let (dr, di) = (lhs[0] - rhs[0], lhs[1] - rhs[1]);
+        worst = f64::max(worst, (dr * dr + di * di).sqrt());
+    }
+    worst
+}
+
+#[test]
+fn n18_cross_check() {
+    // 2 MB state; the same 40-gate budget is run under two different seeds.
+    run_cross_check(18, 40, 0x1818);
+    run_cross_check(18, 40, 0x1819);
+}
+
+#[test]
+fn n20_cross_check() {
+    // 8 MB state; one 30-gate circuit under a single seed.
+    run_cross_check(20, 30, 0x2020);
+}
+
+#[test]
+fn n22_cross_check() {
+    // 32 MB state -- heaviest test in the suite, single 20-gate run
+    run_cross_check(22, 20, 0x2222);
+}
diff --git a/crates/pecos-gpu-sims/tests/noisy_sampler_stats.rs b/crates/pecos-gpu-sims/tests/noisy_sampler_stats.rs
new file mode 100644
index 000000000..c68d2d88e
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/noisy_sampler_stats.rs
@@ -0,0 +1,166 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Statistical audit of `GpuNoisySampler` + `DepolarizingNoiseSampler`.
+//!
+//! Existing tests check API surface and deterministic seed-replay. These
+//! add distribution-shape checks that would catch a noise-rate off by a
+//! factor (e.g. the sampler applying p^2 instead of p, or applying per-gate
+//! when it should be per-shot).
+
+use pecos_gpu_sims::{CircuitBuilder, DepolarizingNoiseSampler, GpuNoisySampler};
+
+/// For the trivial circuit `mz(0)` on |0> with measurement-error probability
+/// `p_meas`, the fraction of shots returning 1 should be close to `p_meas`.
+/// The circuit is described only inside the closure handed to `sample`.
+#[test]
+#[allow(clippy::cast_precision_loss)] // shots <= 4096, exact in f64
+fn measurement_error_rate_matches_p() {
+    let shots = 4096usize;
+    for &p in &[0.0_f64, 0.05, 0.1, 0.3] {
+        let sampler = DepolarizingNoiseSampler::with_seed(0.0, 0.0, p, 0x1234);
+        let mut gpu = GpuNoisySampler::new(1, sampler);
+
+        let results = gpu
+            .sample(shots, |b| {
+                b.mz(&[0]);
+            })
+            .expect("sample failed");
+
+        // A shot with no recorded outcome counts as 0.
+        let ones = results
+            .iter()
+            .filter(|shot| shot.outcomes.first().copied().unwrap_or(false))
+            .count();
+        let observed = ones as f64 / shots as f64;
+        // Binomial(shots, p) has sigma = sqrt(p(1-p)/shots). Accept 4 sigma plus
+        // an absolute 0.01 slack; without it p = 0 (sigma = 0) could never pass.
+        let sigma = (p * (1.0 - p) / shots as f64).sqrt();
+        let tolerance = 4.0 * sigma + 0.01;
+        let delta = (observed - p).abs();
+        assert!(
+            delta < tolerance,
+            "p_meas={p}: observed={observed:.4} expected~{p} (4 sigma = {:.4})",
+            tolerance
+        );
+    }
+}
+
+/// With all three noise arguments set to 0.0, `mz(0)` on the computational
+/// basis state must report 0 on every one of the 256 shots.
+#[test]
+fn zero_noise_zero_ones() {
+    let noiseless = DepolarizingNoiseSampler::with_seed(0.0, 0.0, 0.0, 0x5678);
+    let mut gpu = GpuNoisySampler::new(1, noiseless);
+    // The whole circuit is a single Z measurement of qubit 0.
+    let measure_q0 = |b: &mut CircuitBuilder| {
+        b.mz(&[0]);
+    };
+    let results = gpu.sample(256, measure_q0).expect("sample failed");
+    results
+        .iter()
+        .for_each(|shot| assert!(!shot.outcomes[0], "zero noise should give 0 outcome"));
+}
+
+/// For a single-qubit depolarizing channel with probability p1, a `noise_1q`
+/// applied to |0> should flip the Z measurement with probability ~2p/3 (X and
+/// Y flip the Z basis; Z doesn't). Accepts 4 sigma plus an absolute 0.01.
+#[test]
+#[allow(clippy::cast_precision_loss)] // shots bounded, exact in f64
+fn depol1_flip_rate() {
+    let shots = 4096usize;
+    for p in [0.1_f64, 0.3] {
+        let sampler = DepolarizingNoiseSampler::with_seed(p, 0.0, 0.0, 0xbeef);
+        let mut gpu = GpuNoisySampler::new(1, sampler);
+        let noise_then_measure = |b: &mut CircuitBuilder| {
+            b.noise_1q(&[0]);
+            b.mz(&[0]);
+        };
+        let results = gpu.sample(shots, noise_then_measure).expect("sample failed");
+        let mut ones = 0usize;
+        for shot in results.iter() {
+            if shot.outcomes[0] {
+                ones += 1;
+            }
+        }
+        let observed = ones as f64 / shots as f64;
+        let expected = 2.0 * p / 3.0;
+        let sigma = (expected * (1.0 - expected) / shots as f64).sqrt();
+        let tolerance = 4.0 * sigma + 0.01;
+        let within = (observed - expected).abs() < tolerance;
+        assert!(
+            within,
+            "p1={p}: flip rate observed={observed:.4} expected {expected:.4}"
+        );
+    }
+}
+
+/// Bell pair followed by `noise_2q`, then both qubits measured. The noisy run
+/// differs from the clean one only in the second sampler argument (p2 = 0.5
+/// versus 0.0), so a visible drop in how often the two outcomes agree shows
+/// that p2 reaches the 2q noise path (rather than p1 being re-used).
+#[test]
+#[allow(clippy::cast_precision_loss)] // shots bounded, exact in f64
+fn depol2_reduces_bell_correlation() {
+    let shots = 4096usize;
+    let quiet = DepolarizingNoiseSampler::with_seed(0.0, 0.0, 0.0, 0x100);
+    let loud = DepolarizingNoiseSampler::with_seed(0.0, 0.5, 0.0, 0x100);
+
+    let bell_then_noise = |b: &mut CircuitBuilder| {
+        b.h(&[0]);
+        b.cx(&[(0, 1)]);
+        b.noise_2q(&[(0, 1)]);
+        b.mz(&[0, 1]);
+    };
+
+    let mut gpu_clean = GpuNoisySampler::new(2, quiet);
+    let clean = gpu_clean
+        .sample(shots, bell_then_noise)
+        .expect("clean sample");
+    let mut gpu_noisy = GpuNoisySampler::new(2, loud);
+    let noisy = gpu_noisy
+        .sample(shots, bell_then_noise)
+        .expect("noisy sample");
+
+    // Tally the shots in which both measured bits agree.
+    let mut clean_agree = 0usize;
+    for shot in clean.iter() {
+        if shot.outcomes[0] == shot.outcomes[1] {
+            clean_agree += 1;
+        }
+    }
+    let mut noisy_agree = 0usize;
+    for shot in noisy.iter() {
+        if shot.outcomes[0] == shot.outcomes[1] {
+            noisy_agree += 1;
+        }
+    }
+
+    // The clean pair must agree on more than 99% of shots; the noisy pair
+    // must fall below 95%.
+    let clean_rate = clean_agree as f64 / shots as f64;
+    let noisy_rate = noisy_agree as f64 / shots as f64;
+    assert!(clean_rate > 0.99, "clean Bell correlation {clean_rate} should be ~1");
+    assert!(
+        noisy_rate < 0.95,
+        "noisy (p2=0.5) Bell correlation {noisy_rate} should be visibly below clean"
+    );
+}
+
+/// Determinism: same seed => same `ShotResult` stream, including its length.
+#[test]
+fn same_seed_same_results() {
+    let p1 = 0.1;
+    let s1 = DepolarizingNoiseSampler::with_seed(p1, 0.0, 0.0, 42);
+    let s2 = DepolarizingNoiseSampler::with_seed(p1, 0.0, 0.0, 42);
+    let mut g1 = GpuNoisySampler::with_seed(1, s1, 777);
+    let mut g2 = GpuNoisySampler::with_seed(1, s2, 777);
+    let circuit = |b: &mut CircuitBuilder| {
+        b.noise_1q(&[0]);
+        b.mz(&[0]);
+    };
+    let r1 = g1.sample(128, circuit).expect("g1 sample");
+    let r2 = g2.sample(128, circuit).expect("g2 sample");
+    // `zip` stops at the shorter stream, so the counts are compared first.
+    assert_eq!(
+        r1.iter().count(),
+        r2.iter().count(),
+        "same seed should give the same number of shots"
+    );
+    for (a, b) in r1.iter().zip(r2.iter()) {
+        assert_eq!(a.outcomes, b.outcomes, "same seed should match");
+    }
+}
diff --git a/crates/pecos-gpu-sims/tests/pauli_prop_audit.rs b/crates/pecos-gpu-sims/tests/pauli_prop_audit.rs
new file mode 100644
index 000000000..f8971c848
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/pauli_prop_audit.rs
@@ -0,0 +1,477 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Cross-check `GpuPauliProp` (shot 0) against the CPU `PauliProp` reference.
+//!
+//! Both simulators propagate a single Pauli fault through a Clifford circuit.
+//! CPU `PauliProp` tracks one X-set and one Z-set; GPU `GpuPauliProp` tracks
+//! per-shot X and Z fault bitmaps. With shots=1 and a single seed fault, they
+//! must agree on which qubits end up with X-, Z-, or Y-components.
+//!
+//! The correspondence used:
+//!   GPU `measure_z_flips`[0][q]  <=>  CPU `contains_x(q)` || `contains_y(q)`
+//!   GPU `measure_x_flips`[0][q]  <=>  CPU `contains_z(q)` || `contains_y(q)`
+//! (a Z-basis measurement of qubit q flips iff the tracked Pauli has X or Y on q;
+//! an X-basis measurement flips iff the Pauli has Z or Y on q).
+
+use pecos_core::QubitId;
+use pecos_gpu_sims::GpuPauliProp;
+use pecos_simulators::{CliffordGateable, PauliProp};
+use rand::rngs::StdRng;
+use rand::{RngExt, SeedableRng};
+
+#[derive(Clone, Copy, Debug)] // Debug: ops are printed with `{op:?}` in the traces
+enum Op {
+    H(usize), // single-qubit variants carry one qubit index
+    Sz(usize),
+    Szdg(usize),
+    X(usize),
+    Y(usize),
+    Z(usize),
+    Cx(usize, usize), // two-qubit variants are generated with distinct indices
+    Cz(usize, usize),
+    Swap(usize, usize),
+}
+
+fn pick_two(rng: &mut StdRng, n: usize) -> (usize, usize) {
+    // Draw both indices once, then redraw the second until the two differ.
+    let (first, mut second) = (rng.random_range(0..n), rng.random_range(0..n));
+    while second == first {
+        second = rng.random_range(0..n);
+    }
+    (first, second)
+}
+
+fn gen_op(rng: &mut StdRng, n: usize) -> Op {
+    // Kinds 0..=5 draw one qubit; kinds 6..=8 draw a distinct pair.
+    let kind = rng.random_range(0u32..9);
+    if kind < 6 {
+        let qubit = rng.random_range(0..n);
+        match kind {
+            0 => Op::H(qubit),
+            1 => Op::Sz(qubit),
+            2 => Op::Szdg(qubit),
+            3 => Op::X(qubit),
+            4 => Op::Y(qubit),
+            _ => Op::Z(qubit),
+        }
+    } else {
+        let (first, second) = pick_two(rng, n);
+        match kind {
+            6 => Op::Cx(first, second),
+            7 => Op::Cz(first, second),
+            _ => Op::Swap(first, second),
+        }
+    }
+}
+
+fn apply_cpu(sim: &mut PauliProp, op: Op) {
+    // Replays one `Op` on the CPU reference through its `QubitId`-based gate
+    // methods; whatever those methods return is discarded.
+    match op {
+        Op::H(qubit) => _ = sim.h(&[QubitId(qubit)]),
+        Op::Sz(qubit) => _ = sim.sz(&[QubitId(qubit)]),
+        Op::Szdg(qubit) => _ = sim.szdg(&[QubitId(qubit)]),
+        Op::X(qubit) => _ = sim.x(&[QubitId(qubit)]),
+        Op::Y(qubit) => _ = sim.y(&[QubitId(qubit)]),
+        Op::Z(qubit) => _ = sim.z(&[QubitId(qubit)]),
+        Op::Cx(first, second) => _ = sim.cx(&[(QubitId(first), QubitId(second))]),
+        Op::Cz(first, second) => _ = sim.cz(&[(QubitId(first), QubitId(second))]),
+        Op::Swap(first, second) => _ = sim.swap(&[(QubitId(first), QubitId(second))]),
+    }
+}
+
+fn apply_gpu(sim: &mut GpuPauliProp, op: Op) {
+    // Replays one `Op` on the GPU simulator, which takes plain `usize`
+    // indices (or index pairs) rather than `QubitId`s.
+    use Op::{Cx, Cz, H, Swap, Sz, Szdg, X, Y, Z};
+    match op {
+        H(qubit) => sim.h(&[qubit]),
+        Sz(qubit) => sim.sz(&[qubit]),
+        Szdg(qubit) => sim.szdg(&[qubit]),
+        X(qubit) => sim.x(&[qubit]),
+        Y(qubit) => sim.y(&[qubit]),
+        Z(qubit) => sim.z(&[qubit]),
+        Cx(first, second) => sim.cx(&[(first, second)]),
+        Cz(first, second) => sim.cz(&[(first, second)]),
+        Swap(first, second) => sim.swap(&[(first, second)]),
+    }
+}
+
+fn run_cross_check(seed: u64, n: usize, gates: usize, fault_qubit: usize, fault_kind: &str) {
+    // Seeds one fault ("x", "z" or "y") on both simulators, replays the same
+    // seeded random circuit on each, and compares per-qubit flip predictions.
+    // Returns without asserting when the GPU simulator cannot be created.
+    let mut gpu = match GpuPauliProp::with_seed(n, 1, seed) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let mut cpu = PauliProp::new();
+
+    // Inject the same fault on both sides; any other kind string is a bug.
+    let fault_site = [fault_qubit];
+    if fault_kind == "x" {
+        gpu.inject_x_fault(fault_qubit);
+        cpu.track_x(&fault_site);
+    } else if fault_kind == "z" {
+        gpu.inject_z_fault(fault_qubit);
+        cpu.track_z(&fault_site);
+    } else if fault_kind == "y" {
+        gpu.inject_y_fault(fault_qubit);
+        cpu.track_y(&fault_site);
+    } else {
+        unreachable!();
+    }
+
+    // Each op is drawn from an RNG seeded with `seed` and applied to the GPU
+    // simulator first, then to the CPU reference.
+    let mut rng = StdRng::seed_from_u64(seed);
+    for _ in 0..gates {
+        let op = gen_op(&mut rng, n);
+        apply_gpu(&mut gpu, op);
+        apply_cpu(&mut cpu, op);
+    }
+
+    // Read the final frame for every qubit; shot 0 is the only shot.
+    let qubits: Vec<usize> = (0..n).collect();
+    let z_flips = gpu.measure_z_flips(&qubits);
+    let x_flips = gpu.measure_x_flips(&qubits);
+
+    for q in 0..n {
+        // Z-basis flip on the GPU side <=> X or Y component on the CPU side.
+        let (gpu_z, cpu_z) = (z_flips[0][q], cpu.contains_x(q) || cpu.contains_y(q));
+        assert_eq!(
+            gpu_z, cpu_z,
+            "fault={fault_kind}@{fault_qubit} seed={seed} N={n} G={gates} Z-flip mismatch at q={q}: gpu={gpu_z} cpu={cpu_z}"
+        );
+
+        // X-basis flip on the GPU side <=> Z or Y component on the CPU side.
+        let (gpu_x, cpu_x) = (x_flips[0][q], cpu.contains_z(q) || cpu.contains_y(q));
+        assert_eq!(
+            gpu_x, cpu_x,
+            "fault={fault_kind}@{fault_qubit} seed={seed} N={n} G={gates} X-flip mismatch at q={q}: gpu={gpu_x} cpu={cpu_x}"
+        );
+    }
+}
+
+#[test]
+fn shrink_deterministic_check() {
+    // Replays one fixed 4-gate sequence on ten fresh simulators built with
+    // identical arguments. The flips are printed for inspection, and every
+    // trial must reproduce the flips recorded on the first trial.
+    // Returns without asserting when the GPU simulator cannot be created.
+    let mut baseline: Option<(Vec<bool>, Vec<bool>)> = None;
+    for trial in 0..10 {
+        let Ok(mut gpu) = GpuPauliProp::with_seed(3, 1, 0) else {
+            return;
+        };
+        gpu.inject_x_fault(0);
+        gpu.cz(&[(2, 1)]);
+        gpu.cx(&[(2, 0)]);
+        gpu.cx(&[(1, 2)]);
+        gpu.szdg(&[0]);
+        let zf = gpu.measure_z_flips(&[0, 1, 2]);
+        let xf = gpu.measure_x_flips(&[0, 1, 2]);
+        eprintln!("trial {trial}: zf={:?} xf={:?}", zf[0], xf[0]);
+
+        // Copy shot 0 out as plain bool vectors so trials can be compared.
+        let observed: (Vec<bool>, Vec<bool>) = (
+            zf[0].iter().copied().collect(),
+            xf[0].iter().copied().collect(),
+        );
+        // The first trial stores the baseline; later trials must equal it.
+        let first = baseline.get_or_insert_with(|| observed.clone());
+        assert_eq!(*first, observed, "trial {trial} diverged from the first trial");
+    }
+}
+
+#[test]
+fn trace_after_each_gate() {
+    // Diagnostic trace: applies a fixed four-gate sequence to both simulators
+    // and prints the frames after every gate. Nothing is asserted.
+
+    // Renders a run of flags as a string of '1' / '0' characters.
+    fn bits(flags: impl Iterator<Item = bool>) -> String {
+        flags.map(|set| if set { '1' } else { '0' }).collect()
+    }
+
+    let ops = [Op::Cz(2, 1), Op::Cx(2, 0), Op::Cx(1, 2), Op::Szdg(0)];
+    let mut gpu = match GpuPauliProp::with_seed(3, 1, 0) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let mut cpu = PauliProp::new();
+    gpu.inject_x_fault(0);
+    cpu.track_x(&[0]);
+
+    eprintln!(
+        "START: CPU x={}  y={}  z={}",
+        bits((0..3).map(|q| cpu.contains_x(q))),
+        bits((0..3).map(|q| cpu.contains_y(q))),
+        bits((0..3).map(|q| cpu.contains_z(q))),
+    );
+    let qubits = vec![0usize, 1, 2];
+
+    for (i, &op) in ops.iter().enumerate() {
+        apply_gpu(&mut gpu, op);
+        apply_cpu(&mut cpu, op);
+        let zf = gpu.measure_z_flips(&qubits);
+        let xf = gpu.measure_x_flips(&qubits);
+        eprintln!(
+            "After [{i}] {op:?}: GPU z={}  x={}  | CPU x={}  y={}  z={}",
+            bits(zf[0].iter().copied()),
+            bits(xf[0].iter().copied()),
+            bits((0..3).map(|q| cpu.contains_x(q))),
+            bits((0..3).map(|q| cpu.contains_y(q))),
+            bits((0..3).map(|q| cpu.contains_z(q))),
+        );
+    }
+}
+
+/// Shrink a failing (seed, n, gates) down to the minimum prefix length that fails.
+#[test]
+fn shrink_failing_case() {
+    // Case under study: cross_check_x_faults seed=0 N=3 G=30 fault=x@0.
+    // Prefix lengths 1..=30 of that circuit are replayed on fresh simulators;
+    // the first prefix on which GPU and CPU disagree is dumped before panicking.
+    for gates in 1..=30 {
+        let mut gpu = match GpuPauliProp::with_seed(3, 1, 0) {
+            Ok(sim) => sim,
+            Err(_) => return,
+        };
+        let mut cpu = PauliProp::new();
+        gpu.inject_x_fault(0);
+        cpu.track_x(&[0]);
+
+        // Regenerate the full 30-op circuit, then replay only its prefix.
+        let mut rng = StdRng::seed_from_u64(0);
+        let mut ops: Vec<Op> = Vec::with_capacity(30);
+        for _ in 0..30 {
+            ops.push(gen_op(&mut rng, 3));
+        }
+        let prefix = &ops[..gates];
+        for &op in prefix {
+            apply_gpu(&mut gpu, op);
+            apply_cpu(&mut cpu, op);
+        }
+
+        let qubits = vec![0usize, 1, 2];
+        let zf = gpu.measure_z_flips(&qubits);
+        let xf = gpu.measure_x_flips(&qubits);
+
+        for q in 0..3 {
+            let cpu_zf = cpu.contains_x(q) || cpu.contains_y(q);
+            let cpu_xf = cpu.contains_z(q) || cpu.contains_y(q);
+            let agrees = zf[0][q] == cpu_zf && xf[0][q] == cpu_xf;
+            if agrees {
+                continue;
+            }
+            eprintln!("MISMATCH at gates={gates}, q={q}:");
+            eprintln!("  ops so far:");
+            for (i, &op) in prefix.iter().enumerate() {
+                eprintln!("    [{i}] {op:?}");
+            }
+            eprintln!(
+                "  gpu z_flip[{q}]={} x_flip[{q}]={} cpu: x={} y={} z={}",
+                zf[0][q],
+                xf[0][q],
+                cpu.contains_x(q),
+                cpu.contains_y(q),
+                cpu.contains_z(q)
+            );
+            panic!("divergence at gate {gates}");
+        }
+    }
+}
+
+#[test]
+fn simple_cx_check() {
+    // X fault on qubit 0, then CX(0,1): X_0 -> X_0 X_1, i.e. the frame is XX.
+    // Returns without asserting when the GPU simulator cannot be created.
+    let mut gpu = match GpuPauliProp::with_seed(2, 1, 0) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let mut cpu = PauliProp::new();
+    gpu.inject_x_fault(0);
+    cpu.track_x(&[0]);
+    gpu.cx(&[(0, 1)]);
+    cpu.cx(&[(QubitId(0), QubitId(1))]);
+    let both = [0, 1];
+    let zf = gpu.measure_z_flips(&both);
+    let xf = gpu.measure_x_flips(&both);
+    eprintln!(
+        "After X(0) CX(0,1): gpu z_flip={:?} x_flip={:?} ; cpu x(0)={} x(1)={} z(0)={} z(1)={}",
+        zf[0],
+        xf[0],
+        cpu.contains_x(0),
+        cpu.contains_x(1),
+        cpu.contains_z(0),
+        cpu.contains_z(1)
+    );
+    // Expected for XX: z_flip = [true, true], x_flip = [false, false]; the
+    // assertions compare against whatever the CPU reference reports.
+    for q in 0..2 {
+        let expect_z_flip = cpu.contains_x(q) || cpu.contains_y(q);
+        assert_eq!(zf[0][q], expect_z_flip, "z_flip q={q}");
+        let expect_x_flip = cpu.contains_z(q) || cpu.contains_y(q);
+        assert_eq!(xf[0][q], expect_x_flip, "x_flip q={q}");
+    }
+}
+
+#[test]
+fn simple_cz_check() {
+    // Z fault on qubit 0, then CZ(0,1): Z_0 commutes with CZ, so the frame
+    // stays Z_0. Returns without asserting when the GPU simulator cannot be
+    // created.
+    let mut gpu = match GpuPauliProp::with_seed(2, 1, 0) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let mut cpu = PauliProp::new();
+    gpu.inject_z_fault(0);
+    cpu.track_z(&[0]);
+    gpu.cz(&[(0, 1)]);
+    cpu.cz(&[(QubitId(0), QubitId(1))]);
+    let both = [0, 1];
+    let zf = gpu.measure_z_flips(&both);
+    let xf = gpu.measure_x_flips(&both);
+    eprintln!(
+        "After Z(0) CZ(0,1): gpu z_flip={:?} x_flip={:?} ; cpu x(0)={} x(1)={} z(0)={} z(1)={}",
+        zf[0],
+        xf[0],
+        cpu.contains_x(0),
+        cpu.contains_x(1),
+        cpu.contains_z(0),
+        cpu.contains_z(1)
+    );
+    // The assertions compare against whatever the CPU reference reports.
+    for q in 0..2 {
+        let expect_z_flip = cpu.contains_x(q) || cpu.contains_y(q);
+        assert_eq!(zf[0][q], expect_z_flip, "z_flip q={q}");
+        let expect_x_flip = cpu.contains_z(q) || cpu.contains_y(q);
+        assert_eq!(xf[0][q], expect_x_flip, "x_flip q={q}");
+    }
+}
+
+#[test]
+fn simple_x_cz_check() {
+    // X fault on qubit 0, then CZ(0,1): X_0 -> X_0 Z_1.
+    // Returns without asserting when the GPU simulator cannot be created.
+    let mut gpu = match GpuPauliProp::with_seed(2, 1, 0) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let mut cpu = PauliProp::new();
+    gpu.inject_x_fault(0);
+    cpu.track_x(&[0]);
+    gpu.cz(&[(0, 1)]);
+    cpu.cz(&[(QubitId(0), QubitId(1))]);
+    let both = [0, 1];
+    let zf = gpu.measure_z_flips(&both);
+    let xf = gpu.measure_x_flips(&both);
+    eprintln!(
+        "After X(0) CZ(0,1): gpu z_flip={:?} x_flip={:?} ; cpu x(0)={} x(1)={} z(0)={} z(1)={}",
+        zf[0],
+        xf[0],
+        cpu.contains_x(0),
+        cpu.contains_x(1),
+        cpu.contains_z(0),
+        cpu.contains_z(1)
+    );
+    // Expected for X_0 Z_1: z_flip = [true, false], x_flip = [false, true];
+    // the assertions compare against whatever the CPU reference reports.
+    for q in 0..2 {
+        let expect_z_flip = cpu.contains_x(q) || cpu.contains_y(q);
+        assert_eq!(zf[0][q], expect_z_flip, "z_flip q={q}");
+        let expect_x_flip = cpu.contains_z(q) || cpu.contains_y(q);
+        assert_eq!(xf[0][q], expect_x_flip, "x_flip q={q}");
+    }
+}
+
+#[test]
+fn minimal_single_gate_checks() {
+    // X fault on q=0 followed by H(0): X -> Z, so Z-flip(0) should be FALSE
+    // and X-flip(0) TRUE. Only qubit 0 is asserted, against the CPU reference.
+    let mut gpu = match GpuPauliProp::with_seed(2, 1, 0) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let mut cpu = PauliProp::new();
+    gpu.inject_x_fault(0);
+    cpu.track_x(&[0]);
+    gpu.h(&[0]);
+    cpu.h(&[QubitId(0)]);
+    let both = [0, 1];
+    let zf = gpu.measure_z_flips(&both);
+    let xf = gpu.measure_x_flips(&both);
+    eprintln!(
+        "After X(0) H(0): gpu z_flip={:?} x_flip={:?} ; cpu contains_x(0)={} z(0)={}",
+        zf[0],
+        xf[0],
+        cpu.contains_x(0),
+        cpu.contains_z(0)
+    );
+    let expect_z_flip = cpu.contains_x(0) || cpu.contains_y(0);
+    assert_eq!(zf[0][0], expect_z_flip, "z_flip 0");
+    let expect_x_flip = cpu.contains_z(0) || cpu.contains_y(0);
+    assert_eq!(xf[0][0], expect_x_flip, "x_flip 0");
+}
+
+#[test]
+fn cross_check_x_faults() {
+    // Seeds 0..10, N in {3, 5, 8}, X fault on qubit 0 or 1, 30 gates each.
+    for seed in 0u64..10 {
+        for n in [3usize, 5, 8] {
+            for fq in [0usize, 1] {
+                if fq >= n {
+                    continue;
+                }
+                run_cross_check(seed, n, 30, fq, "x");
+            }
+        }
+    }
+}
+
+#[test]
+fn cross_check_z_faults() {
+    // Seeds 100..110, N in {3, 5, 8}, Z fault on qubit 0 or 1, 30 gates each.
+    for seed in 100u64..110 {
+        for n in [3usize, 5, 8] {
+            for fq in [0usize, 1] {
+                if fq >= n {
+                    continue;
+                }
+                run_cross_check(seed, n, 30, fq, "z");
+            }
+        }
+    }
+}
+
+#[test]
+fn cross_check_y_faults() {
+    // Seeds 200..210, N in {3, 5, 8}, Y fault on qubit 0 or 1, 30 gates each.
+    for seed in 200u64..210 {
+        for n in [3usize, 5, 8] {
+            for fq in [0usize, 1] {
+                if fq >= n {
+                    continue;
+                }
+                run_cross_check(seed, n, 30, fq, "y");
+            }
+        }
+    }
+}
+
+#[test]
+fn cross_check_longer_circuits() {
+    // Stress run: 100-gate circuits on 10 and 16 qubits, seeds 300..305, with
+    // an X fault and then a Z fault on qubit 0 or 3.
+    for seed in 300u64..305 {
+        for n in [10usize, 16] {
+            for fq in [0usize, 3] {
+                if fq >= n {
+                    continue;
+                }
+                for kind in ["x", "z"] {
+                    run_cross_check(seed, n, 100, fq, kind);
+                }
+            }
+        }
+    }
+}
diff --git a/crates/pecos-gpu-sims/tests/stab_extra_audits.rs b/crates/pecos-gpu-sims/tests/stab_extra_audits.rs
new file mode 100644
index 000000000..5adebc4ff
--- /dev/null
+++ b/crates/pecos-gpu-sims/tests/stab_extra_audits.rs
@@ -0,0 +1,316 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Audits for `GpuStab` compile-circuit path and `GpuStab` / `GpuStabMulti`
+//! mid-circuit measurement queues against CPU `SparseStab` reference.
+
+use pecos_core::QubitId;
+use pecos_gpu_sims::{CompiledGate, DefaultGpuStab, GateType, GpuStabMulti};
+use pecos_random::PecosRng;
+use pecos_simulators::{CliffordGateable, QuantumSimulator, SparseStab};
+
+// ---------------------------------------------------------------------------
+// compile_circuit replay: compiled dispatch should match normal per-gate
+// dispatch exactly on the same deterministic circuit.
+// ---------------------------------------------------------------------------
+
+/// Build an equivalent (`CompiledGate`, trait-call) pair for each op so we can
+/// apply the same circuit through either dispatch path.
+#[derive(Clone, Copy)]
+#[allow(dead_code)] // X and Cz reserved for expanded test coverage
+enum Op {
+    H(u32),
+    X(u32),
+    S(u32), // issued as `sz` on the trait-call path
+    Cx(u32, u32), // (control, target)
+    Cz(u32, u32), // (control, target)
+}
+
+fn to_compiled(op: Op) -> CompiledGate {
+    // Controlled gates are assembled as struct literals, with the control
+    // qubit stored as `Some(..)`.
+    let controlled = |gate_type: GateType, control: u32, target: u32| CompiledGate {
+        gate_type,
+        target,
+        control: Some(control),
+    };
+    match op {
+        Op::H(qubit) => CompiledGate::h(qubit),
+        Op::X(qubit) => CompiledGate::x(qubit),
+        Op::S(qubit) => CompiledGate::s(qubit),
+        Op::Cx(control, target) => controlled(GateType::Cx, control, target),
+        Op::Cz(control, target) => controlled(GateType::Cz, control, target),
+    }
+}
+
+fn apply_trait<S: CliffordGateable>(sim: &mut S, op: Op) {
+    // Widens each u32 index to a usize-backed `QubitId` and issues the
+    // matching trait call; whatever the call returns is discarded.
+    let id = |index: u32| QubitId(index as usize);
+    match op {
+        Op::H(q) => _ = sim.h(&[id(q)]),
+        Op::X(q) => _ = sim.x(&[id(q)]),
+        Op::S(q) => _ = sim.sz(&[id(q)]),
+        Op::Cx(c, t) => _ = sim.cx(&[(id(c), id(t))]),
+        Op::Cz(c, t) => _ = sim.cz(&[(id(c), id(t))]),
+    }
+}
+
+#[test]
+fn compile_circuit_matches_normal_path() {
+    // Runs one circuit through the compiled path and through per-gate trait
+    // calls on two simulators created with the same arguments, then requires
+    // identical Z-measurement outcomes. Returns without asserting when a
+    // simulator cannot be created.
+    let Ok(mut gpu_compiled) = DefaultGpuStab::with_seed(6, 42) else {
+        return;
+    };
+    let Ok(mut gpu_normal) = DefaultGpuStab::with_seed(6, 42) else {
+        return;
+    };
+
+    // Three Bell-like pairs, with S applied to one half of two of them.
+    // NOTE(review): Z outcomes of such pairs are not fixed by the state, so
+    // the agreement below presumably relies on both simulators sharing seed
+    // 42 and drawing randomness identically -- confirm.
+    let ops = [
+        Op::H(0),
+        Op::Cx(0, 1),
+        Op::H(2),
+        Op::Cx(2, 3),
+        Op::H(4),
+        Op::Cx(4, 5),
+        Op::S(1),
+        Op::S(3),
+    ];
+
+    let compiled_gates: Vec<CompiledGate> = ops.iter().copied().map(to_compiled).collect();
+    let hash = gpu_compiled.compile_circuit(&compiled_gates);
+    gpu_compiled.execute_compiled_wait(hash);
+
+    for op in ops {
+        apply_trait(&mut gpu_normal, op);
+    }
+    gpu_normal.sync_wait();
+
+    let all_qubits = (0..6).map(QubitId).collect::<Vec<_>>();
+    let results_compiled = gpu_compiled.mz(&all_qubits);
+    let results_normal = gpu_normal.mz(&all_qubits);
+    // `zip` stops at the shorter side, so the counts are compared first.
+    assert_eq!(
+        results_compiled.iter().count(),
+        results_normal.iter().count(),
+        "compile_circuit vs normal mz returned different result counts"
+    );
+    for (a, b) in results_compiled.iter().zip(results_normal.iter()) {
+        assert_eq!(a.outcome, b.outcome, "compile_circuit vs normal mz differ");
+    }
+}
+
+#[test]
+fn compile_circuit_cached_second_call() {
+    // Two compilations of one gate list must yield equal hashes, and the
+    // list must then be reported as compiled. Returns without asserting when
+    // the simulator cannot be created.
+    let mut gpu = match DefaultGpuStab::with_seed(4, 7) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let hadamard = CompiledGate::h(0);
+    let entangle = CompiledGate {
+        gate_type: GateType::Cx,
+        target: 1,
+        control: Some(0),
+    };
+    let gates = vec![hadamard, entangle];
+    let first_hash = gpu.compile_circuit(&gates);
+    let second_hash = gpu.compile_circuit(&gates);
+    assert_eq!(first_hash, second_hash);
+    assert!(gpu.is_circuit_compiled(&gates));
+}
+
+#[test]
+fn compile_circuit_matches_cpu_ghz() {
+    // GHZ preparation: H on qubit 0, then a CX chain q -> q + 1. The shared
+    // value is not fixed in advance, so the GPU (compiled path) and the CPU
+    // reference are each only required to report one common outcome across
+    // all of their own qubits; the two values are not compared to each other.
+    let n: u32 = 5;
+    let mut gates: Vec<CompiledGate> = vec![CompiledGate::h(0)];
+    for q in 0..n - 1 {
+        gates.push(CompiledGate {
+            gate_type: GateType::Cx,
+            target: q + 1,
+            control: Some(q),
+        });
+    }
+
+    let mut gpu = match DefaultGpuStab::with_seed(n as usize, 99) {
+        Ok(sim) => sim,
+        Err(_) => return,
+    };
+    let hash = gpu.compile_circuit(&gates);
+    gpu.execute_compiled_wait(hash);
+
+    let mut cpu = SparseStab::new(n as usize);
+    cpu.h(&[QubitId(0)]);
+    for q in 0..n - 1 {
+        let (control, target) = (q as usize, (q + 1) as usize);
+        cpu.cx(&[(QubitId(control), QubitId(target))]);
+    }
+
+    // Measure every qubit on both simulators before checking either side.
+    let every_qubit = (0..n as usize).map(QubitId).collect::<Vec<_>>();
+    let gpu_results = gpu.mz(&every_qubit);
+    let cpu_results = cpu.mz(&every_qubit);
+    let gpu_val = gpu_results[0].outcome;
+    gpu_results
+        .iter()
+        .for_each(|r| assert_eq!(r.outcome, gpu_val, "GHZ: GPU qubits disagree"));
+    let cpu_val = cpu_results[0].outcome;
+    cpu_results
+        .iter()
+        .for_each(|r| assert_eq!(r.outcome, cpu_val, "GHZ: CPU qubits disagree"));
+}
+
+// ---------------------------------------------------------------------------
+// GpuStab mz_queue / mz_fetch mid-circuit measurement
+// ---------------------------------------------------------------------------
+
+#[test]
+fn stab_mz_queue_fetch_matches_direct_mz() {
+    // For deterministic Clifford circuits, mz_queue + more gates + mz_fetch
+    // should produce outcomes identical to calling mz() at each intermediate
+    // point.
+    let Ok(mut gpu_queue) = DefaultGpuStab::with_seed(4, 11) else {
+        return;
+    };
+    let Ok(mut gpu_direct) = DefaultGpuStab::with_seed(4, 11) else {
+        return;
+    };
+
+    // Prepare |0101> by applying X on qubits 1, 3. Measure.
+    // Then apply X on qubit 0 and measure again.
+    // Both outcomes forced by state.
+    gpu_queue.x(&[QubitId(1), QubitId(3)]);
+    gpu_queue.mz_queue(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    gpu_queue.x(&[QubitId(0)]);
+    gpu_queue.mz_queue(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    let all = gpu_queue.mz_fetch();
+    assert_eq!(all.len(), 8);
+    let first_round: Vec<bool> = all[..4].iter().map(|r| r.outcome).collect();
+    let second_round: Vec<bool> = all[4..].iter().map(|r| r.outcome).collect();
+    assert_eq!(first_round, vec![false, true, false, true]);
+    assert_eq!(second_round, vec![true, true, false, true]);
+
+    // Compare with direct-mz reference.
+    gpu_direct.x(&[QubitId(1), QubitId(3)]);
+    let first_direct = gpu_direct.mz(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    gpu_direct.x(&[QubitId(0)]);
+    let second_direct = gpu_direct.mz(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    let first_direct: Vec<bool> = first_direct.iter().map(|r| r.outcome).collect();
+    let second_direct: Vec<bool> = second_direct.iter().map(|r| r.outcome).collect();
+    assert_eq!(first_round, first_direct);
+    assert_eq!(second_round, second_direct);
+}
+
+// ---------------------------------------------------------------------------
+// GpuStabMulti mz_queue / mz_fetch
+// ---------------------------------------------------------------------------
+
+#[test]
+fn stab_multi_fresh_state_mz_queue_all_zero() {
+    // Absolute simplest: fresh |0000> via constructor, mz_queue immediately.
+    let shots = 4;
+    let Ok(mut gpu) = GpuStabMulti::<PecosRng>::with_seed(4, shots, 42) else {
+        return;
+    };
+    gpu.mz_queue(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    let all = gpu.mz_fetch();
+    for (shot, row) in all.iter().enumerate() {
+        assert_eq!(
+            *row,
+            vec![false, false, false, false],
+            "shot {shot}: fresh |0000> mz_queue should be all-false, got {row:?}"
+        );
+    }
+}
+
+#[test]
+fn stab_multi_x_then_mz_queue() {
+    // Simplest per-qubit determinism: X(q), mz_queue(q). Should be true.
+    let shots = 4;
+    for q in 0..4 {
+        let Ok(mut gpu) = GpuStabMulti::<PecosRng>::with_seed(4, shots, 42) else {
+            return;
+        };
+        gpu.x(&[QubitId(q)]);
+        gpu.mz_queue(&[QubitId(q)]);
+        let all = gpu.mz_fetch();
+        for (shot, row) in all.iter().enumerate() {
+            assert_eq!(
+                *row,
+                vec![true],
+                "shot {shot}: X({q}) mz_queue({q}) should be true, got {row:?}"
+            );
+        }
+    }
+}
+
+#[test]
+fn stab_multi_mz_queue_deterministic_shots() {
+    // Deterministic circuit: all 32 shots must agree.
+    let shots = 32;
+    let Ok(mut gpu) = GpuStabMulti::<PecosRng>::with_seed(4, shots, 42) else {
+        return;
+    };
+    gpu.x(&[QubitId(1), QubitId(3)]);
+    gpu.mz_queue(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    gpu.x(&[QubitId(0)]);
+    gpu.mz_queue(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+    let all = gpu.mz_fetch();
+    assert_eq!(all.len(), shots);
+    for row in &all {
+        assert_eq!(row.len(), 8, "8 queued measurements per shot");
+        let expected = vec![false, true, false, true, true, true, false, true];
+        assert_eq!(*row, expected);
+    }
+}
+
+#[test]
+fn stab_multi_reset_clears_queue() {
+    // After reset the queue must be empty: a stale entry would lengthen each row.
+    let Ok(mut gpu) = GpuStabMulti::<PecosRng>::with_seed(3, 4, 42) else {
+        return;
+    };
+    gpu.x(&[QubitId(0)]);
+    gpu.mz_queue(&[QubitId(0)]);
+    gpu.reset();
+    // Fresh state: mz_queue on fresh |000> then fetch should be all zeros.
+    gpu.mz_queue(&[QubitId(0), QubitId(1), QubitId(2)]);
+    let all = gpu.mz_fetch();
+    for row in &all {
+        assert_eq!(
+            *row,
+            vec![false, false, false],
+            "reset should clear residual queue and state"
+        );
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Reset semantics: multiple reuse cycles on the same GpuStab should not leak
+// state between runs. (The circuit below is deterministic, so RNG reuse is
+// not exercised here.)
+
+#[test]
+fn stab_reset_reuse_deterministic() {
+    let Ok(mut gpu) = DefaultGpuStab::with_seed(4, 42) else {
+        return;
+    };
+    for cycle in 0..5 {
+        // Each cycle: reset, apply X(2), measure all.
+        gpu.reset();
+        gpu.x(&[QubitId(2)]);
+        let r = gpu.mz(&[QubitId(0), QubitId(1), QubitId(2), QubitId(3)]);
+        let outs: Vec<bool> = r.iter().map(|x| x.outcome).collect();
+        assert_eq!(
+            outs,
+            vec![false, false, true, false],
+            "cycle {cycle}: reset+X(2) should give |0010>"
+        );
+    }
+}
diff --git a/crates/pecos-hugr/Cargo.toml b/crates/pecos-hugr/Cargo.toml
index 3f842f411..117ea07e8 100644
--- a/crates/pecos-hugr/Cargo.toml
+++ b/crates/pecos-hugr/Cargo.toml
@@ -44,7 +44,7 @@ tempfile.workspace = true
 # For creating test HUGRs from DagCircuit
 pecos-quantum = { workspace = true, features = ["hugr"] }
 # For integration testing with quantum simulator
-pecos-quest.workspace = true
+pecos-simulators.workspace = true
 # For compiling WAT to WASM in tests
 wat.workspace = true
 
diff --git a/crates/pecos-hugr/src/engine.rs b/crates/pecos-hugr/src/engine.rs
index efd11c2a5..f1091f6ed 100644
--- a/crates/pecos-hugr/src/engine.rs
+++ b/crates/pecos-hugr/src/engine.rs
@@ -2648,10 +2648,10 @@ mod tests {
     // --- Integration Tests with Quantum Simulator ---
 
     #[test]
-    fn test_bell_state_with_quest() {
-        // Test HugrEngine with Quest quantum simulator for a Bell state circuit
+    fn test_bell_state_with_statevec() {
+        // Test HugrEngine with PECOS DenseStateVecEngine for a Bell state circuit
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -2662,10 +2662,10 @@ mod tests {
         let num_qubits = hugr_engine.num_qubits();
         println!("Bell state HUGR has {num_qubits} qubits");
 
-        // Create HybridEngine with HugrEngine and Quest
+        // Create HybridEngine with HugrEngine and DenseStateVecEngine
         let mut hybrid = HybridEngineBuilder::new()
             .with_classical_engine(Box::new(hugr_engine))
-            .with_quantum_engine(Box::new(QuestStateVecEngine::new(num_qubits)))
+            .with_quantum_engine(Box::new(DenseStateVecEngine::new(num_qubits)))
             .build();
 
         // Set seed for reproducibility
@@ -2690,10 +2690,10 @@ mod tests {
     }
 
     #[test]
-    fn test_simple_hadamard_with_quest() {
-        // Test a simple Hadamard + measure circuit with Quest
+    fn test_simple_hadamard_with_statevec() {
+        // Test a simple Hadamard + measure circuit with DenseStateVecEngine
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -2707,7 +2707,7 @@ mod tests {
         // Create HybridEngine
         let mut hybrid = HybridEngineBuilder::new()
             .with_classical_engine(Box::new(hugr_engine))
-            .with_quantum_engine(Box::new(QuestStateVecEngine::new(num_qubits)))
+            .with_quantum_engine(Box::new(DenseStateVecEngine::new(num_qubits)))
             .build();
 
         hybrid.set_seed(42);
@@ -2741,10 +2741,10 @@ mod tests {
     }
 
     #[test]
-    fn test_conditional_with_quest() {
+    fn test_conditional_with_statevec() {
         // Test conditional circuit with real quantum simulation
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -2760,7 +2760,7 @@ mod tests {
         // Create HybridEngine - use more qubits in case HUGR structure differs
         let mut hybrid = HybridEngineBuilder::new()
             .with_classical_engine(Box::new(hugr_engine))
-            .with_quantum_engine(Box::new(QuestStateVecEngine::new(4))) // Use 4 qubits to be safe
+            .with_quantum_engine(Box::new(DenseStateVecEngine::new(4))) // Use 4 qubits to be safe
             .build();
 
         hybrid.set_seed(42);
@@ -3067,8 +3067,8 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_simple_conditional_with_quest() {
-        // Test the simple conditional circuit with Quest simulation
+    fn test_simple_conditional_with_statevec() {
+        // Test the simple conditional circuit with DenseStateVecEngine
         // Circuit: H(q0), measure(q0), if result=1: X(q1), measure(q1)
         //
         // Expected behavior:
@@ -3078,7 +3078,7 @@ mod tests {
         //   - If m0=1: X applied, so m1=1
         // Key invariant: m0 == m1 for every shot
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3097,7 +3097,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3164,8 +3164,8 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_conditional_branch_with_quest() {
-        // Test the conditional branch circuit with Quest simulation
+    fn test_conditional_branch_with_statevec() {
+        // Test the conditional branch circuit with DenseStateVecEngine
         // Circuit: measure(q0), if m0=0: H(q1), else: X(q1), measure(q1)
         //
         // Expected behavior:
@@ -3173,7 +3173,7 @@ mod tests {
         // - Second measurement (m1): 50/50 (H applied since m0=0)
         // Key invariant: m0 is always 0
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3193,7 +3193,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3258,8 +3258,8 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_conditional_h_with_quest() {
-        // Test the conditional H circuit with Quest simulation
+    fn test_conditional_h_with_statevec() {
+        // Test the conditional H circuit with DenseStateVecEngine
         // Circuit: H(control), measure(control), if control=1: H(result), measure(result)
         //
         // Expected behavior:
@@ -3269,7 +3269,7 @@ mod tests {
         //   - If control=1: result is 50/50 (H applied)
         // Key invariant: when control=0, result must be 0
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3289,7 +3289,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3413,8 +3413,8 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_while_loop_with_quest() {
-        // Test the while loop circuit with Quest simulation
+    fn test_while_loop_with_statevec() {
+        // Test the while loop circuit with DenseStateVecEngine
         // Circuit: while not result: q=qubit(), H(q), result=measure(q)
         //
         // Expected behavior:
@@ -3423,7 +3423,7 @@ mod tests {
         // - Final result is always True (1) since that's the exit condition
         use pecos_engines::ControlEngine;
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3481,7 +3481,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3531,14 +3531,14 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_function_call_with_quest() {
-        // Test function call circuit with Quest simulation
+    fn test_function_call_with_statevec() {
+        // Test function call circuit with DenseStateVecEngine
         // Circuit: q = qubit(), q = apply_h(q), measure(q)
         // where apply_h applies H gate
         //
         // Expected behavior: 50/50 measurement outcome
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3576,7 +3576,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3649,11 +3649,11 @@ mod tests {
 
     #[test]
     #[allow(clippy::too_many_lines, clippy::cast_sign_loss)]
-    fn test_multiple_function_calls_with_quest() {
+    fn test_multiple_function_calls_with_statevec() {
         // Test multiple function calls: apply_h to two qubits
         // Expected: both measurements are 50/50 independent
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3704,7 +3704,7 @@ mod tests {
 
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3800,11 +3800,11 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_nested_function_calls_with_quest() {
+    fn test_nested_function_calls_with_statevec() {
         // Test nested function calls: main -> outer_func -> inner_h
         // Expected: 50/50 measurement outcome
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3829,7 +3829,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
@@ -3907,11 +3907,11 @@ mod tests {
 
     #[test]
     #[allow(clippy::cast_sign_loss)]
-    fn test_multi_qubit_function_with_quest() {
+    fn test_multi_qubit_function_with_statevec() {
         // Test multi-qubit function: apply_cx creates Bell state
         // Expected: measurements are correlated (00 or 11, never 01 or 10)
         use pecos_engines::hybrid::HybridEngineBuilder;
-        use pecos_quest::QuestStateVecEngine;
+        use pecos_engines::quantum::DenseStateVecEngine;
 
         let hugr_path = concat!(
             env!("CARGO_MANIFEST_DIR"),
@@ -3941,7 +3941,7 @@ mod tests {
             let hugr_engine = HugrEngine::from_file(hugr_path).expect("Failed to load HUGR");
             let mut hybrid = HybridEngineBuilder::new()
                 .with_classical_engine(Box::new(hugr_engine))
-                .with_quantum_engine(Box::new(QuestStateVecEngine::new(estimated_qubits)))
+                .with_quantum_engine(Box::new(DenseStateVecEngine::new(estimated_qubits)))
                 .build();
 
             hybrid.set_seed(shot_num as u64);
diff --git a/crates/pecos-quest/Cargo.toml b/crates/pecos-quest/Cargo.toml
deleted file mode 100644
index 70402901e..000000000
--- a/crates/pecos-quest/Cargo.toml
+++ /dev/null
@@ -1,43 +0,0 @@
-[package]
-name = "pecos-quest"
-version.workspace = true
-edition.workspace = true
-readme = "README.md"
-authors.workspace = true
-homepage.workspace = true
-repository.workspace = true
-license.workspace = true
-keywords.workspace = true
-categories.workspace = true
-description = "QuEST quantum simulator wrapper for PECOS"
-
-[features]
-default = ["cpu"]
-cpu = []
-
-[dependencies]
-thiserror.workspace = true
-cxx.workspace = true
-pecos-core.workspace = true
-pecos-simulators.workspace = true
-pecos-engines.workspace = true
-pecos-random.workspace = true
-num-complex.workspace = true
-rand.workspace = true
-rand_core.workspace = true
-libloading.workspace = true
-log.workspace = true
-dirs.workspace = true
-
-[build-dependencies]
-pecos-build.workspace = true
-cxx-build.workspace = true
-log.workspace = true
-env_logger.workspace = true
-dirs.workspace = true
-
-[dev-dependencies]
-pecos-num.workspace = true
-
-[lints]
-workspace = true
diff --git a/crates/pecos-quest/README.md b/crates/pecos-quest/README.md
deleted file mode 100644
index 32d00a4fc..000000000
--- a/crates/pecos-quest/README.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# pecos-quest
-
-Rust wrapper for the QuEST quantum simulator, implementing PECOS quantum simulator traits.
-
-## Features
-
-- **Dual Simulation Modes**:
-  - `QuestStateVec`: Pure state vector simulation
-  - `QuestDensityMatrix`: Mixed state density matrix simulation
-- **PECOS Compatible**: Drop-in replacement for other PECOS simulators
-- **Thread Safe**: Independent instances for parallel Monte Carlo simulations
-- **Automatic Build**: QuEST v4.0.0 is downloaded and built automatically
-
-## Quick Start
-
-```rust
-use pecos_quest::{QuestStateVec, CliffordGateable};
-
-// Create a 2-qubit simulator
-let mut state = QuestStateVec::new(2);
-
-// Create Bell state |Φ+⟩ = (|00⟩ + |11⟩)/√2
-state.h(0).cx(0, 1);
-
-// Measure qubit 0
-let result = state.mz(0);
-println!("Measured: {}", result.outcome);
-```
-
-## Density Matrix Simulation
-
-```rust
-use pecos_quest::{QuestDensityMatrix, CliffordGateable};
-
-// Create mixed state simulator
-let mut state = QuestDensityMatrix::new(2);
-
-// Apply operations
-state.h(0).cx(0, 1);
-```
-
-## Parallel Execution
-
-Each simulator instance is independent, perfect for Monte Carlo simulations:
-
-```rust
-use std::thread;
-use pecos_quest::{QuestStateVec, CliffordGateable};
-
-let handles: Vec<_> = (0..4).map(|id| {
-    thread::spawn(move || {
-        let mut state = QuestStateVec::with_seed(2, id);
-        // Each thread runs independently
-        state.h(0).cx(0, 1);
-        state.mz(0)
-    })
-}).collect();
-```
-
-## Building
-
-```bash
-# Build
-cargo build --package pecos-quest
-
-# Test
-cargo test --package pecos-quest
-
-# Run example
-cargo run --package pecos-quest --example bell_state
-```
-
-### Requirements
-- C++ compiler with C++14 support
-- Internet connection for first build (to download QuEST)
-
-## API Compatibility
-
-Implements standard PECOS traits:
-- `QuantumSimulator`
-- `CliffordGateable`
-- `ArbitraryRotationGateable`
-- `RngManageable`
-
-## Acknowledgements
-
-This crate wraps [QuEST](https://github.com/QuEST-Kit/QuEST) (Quantum Exact Simulation Toolkit), developed by the QuEST-Kit team at the University of Oxford.
-
-**Paper:**
-- Jones, T., Brown, A., Bush, I., & Benjamin, S. C. (2019). "QuEST and High Performance Simulation of Quantum Computers." Scientific Reports, 9, 10736. [arXiv:1802.08032](https://arxiv.org/abs/1802.08032)
-
-## License
-
-Apache-2.0 (PECOS project license). QuEST is MIT licensed.
diff --git a/crates/pecos-quest/build.rs b/crates/pecos-quest/build.rs
deleted file mode 100644
index 93ec1b472..000000000
--- a/crates/pecos-quest/build.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-//! Build script for pecos-quest
-
-mod build_quest;
-
-fn main() {
-    // Initialize logger for build script
-    env_logger::init();
-
-    // Build QuEST (download handled inside build_quest)
-    build_quest::build().expect("QuEST build failed");
-}
diff --git a/crates/pecos-quest/build_quest.rs b/crates/pecos-quest/build_quest.rs
deleted file mode 100644
index 6faf427a6..000000000
--- a/crates/pecos-quest/build_quest.rs
+++ /dev/null
@@ -1,878 +0,0 @@
-//! Build script for `QuEST` integration
-//!
-//! This build script produces:
-//! 1. A static library (libquest-bridge.a) for CPU-only `QuEST` operations
-//! 2. Optionally, a shared library (`libpecos_quest_cuda.so`) for CUDA operations (when cuda feature enabled)
-//!
-//! The CUDA library is loaded at runtime via dlopen, allowing a single binary to work
-//! on systems with and without CUDA installed.
-
-use log::{debug, info};
-use pecos_build::{Manifest, Result, ensure_dep_ready, report_cache_config};
-use std::env;
-use std::fs;
-use std::path::{Path, PathBuf};
-use std::process::Command;
-
-/// Detect CUDA installation using nvcc command
-/// Returns the CUDA installation path if found
-///
-/// Search order:
-/// 1. `~/.pecos/deps/cuda/` (new local installation via pecos install cuda)
-/// 2. `~/.pecos/cuda/` (legacy path)
-/// 3. `CUDA_PATH` environment variable
-/// 4. `nvcc` in PATH
-/// 5. Standard system paths
-fn detect_cuda_path() -> Option<String> {
-    // 1-2. Check ~/.pecos/deps/cuda/ and legacy ~/.pecos/cuda/
-    if let Some(home) = dirs::home_dir() {
-        let paths = [
-            home.join(".pecos").join("deps").join("cuda"),
-            home.join(".pecos").join("cuda"),
-        ];
-        for pecos_cuda in paths {
-            let nvcc_path = pecos_cuda.join("bin").join("nvcc");
-            if nvcc_path.exists() {
-                info!(
-                    "Found CUDA in {} (installed via pecos)",
-                    pecos_cuda.display()
-                );
-                return Some(pecos_cuda.to_string_lossy().to_string());
-            }
-        }
-    }
-
-    // 2. Check environment variables
-    if let Ok(cuda_path) = env::var("CUDA_PATH") {
-        info!("Found CUDA via CUDA_PATH: {cuda_path}");
-        return Some(cuda_path);
-    }
-
-    // 3. Try to find nvcc in PATH
-    if let Ok(nvcc_output) = Command::new("nvcc").arg("--version").output()
-        && nvcc_output.status.success()
-    {
-        // Try to get CUDA path from nvcc location using 'which nvcc'
-        if let Ok(which_output) = Command::new("which").arg("nvcc").output()
-            && which_output.status.success()
-        {
-            let nvcc_path = String::from_utf8_lossy(&which_output.stdout)
-                .trim()
-                .to_string();
-            // nvcc is typically at /usr/local/cuda[-version]/bin/nvcc
-            // We want /usr/local/cuda[-version]
-            let path = Path::new(&nvcc_path);
-            if let Some(bin_dir) = path.parent()
-                && let Some(cuda_root) = bin_dir.parent()
-            {
-                info!("Found CUDA via nvcc in PATH: {}", cuda_root.display());
-                return Some(cuda_root.to_string_lossy().to_string());
-            }
-        }
-    }
-
-    // 4. Fallback to checking standard installation paths
-    // Check symlinks first, then specific versions
-    for path in &[
-        "/usr/local/cuda",      // Common symlink
-        "/usr/local/cuda-13",   // Version symlink
-        "/usr/local/cuda-13.0", // Specific CUDA 13.0
-        "/usr/local/cuda-13.1", // Specific CUDA 13.1
-        "/usr/local/cuda-12",   // Version symlink
-        "/usr/local/cuda-12.0", // Specific CUDA 12.0
-        "/usr/local/cuda-11",   // Version symlink
-        "/usr/local/cuda-11.0", // Specific CUDA 11.0
-    ] {
-        if Path::new(path).exists() {
-            info!("Found CUDA at standard path: {path}");
-            return Some((*path).to_string());
-        }
-    }
-
-    None
-}
-
-/// Build the GPU shared library (`libpecos_quest_cuda.so`)
-///
-/// This library contains the GPU-accelerated `QuEST` implementation and is loaded
-/// at runtime via dlopen. This allows the main library to work on systems without CUDA.
-#[allow(clippy::too_many_lines)]
-fn build_gpu_shared_library(cuda_path: &str, quest_dir: &Path, out_dir: &Path) -> Option<PathBuf> {
-    info!("Building GPU shared library (libpecos_quest_cuda.so)...");
-
-    // nvcc executable name differs by platform
-    let nvcc_name = if cfg!(target_os = "windows") {
-        "nvcc.exe"
-    } else {
-        "nvcc"
-    };
-    let nvcc_path = Path::new(cuda_path).join("bin").join(nvcc_name);
-    info!("Using nvcc at: {}", nvcc_path.display());
-    let quest_include_dir = quest_dir.join("include");
-    let quest_src_dir = quest_dir.join("src");
-    let gpu_dir = quest_src_dir.join("gpu");
-
-    // Source files for the GPU library
-    let bridge_gpu = PathBuf::from("src/bridge_cuda.cpp");
-    let gpu_config = gpu_dir.join("gpu_config.cpp");
-    let gpu_subroutines = gpu_dir.join("gpu_subroutines.cpp");
-
-    // QuEST core files needed by the GPU library
-    let api_dir = quest_src_dir.join("api");
-    let core_dir = quest_src_dir.join("core");
-    let cpu_dir = quest_src_dir.join("cpu");
-    let comm_dir = quest_src_dir.join("comm");
-
-    // Collect all source files
-    let source_files = vec![
-        bridge_gpu,
-        gpu_config,
-        gpu_subroutines,
-        // API layer
-        api_dir.join("calculations.cpp"),
-        api_dir.join("channels.cpp"),
-        api_dir.join("debug.cpp"),
-        api_dir.join("decoherence.cpp"),
-        api_dir.join("environment.cpp"),
-        api_dir.join("initialisations.cpp"),
-        api_dir.join("matrices.cpp"),
-        api_dir.join("modes.cpp"),
-        api_dir.join("operations.cpp"),
-        api_dir.join("paulis.cpp"),
-        api_dir.join("qureg.cpp"),
-        api_dir.join("types.cpp"),
-        api_dir.join("multiplication.cpp"),
-        api_dir.join("trotterisation.cpp"),
-        // Core utilities
-        core_dir.join("errors.cpp"),
-        core_dir.join("utilities.cpp"),
-        core_dir.join("validation.cpp"),
-        core_dir.join("memory.cpp"),
-        core_dir.join("printer.cpp"),
-        core_dir.join("randomiser.cpp"),
-        core_dir.join("parser.cpp"),
-        core_dir.join("localiser.cpp"),
-        core_dir.join("autodeployer.cpp"),
-        core_dir.join("accelerator.cpp"),
-        core_dir.join("envvars.cpp"),
-        core_dir.join("paulilogic.cpp"),
-        // CPU backend (still needed for some operations)
-        cpu_dir.join("cpu_config.cpp"),
-        cpu_dir.join("cpu_subroutines.cpp"),
-        // Communication
-        comm_dir.join("comm_config.cpp"),
-        comm_dir.join("comm_routines.cpp"),
-    ];
-
-    // Compile all source files to object files
-    let mut object_files = Vec::new();
-    for src_file in &source_files {
-        let file_stem = src_file.file_stem()?.to_str()?;
-        // Windows uses .obj extension, Unix uses .o
-        let obj_ext = if cfg!(target_os = "windows") {
-            "obj"
-        } else {
-            "o"
-        };
-        let obj_file = out_dir.join(format!("gpu_{file_stem}.{obj_ext}"));
-
-        debug!("Compiling for GPU lib: {}", src_file.display());
-        let mut compile_cmd = Command::new(&nvcc_path);
-        compile_cmd
-            .arg("-c")
-            .arg(src_file)
-            .arg("-o")
-            .arg(&obj_file)
-            .arg("-x")
-            .arg("cu") // Treat .cpp files as CUDA source
-            .arg("-I")
-            .arg(&quest_include_dir)
-            .arg("-I")
-            .arg(&quest_src_dir)
-            .arg("-I")
-            .arg(quest_dir.parent()?)
-            .arg("-I")
-            .arg("include") // For quest_ffi.h
-            .arg("--std=c++20")
-            .arg("-DCOMPILE_GPU=1")
-            .arg("-DCOMPILE_CUDA=1")
-            .arg("-DCOMPILE_CPU=1")
-            .arg("-DCOMPILE_OPENMP=0")
-            .arg("-DCOMPILE_MPI=0")
-            .arg("-DCOMPILE_CUQUANTUM=0")
-            .arg("-DFLOAT_PRECISION=2")
-            // Target compute capability 7.5 (Turing) which supports atomicAdd(double*, double)
-            // sm_75 is the minimum supported by both CUDA 12.x and 13.x
-            .arg("-arch=sm_75")
-            // Allow newer GCC versions (e.g., GCC 14 in manylinux_2_28)
-            .arg("-allow-unsupported-compiler");
-
-        // Platform-specific compiler flags
-        if cfg!(target_os = "windows") {
-            // Windows/MSVC: no -fPIC needed (not applicable)
-            // Use /EHsc for C++ exception handling
-            compile_cmd.arg("-Xcompiler").arg("/EHsc");
-        } else {
-            // Unix: position-independent code for shared libraries
-            compile_cmd.arg("-Xcompiler").arg("-fPIC");
-        }
-
-        let output = compile_cmd.output().ok()?;
-
-        if !output.status.success() {
-            let stdout_str = String::from_utf8_lossy(&output.stdout);
-            let stderr_str = String::from_utf8_lossy(&output.stderr);
-            eprintln!(
-                "ERROR: Failed to compile {} for GPU library",
-                src_file.display()
-            );
-            eprintln!("Exit status: {:?}", output.status);
-            if !stdout_str.is_empty() {
-                eprintln!("stdout:\n{stdout_str}");
-            }
-            if !stderr_str.is_empty() {
-                eprintln!("stderr:\n{stderr_str}");
-            }
-            return None;
-        }
-
-        object_files.push(obj_file);
-    }
-
-    // Link into a shared library
-    let lib_name = if cfg!(target_os = "macos") {
-        "libpecos_quest_cuda.dylib"
-    } else if cfg!(target_os = "windows") {
-        "pecos_quest_cuda.dll"
-    } else {
-        "libpecos_quest_cuda.so"
-    };
-
-    let gpu_lib_path = out_dir.join(lib_name);
-
-    info!("Linking GPU shared library: {}", gpu_lib_path.display());
-
-    let mut link_cmd = Command::new(&nvcc_path);
-    link_cmd
-        .arg("-shared")
-        .arg("-o")
-        .arg(&gpu_lib_path)
-        .args(&object_files);
-
-    // Platform-specific library paths and linking
-    if cfg!(target_os = "windows") {
-        // Windows: CUDA libraries are in lib\x64
-        link_cmd
-            .arg(format!("-L{cuda_path}/lib/x64"))
-            .arg("-lcudart")
-            .arg("-lcublas");
-        // Windows uses MSVC runtime, no need to explicitly link C++ stdlib
-    } else {
-        // Unix: CUDA libraries are in lib64
-        link_cmd
-            .arg(format!("-L{cuda_path}/lib64"))
-            .arg("-lcudart")
-            .arg("-lcublas");
-        // Add C++ standard library
-        if cfg!(target_os = "macos") {
-            link_cmd.arg("-lc++");
-        } else {
-            link_cmd.arg("-lstdc++");
-        }
-    }
-
-    let output = link_cmd.output().ok()?;
-
-    if !output.status.success() {
-        let stderr_str = String::from_utf8_lossy(&output.stderr);
-        eprintln!("ERROR: Failed to link GPU shared library");
-        eprintln!("{stderr_str}");
-        return None;
-    }
-
-    info!(
-        "Successfully built GPU shared library: {}",
-        gpu_lib_path.display()
-    );
-
-    // Also copy to target directory for easier discovery
-    // Try CARGO_TARGET_DIR first, then derive from OUT_DIR
-    let target_lib_dir = if let Ok(target_dir) = env::var("CARGO_TARGET_DIR") {
-        let profile = get_build_profile();
-        Some(Path::new(&target_dir).join(&profile))
-    } else {
-        // OUT_DIR is something like: target/release/build/pecos-quest-xxx/out
-        // We want: target/release/
-        out_dir
-            .parent() // build/pecos-quest-xxx
-            .and_then(|p| p.parent()) // build
-            .and_then(|p| p.parent()) // release or debug
-            .map(std::path::Path::to_path_buf)
-    };
-
-    if let Some(target_dir) = target_lib_dir {
-        let target_lib_path = target_dir.join(lib_name);
-        if let Some(parent) = target_lib_path.parent() {
-            let _ = fs::create_dir_all(parent);
-        }
-        if let Err(e) = fs::copy(&gpu_lib_path, &target_lib_path) {
-            debug!("Could not copy CUDA lib to target dir: {e}");
-        } else {
-            info!("Copied CUDA lib to: {}", target_lib_path.display());
-        }
-    }
-
-    Some(gpu_lib_path)
-}
-
-/// Patch `QuEST` GPU code for CUDA 13 compatibility
-///
-/// Removes `thrust::unary_function` and `thrust::binary_function` inheritance
-/// which were deprecated and removed in modern CUDA/Thrust versions.
-/// With C++20, these base classes are no longer needed.
-fn patch_quest_for_cuda13(quest_dir: &Path) -> Result<()> {
-    let thrust_file = quest_dir.join("src/gpu/gpu_thrust.cuh");
-
-    if !thrust_file.exists() {
-        // GPU files don't exist, nothing to patch
-        return Ok(());
-    }
-
-    info!("Patching QuEST for CUDA 13 compatibility...");
-
-    let content = fs::read_to_string(&thrust_file)?;
-
-    // Use regex to remove thrust::unary_function and thrust::binary_function inheritance
-    // Pattern: "struct NAME : public thrust::(unary|binary)_function<...>"
-    // Replace with: "struct NAME"
-
-    // First, handle single-line patterns (with opening brace)
-    let patched = content
-        .replace(": public thrust::unary_function<cu_qcomp,cu_qcomp> {", " {")
-        .replace(": public thrust::unary_function<cu_qcomp,qreal> {", " {")
-        .replace(": public thrust::unary_function<qindex,cu_qcomp> {", " {")
-        .replace(": public thrust::unary_function<qindex,qindex> {", " {")
-        .replace(
-            ": public thrust::binary_function<cu_qcomp,cu_qcomp,cu_qcomp> {",
-            " {",
-        )
-        .replace(
-            ": public thrust::binary_function<cu_qcomp,cu_qcomp,qreal> {",
-            " {",
-        )
-        .replace(
-            ": public thrust::binary_function<qindex,cu_qcomp,qreal> {",
-            " {",
-        )
-        .replace(
-            ": public thrust::binary_function<qindex,cu_qcomp,cu_qcomp> {",
-            " {",
-        )
-        // Handle multi-line patterns (no opening brace on same line)
-        .replace(": public thrust::unary_function<cu_qcomp,cu_qcomp>", "")
-        .replace(": public thrust::unary_function<cu_qcomp,qreal>", "")
-        .replace(": public thrust::unary_function<qindex,cu_qcomp>", "")
-        .replace(": public thrust::unary_function<qindex,qindex>", "")
-        .replace(
-            ": public thrust::binary_function<cu_qcomp,cu_qcomp,cu_qcomp>",
-            "",
-        )
-        .replace(
-            ": public thrust::binary_function<cu_qcomp,cu_qcomp,qreal>",
-            "",
-        )
-        .replace(
-            ": public thrust::binary_function<qindex,cu_qcomp,qreal>",
-            "",
-        )
-        .replace(
-            ": public thrust::binary_function<qindex,cu_qcomp,cu_qcomp>",
-            "",
-        );
-
-    fs::write(&thrust_file, patched)?;
-
-    info!("Successfully patched gpu_thrust.cuh for CUDA 13");
-
-    Ok(())
-}
-
-/// Generate config.h from config.h.in template (`QuEST` v4.2.0+)
-/// or quest.h from quest.h.in (`QuEST` v4.1.x).
-///
-/// The main library is ALWAYS CPU-only (`COMPILE_CUDA=0`).
-/// GPU support is provided via a separate shared library loaded at runtime.
-fn generate_quest_header(quest_dir: &Path) -> Result<()> {
-    // v4.2.0+: config.h.in
-    let config_template = quest_dir.join("include/config.h.in");
-    if config_template.exists() {
-        let output = quest_dir.join("include/config.h");
-        if output.exists() {
-            return Ok(());
-        }
-        info!("Generating config.h from config.h.in...");
-        let template = fs::read_to_string(&config_template)?;
-        let config_h = template
-            .lines()
-            .map(|line| {
-                if line.contains("#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@") {
-                    "#define FLOAT_PRECISION 2".to_string()
-                } else if line.contains("#cmakedefine01 COMPILE_OPENMP") {
-                    "#define COMPILE_OPENMP 0".to_string()
-                } else if line.contains("#cmakedefine01 COMPILE_MPI") {
-                    "#define COMPILE_MPI 0".to_string()
-                } else if line.contains("#cmakedefine01 COMPILE_CUDA") {
-                    "#define COMPILE_CUDA 0".to_string()
-                } else if line.contains("#cmakedefine01 COMPILE_HIP") {
-                    "#define COMPILE_HIP 0".to_string()
-                } else if line.contains("#cmakedefine01 COMPILE_CUQUANTUM") {
-                    "#define COMPILE_CUQUANTUM 0".to_string()
-                } else if line.contains("#cmakedefine01 NUMA_AWARE") {
-                    "#define NUMA_AWARE 0".to_string()
-                } else if line.contains("#cmakedefine01 INCLUDE_DEPRECATED_FUNCTIONS") {
-                    "#define INCLUDE_DEPRECATED_FUNCTIONS 0".to_string()
-                } else if line.contains("#cmakedefine01 DISABLE_DEPRECATION_WARNINGS") {
-                    "#define DISABLE_DEPRECATION_WARNINGS 1".to_string()
-                } else if line.contains("@PROJECT_VERSION_MAJOR@") {
-                    "#define QUEST_VERSION_MAJOR 4".to_string()
-                } else if line.contains("@PROJECT_VERSION_MINOR@") {
-                    "#define QUEST_VERSION_MINOR 2".to_string()
-                } else if line.contains("@PROJECT_VERSION_PATCH@") {
-                    "#define QUEST_VERSION_PATCH 0".to_string()
-                } else if line.contains("@PROJECT_VERSION@") {
-                    "#define QUEST_VERSION_STRING \"4.2.0\"".to_string()
-                } else {
-                    line.to_string()
-                }
-            })
-            .collect::<Vec<_>>()
-            .join("\n");
-        fs::write(&output, config_h)?;
-        info!("Generated config.h");
-        return Ok(());
-    }
-
-    // v4.1.x: quest.h.in
-    let quest_template = quest_dir.join("include/quest.h.in");
-    if quest_template.exists() {
-        let output = quest_dir.join("include/quest.h");
-        info!("Generating quest.h from quest.h.in...");
-        let template = fs::read_to_string(&quest_template)?;
-        let mut in_multi_lib_block = false;
-        let mut found_cmakedefine = false;
-        let quest_h = template
-            .lines()
-            .filter_map(|line| {
-                if line.contains("#if !@MULTI_LIB_HEADERS@") {
-                    in_multi_lib_block = true;
-                    return None;
-                }
-                if line.contains("#cmakedefine") {
-                    found_cmakedefine = true;
-                    if line.contains("#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@") {
-                        return Some("#define FLOAT_PRECISION 2".to_string());
-                    }
-                    if line.contains("#cmakedefine01 COMPILE_MPI") {
-                        return Some("#define COMPILE_MPI 0".to_string());
-                    }
-                    if line.contains("#cmakedefine01 COMPILE_OPENMP") {
-                        return Some("#define COMPILE_OPENMP 0".to_string());
-                    }
-                    if line.contains("#cmakedefine01 COMPILE_CUDA") {
-                        return Some("#define COMPILE_CUDA 0".to_string());
-                    }
-                    if line.contains("#cmakedefine01 COMPILE_CUQUANTUM") {
-                        return Some("#define COMPILE_CUQUANTUM 0".to_string());
-                    }
-                }
-                if line.contains("#endif") && in_multi_lib_block && found_cmakedefine {
-                    in_multi_lib_block = false;
-                    found_cmakedefine = false;
-                    return None;
-                }
-                Some(line.to_string())
-            })
-            .collect::<Vec<_>>()
-            .join("\n");
-        fs::write(&output, quest_h)?;
-        info!("Generated quest.h");
-    }
-
-    Ok(())
-}
-
-/// Get the build profile from Cargo's environment
-/// Returns "debug", "release", or "native"
-///
-/// Note: Cargo's PROFILE env var only reports "debug" or "release" even for custom profiles
-/// (due to backward compatibility - see RFC 2678). Custom profiles inherit from these base
-/// profiles, so PROFILE reflects the parent. To detect custom profiles like "native", we
-/// check the `OUT_DIR` path which contains the actual profile directory name.
-///
-/// Profile behavior:
-/// - "debug" -> no C++ optimization, fast compile
-/// - "release" -> full optimization (-O3)
-/// - "native" -> full optimization + CPU-specific (-O3 -march=native)
-fn get_build_profile() -> String {
-    // First check OUT_DIR for custom profile name (e.g., target/native/build/...)
-    // Custom profiles get their own directory under target/
-    if let Ok(out_dir) = env::var("OUT_DIR") {
-        // OUT_DIR looks like: .../target/<profile>/build/<crate>-<hash>/out
-        // We want to extract <profile>
-        let parts: Vec<&str> = out_dir.split(std::path::MAIN_SEPARATOR).collect();
-        if let Some(target_idx) = parts.iter().position(|&p| p == "target")
-            && let Some(profile_name) = parts.get(target_idx + 1)
-        {
-            return match *profile_name {
-                "native" => "native",
-                "release" => "release",
-                "debug" => "debug",
-                _ => {
-                    // Unknown profile, fall back to PROFILE env var
-                    if env::var("PROFILE").as_deref() == Ok("release") {
-                        "release"
-                    } else {
-                        "debug"
-                    }
-                }
-            }
-            .to_string();
-        }
-    }
-
-    // Fallback to PROFILE env var (will be "debug" or "release")
-    match env::var("PROFILE").as_deref() {
-        Ok("release") => "release".to_string(),
-        _ => "debug".to_string(),
-    }
-}
-
-/// Main build function for `QuEST`
-pub fn build() -> Result<()> {
-    // Tell Cargo when to rerun this build script
-    println!("cargo:rerun-if-changed=build_quest.rs");
-    println!("cargo:rerun-if-changed=src/bridge.rs");
-    println!("cargo:rerun-if-changed=src/bridge.cpp");
-    println!("cargo:rerun-if-changed=src/bridge_cuda.cpp");
-    println!("cargo:rerun-if-changed=src/gpu_stubs.cpp");
-    println!("cargo:rerun-if-changed=src/cuda_loader.rs");
-    println!("cargo:rerun-if-changed=include/quest_ffi.h");
-
-    // Also rerun if the user forces a rebuild
-    println!("cargo:rerun-if-env-changed=FORCE_REBUILD");
-
-    // Check for GPU feature
-    println!("cargo:rerun-if-env-changed=QUEST_ENABLE_GPU");
-    println!("cargo:rerun-if-env-changed=CUDA_PATH");
-    println!("cargo:rerun-if-env-changed=CUDACXX");
-
-    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
-
-    // Always emit link directives - these are cached by Cargo
-    println!("cargo:rustc-link-search=native={}", out_dir.display());
-    println!("cargo:rustc-link-lib=static=quest-bridge");
-
-    // Get QuEST source from ~/.pecos/deps/ (persists across cargo clean)
-    let quest_dir = get_quest_source()?;
-
-    // Build using cxx
-    build_cxx_bridge(&quest_dir, &out_dir);
-
-    Ok(())
-}
-
-/// Get `QuEST` source directory, downloading and extracting if needed
-///
-/// Returns the path to the `quest/` subdirectory within the extracted archive.
-/// Also applies patches for CUDA 13 compatibility and generates quest.h header.
-fn get_quest_source() -> Result<PathBuf> {
-    // Load manifest and get QuEST dependency
-    let manifest = Manifest::find_and_load_validated()?;
-
-    // ensure_dep_ready downloads to ~/.pecos/cache/ and extracts to ~/.pecos/deps/
-    let deps_path = ensure_dep_ready("quest", &manifest)?;
-
-    // The QuEST archive extracts as: deps/quest-<version>/quest/
-    // (contains quest/ subdirectory with actual source)
-    let quest_dir = deps_path.join("quest");
-
-    if !quest_dir.exists() {
-        return Err(pecos_build::Error::Archive(format!(
-            "QuEST source directory not found at: {}",
-            quest_dir.display()
-        )));
-    }
-
-    // Apply CUDA 13 compatibility patches (idempotent)
-    patch_quest_for_cuda13(&quest_dir)?;
-
-    // Generate quest.h from quest.h.in (idempotent - only runs if template exists)
-    generate_quest_header(&quest_dir)?;
-
-    info!("Using QuEST source from {}", quest_dir.display());
-    Ok(quest_dir)
-}
-
-#[allow(clippy::too_many_lines)]
-fn build_cxx_bridge(quest_dir: &Path, out_dir: &Path) {
-    let quest_src_dir = quest_dir.join("src");
-    let quest_include_dir = quest_dir.join("include");
-
-    // Build the cxx bridge first to generate headers
-    let mut build = cxx_build::bridge("src/bridge.rs");
-
-    // On macOS, explicitly use system clang to ensure SDK paths are correct.
-    // The PECOS LLVM clang may be in PATH but doesn't have SDK headers configured,
-    // causing "math.h file not found" errors during compilation.
-    let target = env::var("TARGET").unwrap_or_default();
-    if target.contains("darwin") && env::var("CXX").is_err() && env::var("CC").is_err() {
-        build.compiler("/usr/bin/clang++");
-    }
-
-    // Determine if we're building with GPU support
-    // Check if the gpu feature is enabled via CARGO_FEATURE_CUDA env var
-    let gpu_feature_enabled = env::var("CARGO_FEATURE_CUDA").is_ok();
-
-    // Detect CUDA installation
-    let cuda_path = detect_cuda_path();
-    let cuda_available = cuda_path.is_some();
-
-    // Only enable GPU if both the feature is enabled AND CUDA is available
-    let gpu_enabled = gpu_feature_enabled && cuda_available;
-
-    // Error if GPU feature was requested but CUDA is not available
-    if gpu_feature_enabled && !cuda_available {
-        eprintln!("ERROR: GPU feature enabled but CUDA not found");
-        eprintln!("  CUDA Toolkit must be installed to build with GPU support");
-        eprintln!("  Solutions:");
-        eprintln!("    1. Install CUDA Toolkit (https://developer.nvidia.com/cuda-downloads)");
-        eprintln!("    2. Ensure nvcc is in PATH or set CUDA_PATH environment variable");
-        eprintln!("    3. Build without GPU feature: cargo build -p pecos-quest");
-        std::process::exit(1);
-    }
-
-    // Add QuEST source files
-    let api_dir = quest_src_dir.join("api");
-    let core_dir = quest_src_dir.join("core");
-    let cpu_dir = quest_src_dir.join("cpu");
-    let comm_dir = quest_src_dir.join("comm");
-
-    // IMPORTANT: The main library ALWAYS uses gpu_stubs.cpp (CPU only).
-    // GPU support is provided by a separate shared library (libpecos_quest_cuda.so)
-    // that is loaded at runtime via dlopen. This allows a single binary to work
-    // on systems with and without CUDA installed.
-    build.file("src/gpu_stubs.cpp");
-
-    build
-        .file("src/bridge.cpp")
-        // API layer
-        .file(api_dir.join("calculations.cpp"))
-        .file(api_dir.join("channels.cpp"))
-        .file(api_dir.join("debug.cpp"))
-        .file(api_dir.join("decoherence.cpp"))
-        .file(api_dir.join("environment.cpp"))
-        .file(api_dir.join("initialisations.cpp"))
-        .file(api_dir.join("matrices.cpp"))
-        .file(api_dir.join("modes.cpp"))
-        .file(api_dir.join("operations.cpp"))
-        .file(api_dir.join("paulis.cpp"))
-        .file(api_dir.join("qureg.cpp"))
-        .file(api_dir.join("types.cpp"));
-
-    // v4.2.0 added these files
-    for f in ["multiplication.cpp", "trotterisation.cpp"] {
-        let path = api_dir.join(f);
-        if path.exists() {
-            build.file(path);
-        }
-    }
-
-    build
-        // Core utilities
-        .file(core_dir.join("errors.cpp"))
-        .file(core_dir.join("utilities.cpp"))
-        .file(core_dir.join("validation.cpp"))
-        .file(core_dir.join("memory.cpp"))
-        .file(core_dir.join("printer.cpp"))
-        .file(core_dir.join("randomiser.cpp"))
-        .file(core_dir.join("parser.cpp"))
-        .file(core_dir.join("localiser.cpp"))
-        .file(core_dir.join("autodeployer.cpp"))
-        .file(core_dir.join("accelerator.cpp"));
-
-    // v4.2.0 added these core files
-    for f in ["envvars.cpp", "paulilogic.cpp"] {
-        let path = core_dir.join(f);
-        if path.exists() {
-            build.file(path);
-        }
-    }
-
-    // Build the separate GPU shared library if GPU feature is enabled
-    // This library will be loaded at runtime via dlopen
-    if gpu_enabled {
-        let gpu_dir = quest_src_dir.join("gpu");
-        if !gpu_dir.exists() {
-            eprintln!("\nERROR: GPU feature enabled but QuEST GPU source not found");
-            eprintln!("  Expected directory: {}", gpu_dir.display());
-            eprintln!("  This may indicate an incomplete QuEST download");
-            std::process::exit(1);
-        }
-
-        // Build the separate GPU shared library
-        if let Some(gpu_lib_path) =
-            build_gpu_shared_library(cuda_path.as_ref().unwrap(), quest_dir, out_dir)
-        {
-            info!(
-                "GPU shared library built successfully: {}",
-                gpu_lib_path.display()
-            );
-            // Emit the GPU library path so downstream crates can find it
-            println!(
-                "cargo:rustc-env=PECOS_QUEST_CUDA_LIB={}",
-                gpu_lib_path.display()
-            );
-        } else {
-            eprintln!("\nERROR: GPU feature enabled but GPU library build failed");
-            eprintln!("  See warnings above for compilation errors");
-            eprintln!("  Solutions:");
-            eprintln!("    1. Use CUDA 11 or 12 instead of CUDA 13 (QuEST incompatibility)");
-            eprintln!("    2. Build without GPU feature: cargo build -p pecos-quest");
-            eprintln!("    3. Use Python GPU simulators (CuStateVec/MPS) which work with CUDA 13");
-            std::process::exit(1);
-        }
-    }
-
-    // CPU backend
-    build
-        .file(cpu_dir.join("cpu_config.cpp"))
-        .file(cpu_dir.join("cpu_subroutines.cpp"))
-        // Communication (even for single-node)
-        .file(comm_dir.join("comm_config.cpp"))
-        .file(comm_dir.join("comm_routines.cpp"));
-
-    // Include directories
-    build
-        .include(&quest_include_dir)
-        .include(&quest_src_dir)
-        .include(quest_dir.parent().unwrap()) // Add out_dir so "quest/include/..." resolves correctly
-        .include("include");
-
-    // v4.2.0+ defines come from generated config.h (included by quest.h).
-    // v4.1.x needs them as compiler flags since quest.h.in was processed differently.
-    if !quest_dir.join("include/config.h").exists() {
-        build
-            .define("COMPILE_CPU", "1")
-            .define("COMPILE_OPENMP", "0")
-            .define("COMPILE_MPI", "0")
-            .define("FLOAT_PRECISION", "2")
-            .define("COMPILE_CUDA", "0")
-            .define("COMPILE_GPU", "0")
-            .define("COMPILE_CUQUANTUM", "0");
-    }
-
-    // v4.2.0+ requires COMPLEX_OVERLOADS_PATCHED for cpu_subroutines.cpp.
-    // In release builds, -Ofast enables fast-math which makes std::complex
-    // operator overloads as fast as hand-rolled arithmetic.
-    let profile = get_build_profile();
-    if profile == "release" || profile == "native" {
-        build.flag_if_supported("-Ofast");
-        build.define("COMPLEX_OVERLOADS_PATCHED", "1");
-    } else {
-        build.define("COMPLEX_OVERLOADS_PATCHED", "0");
-    }
-
-    // Note: We do NOT link cudart/cublas here. The GPU library handles CUDA linking
-    // and is loaded at runtime only when GPU is requested.
-
-    // Use C++20 standard (QuEST v4 uses designated initializers which require C++20)
-    // However, on macOS there's a known issue with C++20 and cxx crate's pointer_traits
-    // specializations, so we use C++17 there (designated initializers are a GNU extension
-    // that works in C++17 with Clang)
-    if std::env::var("TARGET")
-        .unwrap_or_default()
-        .contains("darwin")
-    {
-        build.std("c++17");
-        // Enable GNU extensions to support designated initializers in C++17
-        build.flag_if_supported("-Wno-c++20-designator");
-    } else {
-        build.std("c++20");
-    }
-
-    // Report ccache/sccache configuration
-    report_cache_config();
-
-    // Disable warnings for external QuEST code
-    // This properly handles warning flags without conflicts
-    build.warnings(false);
-
-    // Use build profile for optimization settings
-    let profile = get_build_profile();
-    match profile.as_str() {
-        "native" => {
-            // Native profile: release optimizations + CPU-specific optimizations
-            build.flag_if_supported("-O3");
-            build.flag_if_supported("-march=native");
-        }
-        "release" => {
-            // Release profile: full optimization
-            build.flag_if_supported("-O3");
-        }
-        _ => {
-            // Dev profile: no optimization for faster compilation
-            build.flag_if_supported("-O0");
-            build.flag_if_supported("-g"); // Include debug symbols
-        }
-    }
-
-    // Platform-specific flags
-    if cfg!(not(target_env = "msvc")) {
-        // For GCC/Clang
-        build.flag_if_supported("-fPIC"); // Position-independent code
-    } else {
-        // For MSVC
-        build
-            .flag_if_supported("/permissive-") // Enable standards-compliant C++ parsing
-            .flag_if_supported("/Zc:__cplusplus") // Report correct __cplusplus macro value
-            .flag("/Z7"); // Embed debug info in .obj files (no PDB) - required for parallel builds
-    }
-
-    // Platform-specific C++ library linking configuration
-    if cfg!(not(target_env = "msvc")) {
-        // On macOS, use the -stdlib=libc++ flag to ensure proper C++ standard library linkage
-        // This tells the linker to use the system libc++ from the dyld shared cache
-        // without creating problematic @rpath references
-        if std::env::var("TARGET")
-            .unwrap_or_default()
-            .contains("darwin")
-        {
-            build.flag("-stdlib=libc++");
-            // Note: Linker-specific flags are passed via cargo:rustc-link-arg below, not here
-        }
-    }
-
-    build.compile("quest-bridge");
-
-    // Note: GPU object files are now compiled into a separate shared library
-    // (libpecos_quest_cuda.so) which is built by build_gpu_shared_library()
-    // and loaded at runtime via dlopen.
-
-    // On macOS, ensure the C++ standard library is linked correctly
-    // Use the system libc++ which is in the dyld shared cache (macOS Big Sur+)
-    // We rely on the compiler's default behavior rather than explicit cargo directives
-    // which can create problematic @rpath references
-    if std::env::var("TARGET")
-        .unwrap_or_default()
-        .contains("darwin")
-    {
-        // Link against the system C++ library
-        // Use -L flag to prioritize system library paths over Homebrew
-        println!("cargo:rustc-link-search=native=/usr/lib");
-        println!("cargo:rustc-link-lib=c++");
-
-        // Prevent Homebrew's libunwind from being opportunistically linked
-        // by ensuring system paths are searched first
-        println!("cargo:rustc-link-arg=-Wl,-search_paths_first");
-    }
-}
diff --git a/crates/pecos-quest/examples/bell_state.rs b/crates/pecos-quest/examples/bell_state.rs
deleted file mode 100644
index e0dd93fa7..000000000
--- a/crates/pecos-quest/examples/bell_state.rs
+++ /dev/null
@@ -1,135 +0,0 @@
-//! Example: Creating and measuring a Bell state using `QuEST` with PECOS-style API
-
-use pecos_core::{QubitId, qid};
-use pecos_quest::{CliffordGateable, QuantumSimulator, QuestStateVec};
-
-fn main() {
-    println!("QuEST Bell State Example");
-    println!("========================");
-
-    // Create a 2-qubit quantum state vector
-    let mut state = QuestStateVec::new(2);
-    println!("Created {} qubit state vector", state.num_qubits());
-
-    // Explicitly reset the state to make sure it's initialized
-    state.reset();
-    println!("Reset state explicitly");
-    println!();
-
-    // Display initial state probabilities
-    println!("Initial state |00⟩:");
-    display_state_probabilities(&state);
-
-    // Check individual probabilities
-    println!("  Probability |00⟩: {:.6}", state.probability(0b00));
-    println!("  Probability |01⟩: {:.6}", state.probability(0b01));
-    println!("  Probability |10⟩: {:.6}", state.probability(0b10));
-    println!("  Probability |11⟩: {:.6}", state.probability(0b11));
-
-    let amp00 = state.get_amplitude(0b00);
-    let amp01 = state.get_amplitude(0b01);
-    println!("  Amplitude |00⟩: {:.6} + {:.6}i", amp00.re, amp00.im);
-    println!("  Amplitude |01⟩: {:.6} + {:.6}i", amp01.re, amp01.im);
-    println!();
-
-    // Create Bell state: (|00⟩ + |11⟩)/√2
-    println!("Creating Bell state...");
-    state.h(&qid(0)); // Apply Hadamard to qubit 0
-    println!("Applied Hadamard to qubit 0");
-
-    state.cx(&[(QubitId(0), QubitId(1))]); // Apply CNOT with control=0, target=1
-    println!("Applied CNOT(0, 1)");
-    println!();
-
-    // Display Bell state probabilities
-    println!("Bell state probabilities:");
-    display_state_probabilities(&state);
-    println!();
-
-    // Display the state amplitudes
-    println!("Bell state amplitudes:");
-    for i in 0..4 {
-        let amp = state.get_amplitude(i);
-        let prob = amp.norm_sqr();
-        println!(
-            "  |{:02b}⟩: {:.3} + {:.3}i (prob = {:.3})",
-            i, amp.re, amp.im, prob
-        );
-    }
-    println!();
-
-    // Measure the qubits and demonstrate entanglement correlation
-    println!("Measuring qubits to demonstrate entanglement:");
-
-    // Create multiple copies to demonstrate correlation
-    for measurement_round in 1..=5 {
-        // Reset and recreate Bell state for each measurement
-        let mut measurement_state: QuestStateVec = QuestStateVec::with_seed(2, measurement_round);
-        measurement_state.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-        let result0 = measurement_state.mz(&qid(0))[0].outcome;
-        let result1 = measurement_state.mz(&qid(1))[0].outcome;
-
-        println!(
-            "  Round {}: Qubit 0: {} | Qubit 1: {} | Correlated: {}",
-            measurement_round,
-            if result0 { "1" } else { "0" },
-            if result1 { "1" } else { "0" },
-            if result0 == result1 { "" } else { "FAIL" }
-        );
-    }
-    println!();
-
-    // Demonstrate other PECOS-style operations
-    println!("Demonstrating other quantum operations:");
-
-    // Reset and apply different gates
-    state.reset();
-    println!("Reset to |00⟩");
-
-    // Create |++⟩ state
-    state.h(&qid(0)).h(&qid(1));
-    println!("Applied H⊗H to create |++⟩");
-    println!("Probability of |00⟩: {:.3}", state.probability(0b00));
-    println!("Probability of |01⟩: {:.3}", state.probability(0b01));
-    println!("Probability of |10⟩: {:.3}", state.probability(0b10));
-    println!("Probability of |11⟩: {:.3}", state.probability(0b11));
-    println!();
-
-    // Apply some Pauli gates
-    state.reset();
-    state.x(&qid(0)); // |10⟩
-    println!("Applied X(0) to create |10⟩");
-    println!("Probability of |10⟩: {:.3}", state.probability(0b01));
-
-    state.z(&qid(0)); // Add phase to |10⟩
-    println!("Applied Z(0) (adds phase, probability unchanged)");
-    println!("Probability of |10⟩: {:.3}", state.probability(0b01));
-    println!();
-
-    // Demonstrate method chaining
-    println!("Demonstrating method chaining:");
-    state
-        .reset()
-        .h(&qid(0))
-        .cx(&[(QubitId(0), QubitId(1))])
-        .z(&qid(1));
-    println!("Applied: reset().h(&qid(0)).cx(0,1).z(&qid(1))");
-    display_state_probabilities(&state);
-}
-
-fn display_state_probabilities(state: &QuestStateVec) {
-    let num_states = 1 << state.num_qubits();
-    for i in 0..num_states {
-        let prob = state.probability(i);
-        if prob > 1e-10 {
-            // Only show non-zero probabilities
-            println!(
-                "  |{:0width$b}⟩: {:.6}",
-                i,
-                prob,
-                width = state.num_qubits()
-            );
-        }
-    }
-}
diff --git a/crates/pecos-quest/examples/test_rz.rs b/crates/pecos-quest/examples/test_rz.rs
deleted file mode 100644
index f7d6223b2..000000000
--- a/crates/pecos-quest/examples/test_rz.rs
+++ /dev/null
@@ -1,42 +0,0 @@
-use pecos_core::{Angle64, qid};
-use pecos_quest::{ArbitraryRotationGateable, CliffordGateable, QuestStateVec};
-use std::f64::consts::PI;
-
-fn main() {
-    println!("Testing RZ gate behavior");
-
-    // Test 1: Apply RZ(π) to |0⟩
-    println!("\nTest 1: RZ(π) on |0⟩");
-    let mut sim = QuestStateVec::new(1);
-    sim.rz(Angle64::from_radians(PI), &qid(0));
-    println!("|0⟩ amplitude: {:?}", sim.get_amplitude(0));
-    println!("|1⟩ amplitude: {:?}", sim.get_amplitude(1));
-
-    // Test 2: Apply RZ(π) to |1⟩
-    println!("\nTest 2: RZ(π) on |1⟩");
-    let mut sim = QuestStateVec::new(1);
-    sim.x(&qid(0)); // Create |1⟩
-    sim.rz(Angle64::from_radians(PI), &qid(0));
-    println!("|0⟩ amplitude: {:?}", sim.get_amplitude(0));
-    println!("|1⟩ amplitude: {:?}", sim.get_amplitude(1));
-
-    // Test 3: Apply RZ(π) to |+⟩
-    println!("\nTest 3: RZ(π) on |+⟩ = (|0⟩ + |1⟩)/√2");
-    let mut sim = QuestStateVec::new(1);
-    sim.h(&qid(0)); // Create |+⟩
-    println!("Before RZ:");
-    println!("|0⟩ amplitude: {:?}", sim.get_amplitude(0));
-    println!("|1⟩ amplitude: {:?}", sim.get_amplitude(1));
-
-    sim.rz(Angle64::from_radians(PI), &qid(0));
-    println!("After RZ(π):");
-    println!("|0⟩ amplitude: {:?}", sim.get_amplitude(0));
-    println!("|1⟩ amplitude: {:?}", sim.get_amplitude(1));
-
-    // Expected: |+⟩ -> |-⟩ = (|0⟩ - |1⟩)/√2
-    let expected_0 = 1.0 / 2.0_f64.sqrt();
-    let expected_1 = -1.0 / 2.0_f64.sqrt();
-    println!("\nExpected after RZ(π):");
-    println!("|0⟩ amplitude: {expected_0}");
-    println!("|1⟩ amplitude: {expected_1}");
-}
diff --git a/crates/pecos-quest/examples/test_rzz.rs b/crates/pecos-quest/examples/test_rzz.rs
deleted file mode 100644
index a207510af..000000000
--- a/crates/pecos-quest/examples/test_rzz.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-use pecos_core::{Angle64, QubitId, qid};
-use pecos_quest::{ArbitraryRotationGateable, CliffordGateable, QuestStateVec};
-use std::f64::consts::{FRAC_PI_2, FRAC_PI_4, PI};
-
-fn main() {
-    println!("Testing RZZ gate behavior");
-
-    // Test RZZ(π/2) on |11⟩
-    println!("\nTest: RZZ(π/2) on |11⟩");
-    let mut sim = QuestStateVec::new(2);
-
-    // Prepare |11⟩ state
-    sim.x(&qid(0)).x(&qid(1));
-    println!("Initial |11⟩ amplitude: {:?}", sim.get_amplitude(0b11));
-
-    // Apply RZZ(π/2)
-    sim.rzz(
-        Angle64::from_radians(FRAC_PI_2),
-        &[(QubitId(0), QubitId(1))],
-    );
-    println!("After RZZ(π/2):");
-    println!("|00⟩ amplitude: {:?}", sim.get_amplitude(0b00));
-    println!("|01⟩ amplitude: {:?}", sim.get_amplitude(0b01));
-    println!("|10⟩ amplitude: {:?}", sim.get_amplitude(0b10));
-    println!("|11⟩ amplitude: {:?}", sim.get_amplitude(0b11));
-
-    // Check the phase
-    let amp11 = sim.get_amplitude(0b11);
-    let phase = amp11.im.atan2(amp11.re);
-    let magnitude = (amp11.re * amp11.re + amp11.im * amp11.im).sqrt();
-    println!("\n|11⟩ magnitude: {magnitude}");
-    println!("|11⟩ phase: {} (in units of π: {})", phase, phase / PI);
-    println!("Expected phase -π/4 = {}", -FRAC_PI_4);
-}
diff --git a/crates/pecos-quest/include/quest_ffi.h b/crates/pecos-quest/include/quest_ffi.h
deleted file mode 100644
index 6359f46c0..000000000
--- a/crates/pecos-quest/include/quest_ffi.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef QUEST_FFI_H
-#define QUEST_FFI_H
-
-#include <cstdint>
-
-// Include rust/cxx.h before <memory> to ensure proper pointer_traits specializations
-#include "rust/cxx.h"
-
-// Now include <memory> - pointer_traits should already be specialized by cxx
-#include <memory>
-
-// Include CXX-generated structs
-#include "pecos-quest/src/bridge.rs.h"
-
-// Simple functions that work with pointers to opaque handles
-// The handles will be managed by the C++ implementation
-
-// Environment management
-uint8_t* quest_create_env();
-void quest_destroy_env(uint8_t* env);
-QuESTEnvInfo quest_get_env_info(uint8_t* env);
-void quest_sync_env(uint8_t* env);
-
-// Qureg creation and destruction
-uint8_t* quest_create_qureg(uint8_t* env, int32_t num_qubits);
-uint8_t* quest_create_density_qureg(uint8_t* env, int32_t num_qubits);
-void quest_destroy_qureg(uint8_t* qureg);
-uint8_t* quest_clone_qureg(uint8_t* qureg);
-QuregInfo quest_get_qureg_info(uint8_t* qureg);
-
-// State initialization
-void quest_init_zero_state(uint8_t* qureg);
-void quest_init_plus_state(uint8_t* qureg);
-void quest_init_classical_state(uint8_t* qureg, int64_t state_ind);
-void quest_init_pure_state(uint8_t* qureg, uint8_t* pure_qureg);
-void quest_init_random_state(uint8_t* qureg, rust::Slice<const uint64_t> seed);
-
-// Single-qubit gates
-void quest_apply_pauli_x(uint8_t* qureg, int32_t qubit);
-void quest_apply_pauli_y(uint8_t* qureg, int32_t qubit);
-void quest_apply_pauli_z(uint8_t* qureg, int32_t qubit);
-void quest_apply_hadamard(uint8_t* qureg, int32_t qubit);
-void quest_apply_s_gate(uint8_t* qureg, int32_t qubit);
-void quest_apply_t_gate(uint8_t* qureg, int32_t qubit);
-void quest_apply_phase_shift(uint8_t* qureg, int32_t qubit, double angle);
-void quest_apply_rotation_x(uint8_t* qureg, int32_t qubit, double angle);
-void quest_apply_rotation_y(uint8_t* qureg, int32_t qubit, double angle);
-void quest_apply_rotation_z(uint8_t* qureg, int32_t qubit, double angle);
-
-// Two-qubit gates
-void quest_apply_cnot(uint8_t* qureg, int32_t control, int32_t target);
-void quest_apply_cz(uint8_t* qureg, int32_t control, int32_t target);
-void quest_apply_swap(uint8_t* qureg, int32_t qubit1, int32_t qubit2);
-void quest_apply_controlled_phase_shift(uint8_t* qureg, int32_t control, int32_t target, double angle);
-
-// Multi-controlled gates
-void quest_apply_multi_controlled_pauli_z(uint8_t* qureg, rust::Slice<const int32_t> controls, int32_t target);
-
-// Measurements
-int32_t quest_measure(uint8_t* qureg, int32_t qubit);
-int32_t quest_measure_with_stats(uint8_t* qureg, int32_t qubit, double& outcome_prob);
-double quest_calc_prob_of_outcome(uint8_t* qureg, int32_t qubit, int32_t outcome);
-double quest_apply_forced_measurement(uint8_t* qureg, int32_t qubit, int32_t outcome);
-double quest_calc_total_prob(uint8_t* qureg);
-
-// Amplitude access
-double quest_get_real_amp(uint8_t* qureg, int64_t index);
-double quest_get_imag_amp(uint8_t* qureg, int64_t index);
-Complex quest_get_complex_amp(uint8_t* qureg, int64_t index);
-double quest_get_prob_amp(uint8_t* qureg, int64_t index);
-
-// State vector properties
-int64_t quest_get_num_amps(uint8_t* qureg);
-int32_t quest_get_num_qubits(uint8_t* qureg);
-bool quest_is_density_matrix(uint8_t* qureg);
-
-// Utility functions
-Complex quest_calc_inner_product(uint8_t* qureg1, uint8_t* qureg2);
-double quest_calc_fidelity(uint8_t* qureg1, uint8_t* qureg2);
-double quest_calc_purity(uint8_t* qureg);
-
-#endif // QUEST_FFI_H
diff --git a/crates/pecos-quest/pecos.toml b/crates/pecos-quest/pecos.toml
deleted file mode 100644
index 45c38753e..000000000
--- a/crates/pecos-quest/pecos.toml
+++ /dev/null
@@ -1,7 +0,0 @@
-version = 1
-
-[dependencies.quest]
-version = "v4.2.0"
-url = "https://github.com/QuEST-Kit/QuEST/archive/refs/tags/v4.2.0.tar.gz"
-sha256 = "2c812a7ec4d727e0947ffd0daf05452963c3f1c10e428c8bc30c35164921fcba"
-description = "QuEST quantum simulator"
diff --git a/crates/pecos-quest/src/bridge.cpp b/crates/pecos-quest/src/bridge.cpp
deleted file mode 100644
index 799286c3f..000000000
--- a/crates/pecos-quest/src/bridge.cpp
+++ /dev/null
@@ -1,412 +0,0 @@
-//! C++ bridge implementation for QuEST with independent simulator instances
-//! Each simulator gets its own independent Qureg, but they share a global QuEST environment
-//! since QuEST only supports one environment per process.
-
-#include "quest_ffi.h"
-#include "quest.h"
-// Note: quest_ffi.h includes the cxx-generated header and rust/cxx.h before <memory>
-
-#include <stdexcept>
-#include <vector>
-#include <cstring>
-#include <mutex>
-#include <atomic>
-
-// Global singleton QuEST environment management
-// QuEST requires a single global environment, but Quregs are independent
-
-class GlobalQuestEnv {
-private:
-    static std::mutex init_mutex;
-    static std::atomic<bool> is_initialized;
-    static std::atomic<int> ref_count;
-    static QuESTEnv* global_env_ptr;
-
-    GlobalQuestEnv() = delete;
-
-public:
-    static QuESTEnv& getInstance() {
-        std::lock_guard<std::mutex> lock(init_mutex);
-
-        if (!is_initialized.load()) {
-            // Initialize QuEST environment in CPU-only mode.
-            // GPU acceleration is handled separately via the CUDA engine builder
-            // (QuestCudaStateVecEngine) which loads a dedicated GPU backend at
-            // runtime via dlopen, allowing a single binary to work on systems
-            // with and without CUDA.
-            initCustomQuESTEnv(/*useDistrib=*/0, /*useGpuAccel=*/0, /*useMultithread=*/0);
-            global_env_ptr = new QuESTEnv(getQuESTEnv());
-            is_initialized = true;
-        }
-
-        return *global_env_ptr;
-    }
-
-    static void addRef() {
-        std::lock_guard<std::mutex> lock(init_mutex);
-        ref_count++;
-    }
-
-    static void releaseRef() {
-        std::lock_guard<std::mutex> lock(init_mutex);
-        ref_count--;
-        // Never finalize - let process termination handle it
-        // This avoids re-initialization issues in tests
-    }
-};
-
-// Static member definitions
-std::mutex GlobalQuestEnv::init_mutex;
-std::atomic<bool> GlobalQuestEnv::is_initialized(false);
-std::atomic<int> GlobalQuestEnv::ref_count(0);
-QuESTEnv* GlobalQuestEnv::global_env_ptr = nullptr;
-
-// Environment handle that each simulator instance gets
-// This provides the illusion of independent environments while sharing the global one
-struct QuestEnvHandle {
-    QuESTEnv cached_env;  // Cache a copy for thread-safe access
-
-    QuestEnvHandle() {
-        cached_env = GlobalQuestEnv::getInstance();
-        GlobalQuestEnv::addRef();
-    }
-
-    ~QuestEnvHandle() {
-        GlobalQuestEnv::releaseRef();
-    }
-
-    // Make it non-copyable but moveable
-    QuestEnvHandle(const QuestEnvHandle&) = delete;
-    QuestEnvHandle& operator=(const QuestEnvHandle&) = delete;
-    QuestEnvHandle(QuestEnvHandle&& other) noexcept
-        : cached_env(other.cached_env) {
-        // Transfer ownership
-        other.cached_env = {};
-    }
-    QuestEnvHandle& operator=(QuestEnvHandle&& other) noexcept {
-        if (this != &other) {
-            GlobalQuestEnv::releaseRef();
-            cached_env = other.cached_env;
-            other.cached_env = {};
-        }
-        return *this;
-    }
-
-    QuESTEnv& getEnv() { return cached_env; }
-};
-
-// Simple handle struct that owns a QuEST Qureg
-struct QuregHandle {
-    Qureg qureg;
-    bool owned;
-
-    QuregHandle(int numQubits, bool isDensity) : owned(true) {
-        if (isDensity) {
-            qureg = createDensityQureg(numQubits);
-            // Initialization will be done from Rust
-        } else {
-            qureg = createQureg(numQubits);
-            // Initialization will be done from Rust
-        }
-    }
-
-    QuregHandle(const Qureg& q) : qureg(q), owned(false) {}
-
-    ~QuregHandle() {
-        if (owned && qureg.cpuAmps != nullptr) {
-            destroyQureg(qureg);
-        }
-    }
-
-    // Make it non-copyable but moveable
-    QuregHandle(const QuregHandle&) = delete;
-    QuregHandle& operator=(const QuregHandle&) = delete;
-    QuregHandle(QuregHandle&&) = default;
-    QuregHandle& operator=(QuregHandle&&) = default;
-};
-
-// Environment management functions
-uint8_t* quest_create_env() {
-    try {
-        return reinterpret_cast<uint8_t*>(new QuestEnvHandle());
-    } catch (const std::exception& e) {
-        throw std::runtime_error(std::string("Failed to create QuEST environment: ") + e.what());
-    }
-}
-
-void quest_destroy_env(uint8_t* env) {
-    if (env) {
-        delete reinterpret_cast<QuestEnvHandle*>(env);
-    }
-}
-
-QuESTEnvInfo quest_get_env_info(uint8_t* env) {
-    auto* handle = reinterpret_cast<QuestEnvHandle*>(env);
-    QuESTEnv& questEnv = handle->getEnv();
-
-    QuESTEnvInfo info;
-    info.is_multithreaded = questEnv.isMultithreaded != 0;
-    info.is_gpu_accelerated = questEnv.isGpuAccelerated != 0;
-    info.is_distributed = questEnv.isDistributed != 0;
-    info.rank = questEnv.rank;
-    info.num_nodes = questEnv.numNodes;
-    return info;
-}
-
-void quest_sync_env(uint8_t* env) {
-    // For thread-safe usage, we avoid global sync operations
-    // Each thread works independently
-}
-
-// Qureg creation and destruction - each is completely independent
-uint8_t* quest_create_qureg(uint8_t* env, int32_t numQubits) {
-    if (numQubits < 1) {
-        throw std::invalid_argument("Number of qubits must be at least 1");
-    }
-    try {
-        return reinterpret_cast<uint8_t*>(new QuregHandle(numQubits, false));
-    } catch (const std::exception& e) {
-        throw std::runtime_error(std::string("Failed to create qureg: ") + e.what());
-    }
-}
-
-uint8_t* quest_create_density_qureg(uint8_t* env, int32_t numQubits) {
-    if (numQubits < 1) {
-        throw std::invalid_argument("Number of qubits must be at least 1");
-    }
-    try {
-        return reinterpret_cast<uint8_t*>(new QuregHandle(numQubits, true));
-    } catch (const std::exception& e) {
-        throw std::runtime_error(std::string("Failed to create density qureg: ") + e.what());
-    }
-}
-
-void quest_destroy_qureg(uint8_t* qureg) {
-    if (qureg) {
-        delete reinterpret_cast<QuregHandle*>(qureg);
-    }
-}
-
-uint8_t* quest_clone_qureg(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    try {
-        // Note: QuregHandle constructor will lock for creation
-        auto* cloned = new QuregHandle(handle->qureg.numQubits, handle->qureg.isDensityMatrix != 0);
-        {
-            setQuregToClone(cloned->qureg, handle->qureg);
-        }
-        return reinterpret_cast<uint8_t*>(cloned);
-    } catch (const std::exception& e) {
-        throw std::runtime_error(std::string("Failed to clone qureg: ") + e.what());
-    }
-}
-
-QuregInfo quest_get_qureg_info(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    QuregInfo info;
-    info.num_qubits = handle->qureg.numQubits;
-    info.num_amps = handle->qureg.numAmps;
-    info.is_density_matrix = handle->qureg.isDensityMatrix != 0;
-    return info;
-}
-
-// State initialization - operates on independent Quregs
-void quest_init_zero_state(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-
-    // Initialize to |00...0⟩ state
-    initZeroState(handle->qureg);
-}
-
-void quest_init_plus_state(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    initPlusState(handle->qureg);
-}
-
-void quest_init_classical_state(uint8_t* qureg, int64_t stateInd) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    initClassicalState(handle->qureg, stateInd);
-}
-
-void quest_init_pure_state(uint8_t* qureg, uint8_t* pureQureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    auto* pureHandle = reinterpret_cast<QuregHandle*>(pureQureg);
-    initPureState(handle->qureg, pureHandle->qureg);
-}
-
-void quest_init_random_state(uint8_t* qureg, rust::Slice<const uint64_t> seed) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    // Convert seed to QuEST format
-    std::vector<unsigned long> questSeed;
-    for (auto s : seed) {
-        questSeed.push_back(static_cast<unsigned long>(s));
-    }
-    // Each qureg gets its own random state, completely independent
-    // Note: QuEST v4 doesn't use seed arrays, just call initRandomPureState
-    initRandomPureState(handle->qureg);
-}
-
-// All quantum operations operate on independent Quregs
-void quest_apply_pauli_x(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyPauliX(handle->qureg, qubit);
-}
-
-void quest_apply_pauli_y(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyPauliY(handle->qureg, qubit);
-}
-
-void quest_apply_pauli_z(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyPauliZ(handle->qureg, qubit);
-}
-
-void quest_apply_hadamard(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyHadamard(handle->qureg, qubit);
-}
-
-void quest_apply_s_gate(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyS(handle->qureg, qubit);
-}
-
-void quest_apply_t_gate(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyT(handle->qureg, qubit);
-}
-
-void quest_apply_phase_shift(uint8_t* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyPhaseShift(handle->qureg, qubit, angle);
-}
-
-void quest_apply_rotation_x(uint8_t* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyRotateX(handle->qureg, qubit, angle);
-}
-
-void quest_apply_rotation_y(uint8_t* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyRotateY(handle->qureg, qubit, angle);
-}
-
-void quest_apply_rotation_z(uint8_t* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyRotateZ(handle->qureg, qubit, angle);
-}
-
-void quest_apply_cnot(uint8_t* qureg, int32_t control, int32_t target) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyControlledPauliX(handle->qureg, control, target);
-}
-
-void quest_apply_cz(uint8_t* qureg, int32_t control, int32_t target) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyTwoQubitPhaseFlip(handle->qureg, control, target);
-}
-
-void quest_apply_swap(uint8_t* qureg, int32_t qubit1, int32_t qubit2) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applySwap(handle->qureg, qubit1, qubit2);
-}
-
-void quest_apply_controlled_phase_shift(uint8_t* qureg, int32_t control, int32_t target, double angle) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    applyTwoQubitPhaseShift(handle->qureg, control, target, angle);
-}
-
-void quest_apply_multi_controlled_pauli_z(uint8_t* qureg, rust::Slice<const int32_t> controls, int32_t target) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    std::vector<int> controlVec(controls.data(), controls.data() + controls.size());
-    applyMultiControlledPauliZ(handle->qureg, controlVec.data(), controlVec.size(), target);
-}
-
-// Measurements - each qureg maintains its own state
-int32_t quest_measure(uint8_t* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return applyQubitMeasurement(handle->qureg, qubit);
-}
-
-int32_t quest_measure_with_stats(uint8_t* qureg, int32_t qubit, double& outcomeProb) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return applyQubitMeasurementAndGetProb(handle->qureg, qubit, &outcomeProb);
-}
-
-double quest_calc_prob_of_outcome(uint8_t* qureg, int32_t qubit, int32_t outcome) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return calcProbOfQubitOutcome(handle->qureg, qubit, outcome);
-}
-
-double quest_apply_forced_measurement(uint8_t* qureg, int32_t qubit, int32_t outcome) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return applyForcedQubitMeasurement(handle->qureg, qubit, outcome);
-}
-
-double quest_calc_total_prob(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return calcTotalProb(handle->qureg);
-}
-
-// Amplitude access - read-only operations on independent states
-double quest_get_real_amp(uint8_t* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return real(getQuregAmp(handle->qureg, index));
-}
-
-double quest_get_imag_amp(uint8_t* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return imag(getQuregAmp(handle->qureg, index));
-}
-
-Complex quest_get_complex_amp(uint8_t* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    qcomp amp = getQuregAmp(handle->qureg, index);
-    Complex result;
-    result.real = real(amp);
-    result.imag = imag(amp);
-    return result;
-}
-
-double quest_get_prob_amp(uint8_t* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return calcProbOfBasisState(handle->qureg, index);
-}
-
-int64_t quest_get_num_amps(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return handle->qureg.numAmps;
-}
-
-int32_t quest_get_num_qubits(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return handle->qureg.numQubits;
-}
-
-bool quest_is_density_matrix(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return handle->qureg.isDensityMatrix != 0;
-}
-
-// Utility functions for independent quregs
-Complex quest_calc_inner_product(uint8_t* qureg1, uint8_t* qureg2) {
-    auto* handle1 = reinterpret_cast<QuregHandle*>(qureg1);
-    auto* handle2 = reinterpret_cast<QuregHandle*>(qureg2);
-    qcomp prod = calcInnerProduct(handle1->qureg, handle2->qureg);
-    Complex result;
-    result.real = real(prod);
-    result.imag = imag(prod);
-    return result;
-}
-
-double quest_calc_fidelity(uint8_t* qureg1, uint8_t* qureg2) {
-    auto* handle1 = reinterpret_cast<QuregHandle*>(qureg1);
-    auto* handle2 = reinterpret_cast<QuregHandle*>(qureg2);
-    return calcFidelity(handle1->qureg, handle2->qureg);
-}
-
-double quest_calc_purity(uint8_t* qureg) {
-    auto* handle = reinterpret_cast<QuregHandle*>(qureg);
-    return calcPurity(handle->qureg);
-}
diff --git a/crates/pecos-quest/src/bridge.rs b/crates/pecos-quest/src/bridge.rs
deleted file mode 100644
index c86a0ac6d..000000000
--- a/crates/pecos-quest/src/bridge.rs
+++ /dev/null
@@ -1,112 +0,0 @@
-//! CXX bridge definitions for `QuEST` simulator
-
-#[cxx::bridge]
-pub mod ffi {
-    // QuEST environment struct
-    #[derive(Debug, Clone)]
-    struct QuESTEnvInfo {
-        pub is_multithreaded: bool,
-        pub is_gpu_accelerated: bool,
-        pub is_distributed: bool,
-        pub rank: i32,
-        pub num_nodes: i32,
-    }
-
-    // Qureg info struct for reporting parameters
-    #[derive(Debug, Clone)]
-    struct QuregInfo {
-        pub num_qubits: i32,
-        pub num_amps: i64,
-        pub is_density_matrix: bool,
-    }
-
-    // Complex number representation
-    #[derive(Debug, Clone, Copy)]
-    struct Complex {
-        pub real: f64,
-        pub imag: f64,
-    }
-
-    #[allow(clippy::missing_safety_doc)]
-    unsafe extern "C++" {
-        include!("quest_ffi.h");
-
-        // Environment management
-        #[must_use]
-        fn quest_create_env() -> *mut u8;
-        unsafe fn quest_destroy_env(env: *mut u8);
-        unsafe fn quest_get_env_info(env: *mut u8) -> QuESTEnvInfo;
-        unsafe fn quest_sync_env(env: *mut u8);
-
-        // Qureg creation and destruction
-        unsafe fn quest_create_qureg(env: *mut u8, num_qubits: i32) -> *mut u8;
-        unsafe fn quest_create_density_qureg(env: *mut u8, num_qubits: i32) -> *mut u8;
-        unsafe fn quest_destroy_qureg(qureg: *mut u8);
-        unsafe fn quest_clone_qureg(qureg: *mut u8) -> *mut u8;
-        unsafe fn quest_get_qureg_info(qureg: *mut u8) -> QuregInfo;
-
-        // State initialization
-        unsafe fn quest_init_zero_state(qureg: *mut u8);
-        unsafe fn quest_init_plus_state(qureg: *mut u8);
-        unsafe fn quest_init_classical_state(qureg: *mut u8, state_ind: i64);
-        unsafe fn quest_init_pure_state(qureg: *mut u8, pure_qureg: *mut u8);
-        unsafe fn quest_init_random_state(qureg: *mut u8, seed: &[u64]);
-
-        // Single-qubit gates
-        unsafe fn quest_apply_pauli_x(qureg: *mut u8, qubit: i32);
-        unsafe fn quest_apply_pauli_y(qureg: *mut u8, qubit: i32);
-        unsafe fn quest_apply_pauli_z(qureg: *mut u8, qubit: i32);
-        unsafe fn quest_apply_hadamard(qureg: *mut u8, qubit: i32);
-        unsafe fn quest_apply_s_gate(qureg: *mut u8, qubit: i32);
-        unsafe fn quest_apply_t_gate(qureg: *mut u8, qubit: i32);
-        unsafe fn quest_apply_phase_shift(qureg: *mut u8, qubit: i32, angle: f64);
-        unsafe fn quest_apply_rotation_x(qureg: *mut u8, qubit: i32, angle: f64);
-        unsafe fn quest_apply_rotation_y(qureg: *mut u8, qubit: i32, angle: f64);
-        unsafe fn quest_apply_rotation_z(qureg: *mut u8, qubit: i32, angle: f64);
-
-        // Two-qubit gates
-        unsafe fn quest_apply_cnot(qureg: *mut u8, control: i32, target: i32);
-        unsafe fn quest_apply_cz(qureg: *mut u8, control: i32, target: i32);
-        unsafe fn quest_apply_swap(qureg: *mut u8, qubit1: i32, qubit2: i32);
-        unsafe fn quest_apply_controlled_phase_shift(
-            qureg: *mut u8,
-            control: i32,
-            target: i32,
-            angle: f64,
-        );
-
-        // Multi-controlled gates
-        unsafe fn quest_apply_multi_controlled_pauli_z(
-            qureg: *mut u8,
-            controls: &[i32],
-            target: i32,
-        );
-
-        // Measurements
-        unsafe fn quest_measure(qureg: *mut u8, qubit: i32) -> i32;
-        unsafe fn quest_measure_with_stats(
-            qureg: *mut u8,
-            qubit: i32,
-            outcome_prob: &mut f64,
-        ) -> i32;
-        unsafe fn quest_calc_prob_of_outcome(qureg: *mut u8, qubit: i32, outcome: i32) -> f64;
-        unsafe fn quest_apply_forced_measurement(qureg: *mut u8, qubit: i32, outcome: i32) -> f64;
-        unsafe fn quest_calc_total_prob(qureg: *mut u8) -> f64;
-
-        // Amplitude access
-        unsafe fn quest_get_real_amp(qureg: *mut u8, index: i64) -> f64;
-        unsafe fn quest_get_imag_amp(qureg: *mut u8, index: i64) -> f64;
-        unsafe fn quest_get_complex_amp(qureg: *mut u8, index: i64) -> Complex;
-        unsafe fn quest_get_prob_amp(qureg: *mut u8, index: i64) -> f64;
-
-        // State vector properties
-        unsafe fn quest_get_num_amps(qureg: *mut u8) -> i64;
-        unsafe fn quest_get_num_qubits(qureg: *mut u8) -> i32;
-        unsafe fn quest_is_density_matrix(qureg: *mut u8) -> bool;
-
-        // Utility functions
-        unsafe fn quest_calc_inner_product(qureg1: *mut u8, qureg2: *mut u8) -> Complex;
-        unsafe fn quest_calc_fidelity(qureg1: *mut u8, qureg2: *mut u8) -> f64;
-        unsafe fn quest_calc_purity(qureg: *mut u8) -> f64;
-    }
-}
diff --git a/crates/pecos-quest/src/bridge_cuda.cpp b/crates/pecos-quest/src/bridge_cuda.cpp
deleted file mode 100644
index bbce81bc8..000000000
--- a/crates/pecos-quest/src/bridge_cuda.cpp
+++ /dev/null
@@ -1,321 +0,0 @@
-//! GPU-specific bridge for PECOS QuEST
-//!
-//! This file is compiled into a separate shared library (libpecos_quest_cuda.so)
-//! that is loaded at runtime via dlopen when GPU acceleration is requested.
-//! This allows the main library to work on systems without CUDA installed.
-//!
-//! Note: This file is intentionally self-contained and does not depend on
-//! quest_ffi.h or CXX bridge headers, as it needs to compile independently
-//! with nvcc for CUDA support.
-
-#include "quest.h"
-
-#include <cstdint>
-#include <stdexcept>
-#include <mutex>
-#include <atomic>
-
-// GPU environment info structure - must match Rust's CudaEnvInfo in cuda_loader.rs
-struct CudaEnvInfo {
-    bool is_multithreaded;
-    bool is_gpu_accelerated;
-    bool is_distributed;
-    int32_t rank;
-    int32_t num_nodes;
-};
-
-// Global singleton QuEST environment management for GPU
-// Same pattern as bridge.cpp but for the GPU library
-class GpuGlobalQuestEnv {
-private:
-    static std::mutex init_mutex;
-    static std::atomic<bool> is_initialized;
-    static std::atomic<int> ref_count;
-    static QuESTEnv* global_env_ptr;
-
-    GpuGlobalQuestEnv() = delete;
-
-public:
-    static QuESTEnv& getInstance() {
-        std::lock_guard<std::mutex> lock(init_mutex);
-
-        if (!is_initialized.load()) {
-            // Initialize QuEST environment only once per process
-            initQuESTEnv();
-            global_env_ptr = new QuESTEnv(getQuESTEnv());
-            is_initialized = true;
-        }
-
-        return *global_env_ptr;
-    }
-
-    static void addRef() {
-        std::lock_guard<std::mutex> lock(init_mutex);
-        ref_count++;
-    }
-
-    static void releaseRef() {
-        std::lock_guard<std::mutex> lock(init_mutex);
-        ref_count--;
-        // Never finalize - let process termination handle it
-    }
-};
-
-// Static member definitions
-std::mutex GpuGlobalQuestEnv::init_mutex;
-std::atomic<bool> GpuGlobalQuestEnv::is_initialized(false);
-std::atomic<int> GpuGlobalQuestEnv::ref_count(0);
-QuESTEnv* GpuGlobalQuestEnv::global_env_ptr = nullptr;
-
-// GPU environment handle
-struct GpuQuestEnvHandle {
-    QuESTEnv cached_env;
-
-    GpuQuestEnvHandle() {
-        cached_env = GpuGlobalQuestEnv::getInstance();
-        GpuGlobalQuestEnv::addRef();
-    }
-
-    ~GpuQuestEnvHandle() {
-        GpuGlobalQuestEnv::releaseRef();
-    }
-
-    // Non-copyable
-    GpuQuestEnvHandle(const GpuQuestEnvHandle&) = delete;
-    GpuQuestEnvHandle& operator=(const GpuQuestEnvHandle&) = delete;
-
-    QuESTEnv& getEnv() { return cached_env; }
-};
-
-// GPU Qureg handle
-struct GpuQuregHandle {
-    Qureg qureg;
-    bool owned;
-
-    GpuQuregHandle(int numQubits, bool isDensity) : owned(true) {
-        if (isDensity) {
-            qureg = createDensityQureg(numQubits);
-        } else {
-            qureg = createQureg(numQubits);
-        }
-    }
-
-    ~GpuQuregHandle() {
-        if (owned && qureg.cpuAmps != nullptr) {
-            destroyQureg(qureg);
-        }
-    }
-
-    // Non-copyable
-    GpuQuregHandle(const GpuQuregHandle&) = delete;
-    GpuQuregHandle& operator=(const GpuQuregHandle&) = delete;
-};
-
-// Export C functions for dlopen
-extern "C" {
-
-// Environment management
-void* pecos_quest_cuda_create_env() {
-    try {
-        return reinterpret_cast<void*>(new GpuQuestEnvHandle());
-    } catch (const std::exception& e) {
-        return nullptr;
-    }
-}
-
-void pecos_quest_cuda_destroy_env(void* env) {
-    if (env) {
-        delete reinterpret_cast<GpuQuestEnvHandle*>(env);
-    }
-}
-
-CudaEnvInfo pecos_quest_cuda_get_env_info(void* env) {
-    auto* handle = reinterpret_cast<GpuQuestEnvHandle*>(env);
-    QuESTEnv& questEnv = handle->getEnv();
-
-    CudaEnvInfo info;
-    info.is_multithreaded = questEnv.isMultithreaded != 0;
-    info.is_gpu_accelerated = questEnv.isGpuAccelerated != 0;
-    info.is_distributed = questEnv.isDistributed != 0;
-    info.rank = questEnv.rank;
-    info.num_nodes = questEnv.numNodes;
-    return info;
-}
-
-// Qureg management
-void* pecos_quest_cuda_create_qureg(void* env, int32_t numQubits) {
-    if (numQubits < 1) {
-        return nullptr;
-    }
-    try {
-        return reinterpret_cast<void*>(new GpuQuregHandle(numQubits, false));
-    } catch (const std::exception& e) {
-        return nullptr;
-    }
-}
-
-void* pecos_quest_cuda_create_density_qureg(void* env, int32_t numQubits) {
-    if (numQubits < 1) {
-        return nullptr;
-    }
-    try {
-        return reinterpret_cast<void*>(new GpuQuregHandle(numQubits, true));
-    } catch (const std::exception& e) {
-        return nullptr;
-    }
-}
-
-void pecos_quest_cuda_destroy_qureg(void* qureg) {
-    if (qureg) {
-        delete reinterpret_cast<GpuQuregHandle*>(qureg);
-    }
-}
-
-// State initialization
-void pecos_quest_cuda_init_zero_state(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    initZeroState(handle->qureg);
-}
-
-void pecos_quest_cuda_init_plus_state(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    initPlusState(handle->qureg);
-}
-
-void pecos_quest_cuda_init_classical_state(void* qureg, int64_t stateInd) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    initClassicalState(handle->qureg, stateInd);
-}
-
-// Single-qubit gates
-void pecos_quest_cuda_apply_pauli_x(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyPauliX(handle->qureg, qubit);
-}
-
-void pecos_quest_cuda_apply_pauli_y(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyPauliY(handle->qureg, qubit);
-}
-
-void pecos_quest_cuda_apply_pauli_z(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyPauliZ(handle->qureg, qubit);
-}
-
-void pecos_quest_cuda_apply_hadamard(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyHadamard(handle->qureg, qubit);
-}
-
-void pecos_quest_cuda_apply_s_gate(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyS(handle->qureg, qubit);
-}
-
-void pecos_quest_cuda_apply_t_gate(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyT(handle->qureg, qubit);
-}
-
-void pecos_quest_cuda_apply_phase_shift(void* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyPhaseShift(handle->qureg, qubit, angle);
-}
-
-// Rotation gates
-void pecos_quest_cuda_apply_rotation_x(void* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyRotateX(handle->qureg, qubit, angle);
-}
-
-void pecos_quest_cuda_apply_rotation_y(void* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyRotateY(handle->qureg, qubit, angle);
-}
-
-void pecos_quest_cuda_apply_rotation_z(void* qureg, int32_t qubit, double angle) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyRotateZ(handle->qureg, qubit, angle);
-}
-
-// Two-qubit gates
-void pecos_quest_cuda_apply_cnot(void* qureg, int32_t control, int32_t target) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyControlledPauliX(handle->qureg, control, target);
-}
-
-void pecos_quest_cuda_apply_cz(void* qureg, int32_t control, int32_t target) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyTwoQubitPhaseFlip(handle->qureg, control, target);
-}
-
-void pecos_quest_cuda_apply_swap(void* qureg, int32_t qubit1, int32_t qubit2) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applySwap(handle->qureg, qubit1, qubit2);
-}
-
-void pecos_quest_cuda_apply_controlled_phase_shift(void* qureg, int32_t control, int32_t target, double angle) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    applyTwoQubitPhaseShift(handle->qureg, control, target, angle);
-}
-
-// Measurement
-int32_t pecos_quest_cuda_measure(void* qureg, int32_t qubit) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return applyQubitMeasurement(handle->qureg, qubit);
-}
-
-double pecos_quest_cuda_calc_prob_of_outcome(void* qureg, int32_t qubit, int32_t outcome) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return calcProbOfQubitOutcome(handle->qureg, qubit, outcome);
-}
-
-double pecos_quest_cuda_apply_forced_measurement(void* qureg, int32_t qubit, int32_t outcome) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return applyForcedQubitMeasurement(handle->qureg, qubit, outcome);
-}
-
-// Amplitude access
-double pecos_quest_cuda_get_real_amp(void* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return real(getQuregAmp(handle->qureg, index));
-}
-
-double pecos_quest_cuda_get_imag_amp(void* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return imag(getQuregAmp(handle->qureg, index));
-}
-
-double pecos_quest_cuda_get_prob_amp(void* qureg, int64_t index) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return calcProbOfBasisState(handle->qureg, index);
-}
-
-double pecos_quest_cuda_calc_total_prob(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return calcTotalProb(handle->qureg);
-}
-
-double pecos_quest_cuda_calc_purity(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return calcPurity(handle->qureg);
-}
-
-// Info
-int64_t pecos_quest_cuda_get_num_amps(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return handle->qureg.numAmps;
-}
-
-int32_t pecos_quest_cuda_get_num_qubits(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return handle->qureg.numQubits;
-}
-
-bool pecos_quest_cuda_is_density_matrix(void* qureg) {
-    auto* handle = reinterpret_cast<GpuQuregHandle*>(qureg);
-    return handle->qureg.isDensityMatrix != 0;
-}
-
-} // extern "C"
diff --git a/crates/pecos-quest/src/cuda_loader.rs b/crates/pecos-quest/src/cuda_loader.rs
deleted file mode 100644
index acaad38a5..000000000
--- a/crates/pecos-quest/src/cuda_loader.rs
+++ /dev/null
@@ -1,456 +0,0 @@
-//! Runtime loader for the CUDA-accelerated `QuEST` backend
-//!
-//! This module provides functionality to dynamically load the PECOS `QuEST` CUDA
-//! backend library at runtime, enabling a single binary to work on both systems
-//! with and without NVIDIA CUDA installed.
-
-use libloading::{Library, Symbol};
-use std::path::PathBuf;
-use std::sync::OnceLock;
-use thiserror::Error;
-
-/// Errors that can occur when loading the `QuEST` CUDA backend
-#[derive(Error, Debug, Clone)]
-pub enum CudaLoadError {
-    #[error("QuEST CUDA backend not found. Searched paths: {searched_paths}")]
-    LibraryNotFound { searched_paths: String },
-
-    #[error("Failed to load QuEST CUDA backend: {0}")]
-    LoadFailed(String),
-
-    #[error("Missing symbol in QuEST CUDA backend: {0}")]
-    MissingSymbol(String),
-
-    #[error("NVIDIA CUDA runtime not available: {0}")]
-    CudaUnavailable(String),
-}
-
-/// Result type for CUDA loading operations
-pub type CudaResult<T> = std::result::Result<T, CudaLoadError>;
-
-/// `QuEST` CUDA backend that holds the loaded library and function pointers
-pub struct CudaBackend {
-    /// Keep the backend library loaded for the lifetime of this struct
-    _library: Library,
-
-    // Function pointers for QuEST CUDA backend operations
-    // Environment management
-    pub create_env: unsafe extern "C" fn() -> *mut u8,
-    pub destroy_env: unsafe extern "C" fn(*mut u8),
-    pub get_env_info: unsafe extern "C" fn(*mut u8) -> CudaEnvInfo,
-
-    // Qureg management
-    pub create_qureg: unsafe extern "C" fn(*mut u8, i32) -> *mut u8,
-    pub create_density_qureg: unsafe extern "C" fn(*mut u8, i32) -> *mut u8,
-    pub destroy_qureg: unsafe extern "C" fn(*mut u8),
-
-    // State initialization
-    pub init_zero_state: unsafe extern "C" fn(*mut u8),
-    pub init_plus_state: unsafe extern "C" fn(*mut u8),
-    pub init_classical_state: unsafe extern "C" fn(*mut u8, i64),
-
-    // Single-qubit gates
-    pub apply_pauli_x: unsafe extern "C" fn(*mut u8, i32),
-    pub apply_pauli_y: unsafe extern "C" fn(*mut u8, i32),
-    pub apply_pauli_z: unsafe extern "C" fn(*mut u8, i32),
-    pub apply_hadamard: unsafe extern "C" fn(*mut u8, i32),
-    pub apply_s_gate: unsafe extern "C" fn(*mut u8, i32),
-    pub apply_t_gate: unsafe extern "C" fn(*mut u8, i32),
-    pub apply_phase_shift: unsafe extern "C" fn(*mut u8, i32, f64),
-
-    // Rotation gates
-    pub apply_rotation_x: unsafe extern "C" fn(*mut u8, i32, f64),
-    pub apply_rotation_y: unsafe extern "C" fn(*mut u8, i32, f64),
-    pub apply_rotation_z: unsafe extern "C" fn(*mut u8, i32, f64),
-
-    // Two-qubit gates
-    pub apply_cnot: unsafe extern "C" fn(*mut u8, i32, i32),
-    pub apply_cz: unsafe extern "C" fn(*mut u8, i32, i32),
-    pub apply_swap: unsafe extern "C" fn(*mut u8, i32, i32),
-    pub apply_controlled_phase_shift: unsafe extern "C" fn(*mut u8, i32, i32, f64),
-
-    // Measurement
-    pub measure: unsafe extern "C" fn(*mut u8, i32) -> i32,
-    pub calc_prob_of_outcome: unsafe extern "C" fn(*mut u8, i32, i32) -> f64,
-    pub apply_forced_measurement: unsafe extern "C" fn(*mut u8, i32, i32) -> f64,
-
-    // Amplitude access
-    pub get_real_amp: unsafe extern "C" fn(*mut u8, i64) -> f64,
-    pub get_imag_amp: unsafe extern "C" fn(*mut u8, i64) -> f64,
-    pub get_prob_amp: unsafe extern "C" fn(*mut u8, i64) -> f64,
-    pub calc_total_prob: unsafe extern "C" fn(*mut u8) -> f64,
-    pub calc_purity: unsafe extern "C" fn(*mut u8) -> f64,
-
-    // Info
-    pub get_num_amps: unsafe extern "C" fn(*mut u8) -> i64,
-    pub get_num_qubits: unsafe extern "C" fn(*mut u8) -> i32,
-    pub is_density_matrix: unsafe extern "C" fn(*mut u8) -> bool,
-}
-
-/// CUDA environment info returned by the CUDA library
-#[repr(C)]
-#[derive(Debug, Clone, Copy)]
-pub struct CudaEnvInfo {
-    pub is_multithreaded: bool,
-    pub is_gpu_accelerated: bool,
-    pub is_distributed: bool,
-    pub rank: i32,
-    pub num_nodes: i32,
-}
-
-/// Global CUDA backend instance (lazily initialized)
-static CUDA_BACKEND: OnceLock<Result<CudaBackend, CudaLoadError>> = OnceLock::new();
-
-/// Wrapper for the CUDA environment handle that can be shared across threads
-///
-/// # Safety
-/// The CUDA environment handle is thread-safe through `QuEST`'s internal synchronization.
-/// All operations on the environment go through the CUDA backend functions which handle
-/// synchronization appropriately.
-struct SharedEnvHandle(*mut u8);
-
-// SAFETY: The CUDA environment handle is thread-safe through QuEST's internal synchronization
-// and is only accessed through the loaded CUDA backend functions which are also thread-safe.
-unsafe impl Send for SharedEnvHandle {}
-unsafe impl Sync for SharedEnvHandle {}
-
-/// Shared CUDA environment handle (lazily initialized, never destroyed)
-///
-/// `QuEST`'s CUDA environment has issues with destruction and recreation - once destroyed,
-/// subsequent attempts to create a new environment often fail. This static environment
-/// is shared across all `QuestCudaStateVecEngine` instances and persists for the lifetime
-/// of the process. Only the quantum registers (quregs) are created/destroyed per engine.
-static SHARED_CUDA_ENV: OnceLock<SharedEnvHandle> = OnceLock::new();
-
-/// Get or create the shared CUDA environment
-///
-/// This function returns the shared CUDA environment handle, creating it if necessary.
-/// The environment is never destroyed, avoiding `QuEST` CUDA recreation issues.
-///
-/// # Errors
-/// Returns `CudaLoadError` if:
-/// - The CUDA backend cannot be loaded
-/// - The environment cannot be created
-pub fn get_shared_cuda_env() -> Result<(*mut u8, &'static CudaBackend), CudaLoadError> {
-    let backend = try_load_cuda().map_err(std::clone::Clone::clone)?;
-
-    let env = SHARED_CUDA_ENV.get_or_init(|| {
-        let env_handle = unsafe { (backend.create_env)() };
-        if env_handle.is_null() {
-            log::error!("Failed to create shared CUDA QuEST environment");
-            SharedEnvHandle(std::ptr::null_mut())
-        } else {
-            log::info!("Created shared CUDA QuEST environment");
-            SharedEnvHandle(env_handle)
-        }
-    });
-
-    if env.0.is_null() {
-        return Err(CudaLoadError::CudaUnavailable(
-            "Failed to create shared CUDA environment".to_string(),
-        ));
-    }
-
-    Ok((env.0, backend))
-}
-
-/// Library name varies by platform
-#[cfg(target_os = "linux")]
-const CUDA_LIB_NAME: &str = "libpecos_quest_cuda.so";
-#[cfg(target_os = "macos")]
-const CUDA_LIB_NAME: &str = "libpecos_quest_cuda.dylib";
-#[cfg(target_os = "windows")]
-const CUDA_LIB_NAME: &str = "pecos_quest_cuda.dll";
-
-/// Attempt to load the CUDA backend library.
-///
-/// This function is thread-safe and will only attempt to load the library once.
-/// Subsequent calls return the cached result.
-///
-/// # Returns
-/// - `Ok(&CudaBackend)` if the CUDA library was loaded successfully
-/// - `Err(&CudaLoadError)` if loading failed (CUDA not available, library not found, etc.)
-///
-/// # Errors
-/// Returns a `CudaLoadError` if:
-/// - The CUDA library cannot be found in any of the search paths (`LibraryNotFound`)
-/// - The library exists but cannot be loaded (`LoadFailed`)
-/// - Required symbols are missing from the library (`MissingSymbol`)
-pub fn try_load_cuda() -> Result<&'static CudaBackend, &'static CudaLoadError> {
-    CUDA_BACKEND.get_or_init(load_cuda_library).as_ref()
-}
-
-/// Check if CUDA acceleration is available without fully initializing it
-#[must_use]
-pub fn is_cuda_available() -> bool {
-    try_load_cuda().is_ok()
-}
-
-/// Get the search paths for the CUDA library
-fn get_cuda_library_search_paths() -> Vec<PathBuf> {
-    let mut paths = vec![];
-
-    // 1. Environment variable set by Python package (highest priority)
-    if let Ok(pkg_path) = std::env::var("PECOS_QUEST_CUDA_LIB") {
-        paths.push(PathBuf::from(pkg_path));
-    }
-
-    // 2. Same directory as the current executable
-    if let Ok(exe_path) = std::env::current_exe()
-        && let Some(dir) = exe_path.parent()
-    {
-        paths.push(dir.join(CUDA_LIB_NAME));
-    }
-
-    // 3. PECOS home directory (~/.pecos/lib/)
-    if let Some(home) = dirs::home_dir() {
-        paths.push(home.join(".pecos").join("lib").join(CUDA_LIB_NAME));
-    }
-
-    // 4. Cargo target directory (for development)
-    // Check both debug and release directories relative to current dir
-    let cargo_target_paths = [
-        PathBuf::from("target/release").join(CUDA_LIB_NAME),
-        PathBuf::from("target/debug").join(CUDA_LIB_NAME),
-    ];
-    paths.extend(cargo_target_paths);
-
-    // 5. System library path (let the dynamic linker search)
-    paths.push(PathBuf::from(CUDA_LIB_NAME));
-
-    paths
-}
-
-/// Load the CUDA library from one of the search paths
-fn load_cuda_library() -> Result<CudaBackend, CudaLoadError> {
-    let search_paths = get_cuda_library_search_paths();
-
-    for path in &search_paths {
-        log::debug!("Trying to load CUDA library from: {}", path.display());
-
-        match unsafe { Library::new(path) } {
-            Ok(lib) => {
-                log::info!("Loaded CUDA library from: {}", path.display());
-                return load_symbols(lib);
-            }
-            Err(e) => {
-                log::debug!("Failed to load from {}: {e}", path.display());
-            }
-        }
-    }
-
-    let searched = search_paths
-        .iter()
-        .map(|p| p.display().to_string())
-        .collect::<Vec<_>>()
-        .join(", ");
-
-    Err(CudaLoadError::LibraryNotFound {
-        searched_paths: searched,
-    })
-}
-
-/// Helper macro to load a symbol from the library
-macro_rules! load_symbol {
-    ($lib:expr, $name:expr, $type:ty) => {{
-        let symbol: Symbol<$type> = $lib
-            .get(concat!("pecos_quest_cuda_", $name, "\0").as_bytes())
-            .map_err(|e| CudaLoadError::MissingSymbol(format!("{}: {e}", $name)))?;
-        *symbol
-    }};
-}
-
-/// Load all required symbols from the CUDA library
-#[allow(clippy::too_many_lines)]
-fn load_symbols(lib: Library) -> Result<CudaBackend, CudaLoadError> {
-    // Load all symbols and extract function pointers
-    // We use a macro to reduce boilerplate
-    let backend = unsafe {
-        CudaBackend {
-            // Environment management
-            create_env: load_symbol!(lib, "create_env", unsafe extern "C" fn() -> *mut u8),
-            destroy_env: load_symbol!(lib, "destroy_env", unsafe extern "C" fn(*mut u8)),
-            get_env_info: load_symbol!(
-                lib,
-                "get_env_info",
-                unsafe extern "C" fn(*mut u8) -> CudaEnvInfo
-            ),
-
-            // Qureg management
-            create_qureg: load_symbol!(
-                lib,
-                "create_qureg",
-                unsafe extern "C" fn(*mut u8, i32) -> *mut u8
-            ),
-            create_density_qureg: load_symbol!(
-                lib,
-                "create_density_qureg",
-                unsafe extern "C" fn(*mut u8, i32) -> *mut u8
-            ),
-            destroy_qureg: load_symbol!(lib, "destroy_qureg", unsafe extern "C" fn(*mut u8)),
-
-            // State initialization
-            init_zero_state: load_symbol!(lib, "init_zero_state", unsafe extern "C" fn(*mut u8)),
-            init_plus_state: load_symbol!(lib, "init_plus_state", unsafe extern "C" fn(*mut u8)),
-            init_classical_state: load_symbol!(
-                lib,
-                "init_classical_state",
-                unsafe extern "C" fn(*mut u8, i64)
-            ),
-
-            // Single-qubit gates
-            apply_pauli_x: load_symbol!(lib, "apply_pauli_x", unsafe extern "C" fn(*mut u8, i32)),
-            apply_pauli_y: load_symbol!(lib, "apply_pauli_y", unsafe extern "C" fn(*mut u8, i32)),
-            apply_pauli_z: load_symbol!(lib, "apply_pauli_z", unsafe extern "C" fn(*mut u8, i32)),
-            apply_hadamard: load_symbol!(lib, "apply_hadamard", unsafe extern "C" fn(*mut u8, i32)),
-            apply_s_gate: load_symbol!(lib, "apply_s_gate", unsafe extern "C" fn(*mut u8, i32)),
-            apply_t_gate: load_symbol!(lib, "apply_t_gate", unsafe extern "C" fn(*mut u8, i32)),
-            apply_phase_shift: load_symbol!(
-                lib,
-                "apply_phase_shift",
-                unsafe extern "C" fn(*mut u8, i32, f64)
-            ),
-
-            // Rotation gates
-            apply_rotation_x: load_symbol!(
-                lib,
-                "apply_rotation_x",
-                unsafe extern "C" fn(*mut u8, i32, f64)
-            ),
-            apply_rotation_y: load_symbol!(
-                lib,
-                "apply_rotation_y",
-                unsafe extern "C" fn(*mut u8, i32, f64)
-            ),
-            apply_rotation_z: load_symbol!(
-                lib,
-                "apply_rotation_z",
-                unsafe extern "C" fn(*mut u8, i32, f64)
-            ),
-
-            // Two-qubit gates
-            apply_cnot: load_symbol!(lib, "apply_cnot", unsafe extern "C" fn(*mut u8, i32, i32)),
-            apply_cz: load_symbol!(lib, "apply_cz", unsafe extern "C" fn(*mut u8, i32, i32)),
-            apply_swap: load_symbol!(lib, "apply_swap", unsafe extern "C" fn(*mut u8, i32, i32)),
-            apply_controlled_phase_shift: load_symbol!(
-                lib,
-                "apply_controlled_phase_shift",
-                unsafe extern "C" fn(*mut u8, i32, i32, f64)
-            ),
-
-            // Measurement
-            measure: load_symbol!(lib, "measure", unsafe extern "C" fn(*mut u8, i32) -> i32),
-            calc_prob_of_outcome: load_symbol!(
-                lib,
-                "calc_prob_of_outcome",
-                unsafe extern "C" fn(*mut u8, i32, i32) -> f64
-            ),
-            apply_forced_measurement: load_symbol!(
-                lib,
-                "apply_forced_measurement",
-                unsafe extern "C" fn(*mut u8, i32, i32) -> f64
-            ),
-
-            // Amplitude access
-            get_real_amp: load_symbol!(
-                lib,
-                "get_real_amp",
-                unsafe extern "C" fn(*mut u8, i64) -> f64
-            ),
-            get_imag_amp: load_symbol!(
-                lib,
-                "get_imag_amp",
-                unsafe extern "C" fn(*mut u8, i64) -> f64
-            ),
-            get_prob_amp: load_symbol!(
-                lib,
-                "get_prob_amp",
-                unsafe extern "C" fn(*mut u8, i64) -> f64
-            ),
-            calc_total_prob: load_symbol!(
-                lib,
-                "calc_total_prob",
-                unsafe extern "C" fn(*mut u8) -> f64
-            ),
-            calc_purity: load_symbol!(lib, "calc_purity", unsafe extern "C" fn(*mut u8) -> f64),
-
-            // Info
-            get_num_amps: load_symbol!(lib, "get_num_amps", unsafe extern "C" fn(*mut u8) -> i64),
-            get_num_qubits: load_symbol!(
-                lib,
-                "get_num_qubits",
-                unsafe extern "C" fn(*mut u8) -> i32
-            ),
-            is_density_matrix: load_symbol!(
-                lib,
-                "is_density_matrix",
-                unsafe extern "C" fn(*mut u8) -> bool
-            ),
-
-            // Keep library loaded
-            _library: lib,
-        }
-    };
-
-    Ok(backend)
-}
-
-/// Get a detailed error message for when CUDA acceleration is requested but unavailable
-#[must_use]
-pub fn cuda_unavailable_error_message() -> String {
-    let search_paths = get_cuda_library_search_paths();
-    let paths_str = search_paths
-        .iter()
-        .map(|p| format!("  - {}", p.display()))
-        .collect::<Vec<_>>()
-        .join("\n");
-
-    format!(
-        r"CUDA acceleration requested but not available.
-
-Possible causes:
-  - NVIDIA CUDA runtime (libcudart.so, libcublas.so) is not installed
-  - No NVIDIA GPU driver is installed
-  - The QuEST CUDA backend ({CUDA_LIB_NAME}) was not found
-
-Searched locations:
-{paths_str}
-
-Solutions:
-  - Install NVIDIA CUDA Toolkit: https://developer.nvidia.com/cuda-downloads
-  - Verify GPU availability: nvidia-smi
-  - Set PECOS_QUEST_CUDA_LIB environment variable to the backend library path
-  - Use CPU mode by setting use_cuda=False"
-    )
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_search_paths_not_empty() {
-        let paths = get_cuda_library_search_paths();
-        assert!(!paths.is_empty(), "Should have at least one search path");
-    }
-
-    #[test]
-    fn test_cuda_load_returns_result() {
-        // This test just verifies the function doesn't panic
-        // On systems without CUDA, it should return an error
-        let result = try_load_cuda();
-        // Either success or error is fine, we just verify it works
-        match result {
-            Ok(_) => println!("CUDA library loaded successfully"),
-            Err(e) => println!("CUDA library not available: {e}"),
-        }
-    }
-
-    #[test]
-    fn test_error_message_is_helpful() {
-        let msg = cuda_unavailable_error_message();
-        assert!(msg.contains("CUDA acceleration requested"));
-        assert!(msg.contains("CUDA"));
-        assert!(msg.contains("nvidia-smi"));
-    }
-}
diff --git a/crates/pecos-quest/src/gpu_stubs.cpp b/crates/pecos-quest/src/gpu_stubs.cpp
deleted file mode 100644
index 117df91c6..000000000
--- a/crates/pecos-quest/src/gpu_stubs.cpp
+++ /dev/null
@@ -1,958 +0,0 @@
-// Minimal GPU stub implementations for CPU-only build
-// These functions are referenced by QuEST code but not actually used in CPU mode
-
-#include <complex>
-#include <vector>
-#include <cstddef>
-#include <algorithm>
-
-// Forward declare Qureg structure to match QuEST's definition in qureg.h
-typedef long long qindex;
-typedef std::complex<double> qcomp;
-
-struct Qureg {
-    // deployment configuration
-    int isMultithreaded;
-    int isGpuAccelerated;
-    int isDistributed;
-
-    // distributed configuration
-    int rank;
-    int numNodes;
-    int logNumNodes;
-
-    // dimension
-    int isDensityMatrix;
-    int numQubits;
-    qindex numAmps;
-    qindex logNumAmps;
-
-    // distributed load
-    qindex numAmpsPerNode;
-    qindex logNumAmpsPerNode;
-    qindex logNumColsPerNode;
-
-    // amplitudes in CPU and GPU memory
-    qcomp* cpuAmps;
-    qcomp* gpuAmps;
-
-    // communication buffer in CPU and GPU memory
-    qcomp* cpuCommBuffer;
-    qcomp* gpuCommBuffer;
-};
-
-// GPU availability functions - these use C++ linkage to match QuEST's expectations
-bool gpu_isGpuCompiled() { return false; }
-bool gpu_isGpuAvailable() { return false; }
-bool gpu_isDirectGpuCommPossible() { return false; }
-bool gpu_isCuQuantumCompiled() { return false; }
-bool gpu_areAnyNodesBoundToSameGpu() { return false; }
-bool gpu_doesGpuSupportMemPools() { return false; }
-
-size_t gpu_getCurrentAvailableMemoryInBytes() { return 0; }
-int gpu_getComputeCapability() { return 0; }
-
-void gpu_bindLocalGPUsToNodes() {}
-void gpu_initCuQuantum() {}
-
-// GPU sync and memory functions
-void gpu_sync() {}
-std::complex<double>* gpu_allocArray(long long size) { return nullptr; }
-void gpu_deallocArray(std::complex<double>* ptr) {}
-
-// GPU copy functions - these need C++ linkage for overloading
-void gpu_copyGpuToCpu(Qureg qureg) {}
-void gpu_copyGpuToCpu(Qureg qureg, std::complex<double>* cpuPtr, std::complex<double>* gpuPtr, long long size) {}
-void gpu_copyGpuToCpu(std::complex<double>* gpuPtr, std::complex<double>* cpuPtr, long long size) {}
-void gpu_copyCpuToGpu(Qureg qureg) {}
-void gpu_copyCpuToGpu(Qureg qureg, std::complex<double>* cpuPtr, std::complex<double>* gpuPtr, long long size) {}
-void gpu_copyCpuToGpu(std::complex<double>* cpuPtr, std::complex<double>* gpuPtr, long long size) {}
-
-
-// Most accelerator functions are now provided by accelerator.cpp
-// We only need to stub functions that accelerator.cpp calls but aren't defined
-
-void gpu_statevec_setQuregToSuperposition_sub(std::complex<double> a, Qureg q1,
-    std::complex<double> b, Qureg q2, std::complex<double> c, Qureg q3) {}
-void gpu_densmatr_mixQureg_subA(double a, Qureg q1, double b, Qureg q2) {}
-void gpu_densmatr_mixQureg_subB(double a, Qureg q1, double b, Qureg q2) {}
-void gpu_densmatr_mixQureg_subC(double a, Qureg q1, double b) {}
-// Note: gpu_statevec_calcTotalProb_sub is defined later with correct return type
-void gpu_statevec_initUniformState_sub(Qureg q, std::complex<double> a) {}
-
-
-// Additional structures needed for templates
-struct CompMatr1 {
-    qcomp elems[4];
-};
-struct DiagMatr1 {};
-
-// Template instantiations outside of extern "C"
-// These need to exist but won't be called in CPU mode
-template<int N>
-long long gpu_statevec_packAmpsIntoBuffer(Qureg q, std::vector<int> a, std::vector<int> b) { return 0; }
-
-template<int N>
-void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg q, std::vector<int> a,
-    std::vector<int> b, int c, CompMatr1 d) {}
-
-template<int N>
-void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg q, std::vector<int> a,
-    std::vector<int> b, qcomp c, qcomp d) {}
-
-template<int N>
-void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg q, std::vector<int> a,
-    std::vector<int> b, int c, DiagMatr1 d) {}
-
-template<int N>
-void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg q, std::vector<int> a,
-    std::vector<int> b, std::vector<int> c, std::complex<double> d, std::complex<double> e) {}
-
-template<int N, int M = 0>
-void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg q, std::vector<int> a,
-    std::vector<int> b, std::vector<int> c, std::vector<int> d,
-    std::vector<int> e, std::complex<double> f, std::complex<double> g) {}
-
-template<int N>
-void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg q, std::vector<int> a,
-    std::vector<int> b, std::vector<int> c, std::vector<int> d,
-    std::vector<int> e, std::complex<double> f, std::complex<double> g, long long h) {}
-
-template<int N>
-double gpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg q, std::vector<int> a, std::vector<int> b) { return 0.0; }
-
-template<int N>
-double gpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg q, std::vector<int> a, std::vector<int> b) { return 0.0; }
-
-template<int N>
-void gpu_statevec_multiQubitProjector_sub(Qureg q, std::vector<int> a, std::vector<int> b, double c) {}
-
-template<int N>
-void gpu_densmatr_multiQubitProjector_sub(Qureg q, std::vector<int> a, std::vector<int> b, double c) {}
-
-// Explicit instantiations for all the template values QuEST uses
-// gpu_statevec_packAmpsIntoBuffer
-template long long gpu_statevec_packAmpsIntoBuffer<0>(Qureg, std::vector<int>, std::vector<int>);
-template long long gpu_statevec_packAmpsIntoBuffer<1>(Qureg, std::vector<int>, std::vector<int>);
-template long long gpu_statevec_packAmpsIntoBuffer<2>(Qureg, std::vector<int>, std::vector<int>);
-template long long gpu_statevec_packAmpsIntoBuffer<3>(Qureg, std::vector<int>, std::vector<int>);
-template long long gpu_statevec_packAmpsIntoBuffer<4>(Qureg, std::vector<int>, std::vector<int>);
-template long long gpu_statevec_packAmpsIntoBuffer<5>(Qureg, std::vector<int>, std::vector<int>);
-template long long gpu_statevec_packAmpsIntoBuffer<-1>(Qureg, std::vector<int>, std::vector<int>);
-
-// gpu_statevec_anyCtrlOneTargDenseMatr_subA
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<0>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<1>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<2>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<3>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<4>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<5>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subA<-1>(Qureg, std::vector<int>, std::vector<int>, int, CompMatr1);
-
-// gpu_statevec_anyCtrlOneTargDenseMatr_subB
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<0>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<1>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<2>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<3>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<4>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<5>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-template void gpu_statevec_anyCtrlOneTargDenseMatr_subB<-1>(Qureg, std::vector<int>, std::vector<int>, qcomp, qcomp);
-
-// gpu_statevec_anyCtrlOneTargDiagMatr_sub
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<0>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<1>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<2>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<3>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<4>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<5>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-template void gpu_statevec_anyCtrlOneTargDiagMatr_sub<-1>(Qureg, std::vector<int>, std::vector<int>, int, DiagMatr1);
-
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub<-1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-// Note: Single template parameter versions are removed as they conflict with
-// two-parameter versions where M=0 (which are included below)
-
-// Two template parameter versions - need all combinations
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<0, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<1, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<2, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<3, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<4, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<5, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, 0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, 1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, 2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, 3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, 4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, 5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subA<-1, -1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>);
-
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<0>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<2>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<3>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<4>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<5>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-template void gpu_statevector_anyCtrlPauliTensorOrGadget_subB<-1>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>, std::complex<double>, std::complex<double>, long long);
-
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<0>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<1>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<2>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<3>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<4>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<5>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_statevec_calcProbOfMultiQubitOutcome_sub<-1>(Qureg, std::vector<int>, std::vector<int>);
-
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<0>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<1>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<2>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<3>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<4>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<5>(Qureg, std::vector<int>, std::vector<int>);
-template double gpu_densmatr_calcProbOfMultiQubitOutcome_sub<-1>(Qureg, std::vector<int>, std::vector<int>);
-
-template void gpu_statevec_multiQubitProjector_sub<0>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_statevec_multiQubitProjector_sub<1>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_statevec_multiQubitProjector_sub<2>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_statevec_multiQubitProjector_sub<3>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_statevec_multiQubitProjector_sub<4>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_statevec_multiQubitProjector_sub<5>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_statevec_multiQubitProjector_sub<-1>(Qureg, std::vector<int>, std::vector<int>, double);
-
-template void gpu_densmatr_multiQubitProjector_sub<0>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_densmatr_multiQubitProjector_sub<1>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_densmatr_multiQubitProjector_sub<2>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_densmatr_multiQubitProjector_sub<3>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_densmatr_multiQubitProjector_sub<4>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_densmatr_multiQubitProjector_sub<5>(Qureg, std::vector<int>, std::vector<int>, double);
-template void gpu_densmatr_multiQubitProjector_sub<-1>(Qureg, std::vector<int>, std::vector<int>, double);
-
-// Additional GPU stubs needed for finalizeQuESTEnv
-void gpu_clearCache() {
-    // No-op for CPU-only builds
-}
-
-void gpu_finalizeCuQuantum() {
-    // No-op for CPU-only builds
-}
-
-// Additional GPU info functions
-int gpu_getNumberOfLocalGpus() { return 0; }
-size_t gpu_getTotalMemoryInBytes() { return 0; }
-size_t gpu_getCacheMemoryInBytes() { return 0; }
-
-// Additional matrix structures
-struct CompMatr {
-    int isDensityMatrix;
-    int numQubits;
-    qindex numAmps;
-    qcomp* real;
-    qcomp* imag;
-};
-
-struct DiagMatr {
-    int numQubits;
-    qindex numAmps;
-    qcomp* elems;
-};
-
-struct SuperOp {
-    int numQubits;
-    qindex numAmps;
-    qcomp* real;
-    qcomp* imag;
-};
-
-struct FullStateDiagMatr {
-    int numQubits;
-    qindex numAmps;
-    qcomp* elems;
-};
-
-struct PauliStrSum {
-    int numQubits;
-    int numTerms;
-    double* coeffs;
-    int* pauliCodes;
-};
-
-struct CompMatr2 {
-    qcomp elems[16];  // 4x4 matrix
-};
-
-struct DiagMatr2 {
-    qcomp elems[4];   // 2 qubit diagonal
-};
-
-// GPU copy functions for matrices
-void gpu_copyGpuToCpu(CompMatr m) {}
-void gpu_copyGpuToCpu(SuperOp m) {}
-void gpu_copyCpuToGpu(CompMatr m) {}
-void gpu_copyCpuToGpu(DiagMatr m) {}
-void gpu_copyCpuToGpu(FullStateDiagMatr m) {}
-
-// GPU accelerator stub functions
-std::complex<double> gpu_statevec_getAmp_sub(Qureg q, long long idx) { return 0.0; }
-void gpu_densmatr_setAmpsToPauliStrSum_sub(Qureg q, PauliStrSum p) {}
-void gpu_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr m, PauliStrSum p) {}
-long long gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg q, int a, int b, int c, int d) { return 0; }
-
-// Decoherence functions
-void gpu_densmatr_oneQubitDephasing_subA(Qureg q, int target, double dephase) {}
-void gpu_densmatr_oneQubitDephasing_subB(Qureg q, int target, double dephase) {}
-void gpu_densmatr_twoQubitDephasing_subA(Qureg q, int q1, int q2, double dephase) {}
-void gpu_densmatr_twoQubitDephasing_subB(Qureg q, int q1, int q2, double dephase) {}
-void gpu_densmatr_oneQubitDepolarising_subA(Qureg q, int target, double depolProb) {}
-void gpu_densmatr_oneQubitDepolarising_subB(Qureg q, int target, double depolProb) {}
-void gpu_densmatr_twoQubitDepolarising_subA(Qureg q, int q1, int q2, double depolProb) {}
-void gpu_densmatr_twoQubitDepolarising_subB(Qureg q, int q1, int q2, double depolProb) {}
-void gpu_densmatr_twoQubitDepolarising_subC(Qureg q, int q1, int q2, double depolProb) {}
-void gpu_densmatr_twoQubitDepolarising_subD(Qureg q, int q1, int q2, double depolProb) {}
-void gpu_densmatr_twoQubitDepolarising_subE(Qureg q, int q1, int q2, double depolProb) {}
-void gpu_densmatr_twoQubitDepolarising_subF(Qureg q, int q1, int q2, double depolProb) {}
-void gpu_densmatr_oneQubitPauliChannel_subA(Qureg q, int target, double px, double py, double pz, double pi) {}
-void gpu_densmatr_oneQubitPauliChannel_subB(Qureg q, int target, double px, double py, double pz, double pi) {}
-void gpu_densmatr_oneQubitDamping_subA(Qureg q, int target, double damping) {}
-void gpu_densmatr_oneQubitDamping_subB(Qureg q, int target, double damping) {}
-void gpu_densmatr_oneQubitDamping_subC(Qureg q, int target, double damping) {}
-void gpu_densmatr_oneQubitDamping_subD(Qureg q, int target, double damping) {}
-
-// Calculation functions - note return types
-double gpu_statevec_calcTotalProb_sub(Qureg q) { return 1.0; }
-double gpu_densmatr_calcTotalProb_sub(Qureg q) { return 1.0; }
-std::complex<double> gpu_statevec_calcInnerProduct_sub(Qureg q1, Qureg q2) { return 0.0; }
-double gpu_densmatr_calcHilbertSchmidtDistance_sub(Qureg q1, Qureg q2) { return 0.0; }
-// Note: Function names use calcExpec (not calcExpec) to match QuEST v4.1.0
-double gpu_statevec_calcExpecAnyTargZ_sub(Qureg q, std::vector<int> targets) { return 0.0; }
-std::complex<double> gpu_densmatr_calcExpecAnyTargZ_sub(Qureg q, std::vector<int> targets) { return 0.0; }
-std::complex<double> gpu_statevec_calcExpecPauliStr_subA(Qureg q, std::vector<int> a, std::vector<int> b, std::vector<int> c) { return 0.0; }
-std::complex<double> gpu_statevec_calcExpecPauliStr_subB(Qureg q, std::vector<int> a, std::vector<int> b, std::vector<int> c) { return 0.0; }
-std::complex<double> gpu_densmatr_calcExpecPauliStr_sub(Qureg q, std::vector<int> a, std::vector<int> b, std::vector<int> c) { return 0.0; }
-
-// Weighted sum (v4.2.0+)
-template<int NumQuregs>
-void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, std::vector<std::complex<double>> coeffs, std::vector<Qureg> inQuregs) {}
-
-// Init functions
-void gpu_statevec_initDebugState_sub(Qureg q) {}
-void gpu_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg q) {}
-
-// Template stubs for SWAP operations
-template<int N>
-void gpu_statevec_anyCtrlSwap_subA(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals, int q1, int q2) {}
-
-template<int N>
-void gpu_statevec_anyCtrlSwap_subB(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals) {}
-
-template<int N>
-void gpu_statevec_anyCtrlSwap_subC(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals, int q1, int q2) {}
-
-// Template stubs for two-target dense matrix operations
-template<int N>
-void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals, int t1, int t2, CompMatr2 m) {}
-
-// Template stubs for any-target dense matrix operations
-template<int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
-void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals, std::vector<int> targets, CompMatr m) {}
-
-// Template stubs for two-target diagonal matrix operations
-template<int N>
-void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals, int t1, int t2, DiagMatr2 m) {}
-
-// Template stubs for any-target diagonal matrix operations
-template<int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower>
-void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg q, std::vector<int> ctrls, std::vector<int> ctrlVals, std::vector<int> targets, DiagMatr m, std::complex<double> globalPhase) {}
-
-// Template stubs for all-target diagonal matrix operations
-template<bool HasPower>
-void gpu_statevec_allTargDiagMatr_sub(Qureg q, FullStateDiagMatr m, std::complex<double> globalPhase) {}
-
-template<bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight>
-void gpu_densmatr_allTargDiagMatr_sub(Qureg q, FullStateDiagMatr m, std::complex<double> globalPhase) {}
-
-// Template stubs for partial trace operations
-template<int N>
-void gpu_densmatr_partialTrace_sub(Qureg traceOut, Qureg traceIn, std::vector<int> targets, std::vector<int> controls) {}
-
-// Template stubs for probability calculations
-template<int N>
-void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(double* probs, Qureg q, std::vector<int> qubits) {}
-
-template<int N>
-void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(double* probs, Qureg q, std::vector<int> qubits) {}
-
-// Template stubs for fidelity calculations
-template<bool Conj>
-std::complex<double> gpu_densmatr_calcFidelityWithPureState_sub(Qureg densMatr, Qureg pureState) {
-    return std::complex<double>(0.0, 0.0);
-}
-
-// Template stubs for expectation value calculations
-template<bool HasPower, bool UseRealPow>
-std::complex<double> gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg q, FullStateDiagMatr m, std::complex<double> globalPhase) {
-    return std::complex<double>(0.0, 0.0);
-}
-
-template<bool HasPower, bool UseRealPow>
-std::complex<double> gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg q, FullStateDiagMatr m, std::complex<double> globalPhase) {
-    return std::complex<double>(0.0, 0.0);
-}
-
-// Explicit template instantiations for weighted sum (v4.2.0+)
-template void gpu_statevec_setQuregToWeightedSum_sub<0>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-template void gpu_statevec_setQuregToWeightedSum_sub<1>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-template void gpu_statevec_setQuregToWeightedSum_sub<2>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-template void gpu_statevec_setQuregToWeightedSum_sub<3>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-template void gpu_statevec_setQuregToWeightedSum_sub<4>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-template void gpu_statevec_setQuregToWeightedSum_sub<5>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-template void gpu_statevec_setQuregToWeightedSum_sub<-1>(Qureg, std::vector<std::complex<double>>, std::vector<Qureg>);
-
-// Explicit template instantiations for SWAP operations
-template void gpu_statevec_anyCtrlSwap_subA<0>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subA<1>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subA<2>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subA<3>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subA<4>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subA<5>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subA<-1>(Qureg, std::vector<int>, std::vector<int>, int, int);
-
-template void gpu_statevec_anyCtrlSwap_subB<0>(Qureg, std::vector<int>, std::vector<int>);
-template void gpu_statevec_anyCtrlSwap_subB<1>(Qureg, std::vector<int>, std::vector<int>);
-template void gpu_statevec_anyCtrlSwap_subB<2>(Qureg, std::vector<int>, std::vector<int>);
-template void gpu_statevec_anyCtrlSwap_subB<3>(Qureg, std::vector<int>, std::vector<int>);
-template void gpu_statevec_anyCtrlSwap_subB<4>(Qureg, std::vector<int>, std::vector<int>);
-template void gpu_statevec_anyCtrlSwap_subB<5>(Qureg, std::vector<int>, std::vector<int>);
-template void gpu_statevec_anyCtrlSwap_subB<-1>(Qureg, std::vector<int>, std::vector<int>);
-
-template void gpu_statevec_anyCtrlSwap_subC<0>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subC<1>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subC<2>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subC<3>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subC<4>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subC<5>(Qureg, std::vector<int>, std::vector<int>, int, int);
-template void gpu_statevec_anyCtrlSwap_subC<-1>(Qureg, std::vector<int>, std::vector<int>, int, int);
-
-// Explicit template instantiations for two-target dense matrix operations
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<0>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<1>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<2>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<3>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<4>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<5>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-template void gpu_statevec_anyCtrlTwoTargDenseMatr_sub<-1>(Qureg, std::vector<int>, std::vector<int>, int, int, CompMatr2);
-
-// Explicit template instantiations for two-target diagonal matrix operations
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<0>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<1>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<2>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<3>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<4>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<5>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-template void gpu_statevec_anyCtrlTwoTargDiagMatr_sub<-1>(Qureg, std::vector<int>, std::vector<int>, int, int, DiagMatr2);
-// Explicit template instantiations for any-target dense matrix operations
-// gpu_statevec_anyCtrlAnyTargDenseMatr_sub<NumCtrls, NumTargs, ApplyConj, ApplyTransp>
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<0, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<1, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<2, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<3, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<4, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<5, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-template void gpu_statevec_anyCtrlAnyTargDenseMatr_sub<-1, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, CompMatr);
-
-// Explicit template instantiations for any-target diagonal matrix operations
-// gpu_statevec_anyCtrlAnyTargDiagMatr_sub<N1, N2, N3, N4>
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<0, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<1, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<2, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<3, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<4, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<5, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 0, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 0, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 0, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 0, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 2, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 2, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 2, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 2, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 3, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 3, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 3, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 3, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 4, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 4, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 4, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 4, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 5, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 5, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 5, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, 5, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, -1, false, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, -1, false, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, -1, true, false>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-template void gpu_statevec_anyCtrlAnyTargDiagMatr_sub<-1, -1, true, true>(Qureg, std::vector<int>, std::vector<int>, std::vector<int>, DiagMatr, std::complex<double>);
-
-// Explicit template instantiations for all-target diagonal matrix operations
-template void gpu_statevec_allTargDiagMatr_sub<false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_statevec_allTargDiagMatr_sub<true>(Qureg, FullStateDiagMatr, std::complex<double>);
-
-template void gpu_densmatr_allTargDiagMatr_sub<false, false, false, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, false, false, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, false, true, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, false, true, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, true, false, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, true, false, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, true, true, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<false, true, true, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, false, false, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, false, false, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, false, true, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, false, true, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, true, false, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, true, false, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, true, true, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template void gpu_densmatr_allTargDiagMatr_sub<true, true, true, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-
-// Explicit template instantiations for partial trace operations
-template void gpu_densmatr_partialTrace_sub<0>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-template void gpu_densmatr_partialTrace_sub<1>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-template void gpu_densmatr_partialTrace_sub<2>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-template void gpu_densmatr_partialTrace_sub<3>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-template void gpu_densmatr_partialTrace_sub<4>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-template void gpu_densmatr_partialTrace_sub<5>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-template void gpu_densmatr_partialTrace_sub<-1>(Qureg, Qureg, std::vector<int>, std::vector<int>);
-
-// Explicit template instantiations for probability calculations
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<0>(double*, Qureg, std::vector<int>);
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<1>(double*, Qureg, std::vector<int>);
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<2>(double*, Qureg, std::vector<int>);
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<3>(double*, Qureg, std::vector<int>);
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<4>(double*, Qureg, std::vector<int>);
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<5>(double*, Qureg, std::vector<int>);
-template void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub<-1>(double*, Qureg, std::vector<int>);
-
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<0>(double*, Qureg, std::vector<int>);
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<1>(double*, Qureg, std::vector<int>);
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<2>(double*, Qureg, std::vector<int>);
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<3>(double*, Qureg, std::vector<int>);
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<4>(double*, Qureg, std::vector<int>);
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<5>(double*, Qureg, std::vector<int>);
-template void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<-1>(double*, Qureg, std::vector<int>);
-
-// Explicit template instantiations for fidelity calculations
-template std::complex<double> gpu_densmatr_calcFidelityWithPureState_sub<false>(Qureg, Qureg);
-template std::complex<double> gpu_densmatr_calcFidelityWithPureState_sub<true>(Qureg, Qureg);
-
-// Explicit template instantiations for expectation value calculations
-template std::complex<double> gpu_statevec_calcExpecFullStateDiagMatr_sub<false, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template std::complex<double> gpu_statevec_calcExpecFullStateDiagMatr_sub<false, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template std::complex<double> gpu_statevec_calcExpecFullStateDiagMatr_sub<true, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template std::complex<double> gpu_statevec_calcExpecFullStateDiagMatr_sub<true, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-
-template std::complex<double> gpu_densmatr_calcExpecFullStateDiagMatr_sub<false, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template std::complex<double> gpu_densmatr_calcExpecFullStateDiagMatr_sub<false, true>(Qureg, FullStateDiagMatr, std::complex<double>);
-template std::complex<double> gpu_densmatr_calcExpecFullStateDiagMatr_sub<true, false>(Qureg, FullStateDiagMatr, std::complex<double>);
-template std::complex<double> gpu_densmatr_calcExpecFullStateDiagMatr_sub<true, true>(Qureg, FullStateDiagMatr, std::complex<double>);
diff --git a/crates/pecos-quest/src/lib.rs b/crates/pecos-quest/src/lib.rs
deleted file mode 100644
index c10c2affa..000000000
--- a/crates/pecos-quest/src/lib.rs
+++ /dev/null
@@ -1,1023 +0,0 @@
-//! `QuEST` quantum simulator wrapper for PECOS
-//!
-//! # Thread Safety Warning
-//!
-//! **CRITICAL**: `QuEST` has a fundamental limitation - it uses a single global environment
-//! per process. This means ALL `QuestStateVec` instances share the same underlying `QuEST`
-//! environment, which can lead to race conditions and segmentation faults when used
-//! concurrently from multiple threads.
-//!
-//! For safe usage:
-//! - Run tests with `--test-threads=1`
-//! - Use only one `QuestStateVec` instance per process in production
-//! - See `THREAD_SAFETY_WARNING.md` for detailed information
-
-use core::fmt::Debug;
-use num_complex::Complex64;
-use pecos_random::{PecosRng, Rng, SeedableRng, time_seed};
-use std::f64::consts::FRAC_PI_4;
-use thiserror::Error;
-
-pub mod bridge;
-use bridge::ffi;
-
-pub mod cuda_loader;
-
-pub mod quantum_engine;
-pub use quantum_engine::{
-    QuestCudaStateVecEngine, QuestDensityMatrixEngine, QuestDensityMatrixEngineBuilder,
-    QuestStateVecEngine, QuestStateVectorEngineBuilder, quest_density_matrix, quest_state_vec,
-};
-
-pub use pecos_core::rng::RngManageable;
-use pecos_core::{Angle64, QubitId};
-pub use pecos_simulators::{
-    ArbitraryRotationGateable, CliffordGateable, MeasurementResult, QuantumSimulator,
-};
-
-#[derive(Error, Debug)]
-pub enum QuestError {
-    #[error("QuEST initialization failed: {0}")]
-    InitializationError(String),
-
-    #[error("Invalid qubit index: {0}")]
-    InvalidQubit(usize),
-
-    #[error("Invalid operation: {0}")]
-    InvalidOperation(String),
-
-    #[error("FFI error: {0}")]
-    FfiError(#[from] cxx::Exception),
-}
-
-pub type Result<T> = std::result::Result<T, QuestError>;
-
-/// RAII wrapper for `QuEST` environment pointer
-#[derive(Debug)]
-struct QuestEnvWrapper {
-    ptr: *mut u8,
-}
-
-impl QuestEnvWrapper {
-    fn new() -> Result<Self> {
-        let ptr = ffi::quest_create_env();
-        if ptr.is_null() {
-            return Err(QuestError::InitializationError(
-                "Failed to create QuEST environment".into(),
-            ));
-        }
-        Ok(Self { ptr })
-    }
-}
-
-impl Drop for QuestEnvWrapper {
-    fn drop(&mut self) {
-        if !self.ptr.is_null() {
-            unsafe {
-                ffi::quest_destroy_env(self.ptr);
-            }
-        }
-    }
-}
-
-// SAFETY: QuestEnvWrapper owns its env pointer exclusively.
-// WARNING: QuEST uses a global environment. Concurrent access from multiple threads is unsafe.
-// Sync is required by the Engine trait but callers must ensure single-threaded access.
-unsafe impl Send for QuestEnvWrapper {}
-unsafe impl Sync for QuestEnvWrapper {}
-
-/// RAII wrapper for `QuEST` qureg pointer
-#[derive(Debug)]
-struct QuregWrapper {
-    ptr: *mut u8,
-}
-
-impl QuregWrapper {
-    fn new(env: &QuestEnvWrapper, num_qubits: i32, is_density: bool) -> Result<Self> {
-        let ptr = unsafe {
-            if is_density {
-                ffi::quest_create_density_qureg(env.ptr, num_qubits)
-            } else {
-                ffi::quest_create_qureg(env.ptr, num_qubits)
-            }
-        };
-
-        if ptr.is_null() {
-            return Err(QuestError::InitializationError(
-                "Failed to create QuEST qureg".into(),
-            ));
-        }
-        Ok(Self { ptr })
-    }
-}
-
-impl Drop for QuregWrapper {
-    fn drop(&mut self) {
-        if !self.ptr.is_null() {
-            unsafe {
-                ffi::quest_destroy_qureg(self.ptr);
-            }
-        }
-    }
-}
-
-// SAFETY: QuregWrapper owns its qureg pointer exclusively.
-// WARNING: QuEST qureg operations are not thread-safe. Sync is required by the Engine trait
-// but callers must ensure single-threaded access (e.g., --test-threads=1).
-unsafe impl Send for QuregWrapper {}
-unsafe impl Sync for QuregWrapper {}
-
-/// A quantum state simulator using the `QuEST` state vector representation
-#[derive(Debug)]
-pub struct QuestStateVec<R = PecosRng>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    num_qubits: usize,
-    // The QuEST environment must be kept alive for the lifetime of the simulator.
-    // This field manages the global QuEST environment reference count via RAII.
-    env: QuestEnvWrapper,
-    qureg: QuregWrapper,
-    rng: R,
-}
-
-impl QuestStateVec {
-    /// Creates a new `QuestStateVec` with the specified number of qubits.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `QuEST` environment cannot be created or if the quantum register
-    /// allocation fails.
-    #[must_use]
-    pub fn new(num_qubits: usize) -> Self {
-        Self::with_seed(num_qubits, time_seed())
-    }
-}
-
-impl<R> QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    /// Creates a new `QuestStateVec` with the specified number of qubits and seed.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `QuEST` environment cannot be created or if the quantum register
-    /// allocation fails.
-    #[must_use]
-    pub fn with_seed(num_qubits: usize, seed: u64) -> Self {
-        let env = QuestEnvWrapper::new().expect("Failed to create QuEST environment");
-        let qureg = QuregWrapper::new(
-            &env,
-            i32::try_from(num_qubits).expect("Too many qubits"),
-            false,
-        )
-        .expect("Failed to create QuEST qureg");
-        let rng = R::seed_from_u64(seed);
-
-        let state = Self {
-            num_qubits,
-            env,
-            qureg,
-            rng,
-        };
-
-        unsafe {
-            ffi::quest_init_zero_state(state.qureg.ptr);
-        }
-        state
-    }
-
-    /// Returns the probability of measuring the given computational basis state.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is too large to be converted to `i64`.
-    pub fn probability(&self, index: usize) -> f64 {
-        let quest_index = self.convert_basis_state(index);
-        unsafe {
-            ffi::quest_get_prob_amp(
-                self.qureg.ptr,
-                i64::try_from(quest_index).expect("Index too large"),
-            )
-        }
-    }
-
-    /// Convert PECOS basis state to `QuEST` basis state by reversing bit order
-    #[inline]
-    fn convert_basis_state(&self, pecos_basis: usize) -> usize {
-        let mut quest_basis = 0;
-        for i in 0..self.num_qubits {
-            if (pecos_basis >> i) & 1 == 1 {
-                // Bit i in PECOS maps to bit (n-1-i) in QuEST
-                quest_basis |= 1 << (self.num_qubits - 1 - i);
-            }
-        }
-        quest_basis
-    }
-
-    /// Prepares the quantum state in the specified computational basis state.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is too large to be converted to `i64`.
-    pub fn prepare_computational_basis(&mut self, index: usize) {
-        let quest_index = self.convert_basis_state(index);
-        unsafe {
-            ffi::quest_init_classical_state(
-                self.qureg.ptr,
-                i64::try_from(quest_index).expect("Index too large"),
-            );
-        }
-    }
-
-    pub fn prepare_plus_state(&mut self) {
-        unsafe {
-            ffi::quest_init_plus_state(self.qureg.ptr);
-        }
-    }
-
-    pub fn num_qubits(&self) -> usize {
-        self.num_qubits
-    }
-
-    /// Get information about the quantum register (for debugging/introspection)
-    pub fn get_info(&self) -> ffi::QuregInfo {
-        unsafe { ffi::quest_get_qureg_info(self.qureg.ptr) }
-    }
-
-    /// Get information about the `QuEST` environment (for debugging/introspection)
-    pub fn get_env_info(&self) -> ffi::QuESTEnvInfo {
-        unsafe { ffi::quest_get_env_info(self.env.ptr) }
-    }
-
-    fn check_qubit_index(&self, qubit: usize) -> Result<()> {
-        if qubit >= self.num_qubits {
-            Err(QuestError::InvalidQubit(qubit))
-        } else {
-            Ok(())
-        }
-    }
-
-    /// Converts from PECOS qubit indexing (qubit 0 is MSB) to `QuEST` indexing (qubit 0 is LSB)
-    fn convert_qubit_index(&self, pecos_qubit: usize) -> i32 {
-        i32::try_from(self.num_qubits - 1 - pecos_qubit).expect("Qubit index out of range")
-    }
-}
-
-impl<R> Clone for QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug + Clone,
-{
-    fn clone(&self) -> Self {
-        // Create a new independent instance with same parameters
-        let env = QuestEnvWrapper::new().expect("Failed to create QuEST environment");
-
-        // Clone the quantum state - quest_clone_qureg creates a new qureg with cloned state
-        let cloned_qureg_ptr = unsafe { ffi::quest_clone_qureg(self.qureg.ptr) };
-
-        let qureg = QuregWrapper {
-            ptr: cloned_qureg_ptr,
-        };
-
-        Self {
-            num_qubits: self.num_qubits,
-            env,
-            qureg,
-            rng: self.rng.clone(),
-        }
-    }
-}
-
-impl<R> QuantumSimulator for QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn reset(&mut self) -> &mut Self {
-        unsafe {
-            ffi::quest_init_zero_state(self.qureg.ptr);
-        }
-        self
-    }
-}
-
-impl<R> CliffordGateable for QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_hadamard(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_s_gate(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q0, q1) in pairs {
-            let control = q0.index();
-            let target = q1.index();
-            self.check_qubit_index(control)
-                .expect("Invalid control qubit");
-            self.check_qubit_index(target)
-                .expect("Invalid target qubit");
-            let quest_control = self.convert_qubit_index(control);
-            let quest_target = self.convert_qubit_index(target);
-            unsafe {
-                ffi::quest_apply_cnot(self.qureg.ptr, quest_control, quest_target);
-            }
-        }
-        self
-    }
-
-    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
-        use rand::RngExt;
-
-        let mut results = Vec::with_capacity(qubits.len());
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-
-            // Get probability of measuring |0⟩ (deterministic calculation)
-            let prob_0 = unsafe { ffi::quest_calc_prob_of_outcome(self.qureg.ptr, quest_qubit, 0) };
-
-            // Sample outcome using our seeded Rust RNG
-            let outcome = i32::from(self.rng.random::<f64>() >= prob_0);
-
-            // Collapse state to the sampled outcome
-            let actual_prob = unsafe {
-                ffi::quest_apply_forced_measurement(self.qureg.ptr, quest_qubit, outcome)
-            };
-
-            results.push(MeasurementResult {
-                outcome: outcome != 0,
-                is_deterministic: (actual_prob - 1.0).abs() < f64::EPSILON,
-            });
-        }
-        results
-    }
-
-    // Override with native QuEST implementations for better performance
-
-    fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_pauli_x(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_pauli_y(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_pauli_z(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q0, q1) in pairs {
-            let control = q0.index();
-            let target = q1.index();
-            self.check_qubit_index(control)
-                .expect("Invalid control qubit");
-            self.check_qubit_index(target)
-                .expect("Invalid target qubit");
-            let quest_control = self.convert_qubit_index(control);
-            let quest_target = self.convert_qubit_index(target);
-            unsafe {
-                ffi::quest_apply_cz(self.qureg.ptr, quest_control, quest_target);
-            }
-        }
-        self
-    }
-
-    // SWAP gate - using trait default implementation
-    // The native QuEST swap has GPU dependencies that cause linking issues
-}
-
-impl<R> ArbitraryRotationGateable for QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_rotation_x(self.qureg.ptr, quest_qubit, theta);
-            }
-        }
-        self
-    }
-
-    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_rotation_z(self.qureg.ptr, quest_qubit, theta);
-            }
-        }
-        self
-    }
-
-    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        let half_angle = theta / 2.0;
-        for &(q1, q2) in pairs {
-            self.check_qubit_index(q1.index())
-                .expect("Invalid qubit1 index");
-            self.check_qubit_index(q2.index())
-                .expect("Invalid qubit2 index");
-
-            let half_angle_a = Angle64::from_radians(half_angle);
-            let neg_half_angle_a = Angle64::from_radians(-half_angle);
-            self.rz(half_angle_a, &[q1]).rz(half_angle_a, &[q2]);
-            self.cz(&[(q1, q2)]);
-            self.rz(neg_half_angle_a, &[q1]).rz(neg_half_angle_a, &[q2]);
-        }
-        self
-    }
-
-    // Override with native QuEST implementations
-
-    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_rotation_y(self.qureg.ptr, quest_qubit, theta);
-            }
-        }
-        self
-    }
-
-    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_t_gate(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_phase_shift(self.qureg.ptr, quest_qubit, -FRAC_PI_4);
-            }
-        }
-        self
-    }
-}
-
-impl<R> RngManageable for QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    type Rng = R;
-
-    fn set_rng(&mut self, rng: Self::Rng) {
-        self.rng = rng;
-    }
-
-    fn rng(&self) -> &Self::Rng {
-        &self.rng
-    }
-
-    fn rng_mut(&mut self) -> &mut Self::Rng {
-        &mut self.rng
-    }
-}
-
-// Additional methods for QuestStateVec
-impl<R> QuestStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    /// Returns the complex amplitude of the specified computational basis state.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is too large to be converted to `i64`.
-    pub fn get_amplitude(&self, index: usize) -> Complex64 {
-        let complex_amp = unsafe {
-            ffi::quest_get_complex_amp(
-                self.qureg.ptr,
-                i64::try_from(index).expect("Index too large"),
-            )
-        };
-        Complex64::new(complex_amp.real, complex_amp.imag)
-    }
-}
-
-// SAFETY: QuestStateVec owns all its fields exclusively.
-// WARNING: QuEST's global env is not thread-safe. Sync is required by the Engine trait
-// but callers must ensure single-threaded access.
-unsafe impl<R> Send for QuestStateVec<R> where R: Rng + SeedableRng + Debug + Send {}
-unsafe impl<R> Sync for QuestStateVec<R> where R: Rng + SeedableRng + Debug + Sync {}
-
-/// A quantum density matrix simulator using `QuEST`'s density matrix representation
-#[derive(Debug)]
-pub struct QuestDensityMatrix<R = PecosRng>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    num_qubits: usize,
-    // The QuEST environment must be kept alive for the lifetime of the simulator.
-    // This field manages the global QuEST environment reference count via RAII.
-    env: QuestEnvWrapper,
-    qureg: QuregWrapper,
-    rng: R,
-}
-
-impl QuestDensityMatrix {
-    /// Creates a new `QuestDensityMatrix` with the specified number of qubits.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `QuEST` environment cannot be created or if the quantum register
-    /// allocation fails.
-    #[must_use]
-    pub fn new(num_qubits: usize) -> Self {
-        Self::with_seed(num_qubits, time_seed())
-    }
-}
-
-impl<R> QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    /// Creates a new `QuestDensityMatrix` with the specified number of qubits and seed.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the `QuEST` environment cannot be created or if the quantum register
-    /// allocation fails.
-    #[must_use]
-    pub fn with_seed(num_qubits: usize, seed: u64) -> Self {
-        let env = QuestEnvWrapper::new().expect("Failed to create QuEST environment");
-        let qureg = QuregWrapper::new(
-            &env,
-            i32::try_from(num_qubits).expect("Too many qubits"),
-            true,
-        )
-        .expect("Failed to create QuEST density matrix");
-        let rng = R::seed_from_u64(seed);
-
-        let state = Self {
-            num_qubits,
-            env,
-            qureg,
-            rng,
-        };
-
-        unsafe {
-            ffi::quest_init_zero_state(state.qureg.ptr);
-        }
-        state
-    }
-
-    /// Returns the probability of measuring the given computational basis state.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is too large to be converted to `i64`.
-    pub fn probability(&self, index: usize) -> f64 {
-        let quest_index = self.convert_basis_state(index);
-        unsafe {
-            ffi::quest_get_prob_amp(
-                self.qureg.ptr,
-                i64::try_from(quest_index).expect("Index too large"),
-            )
-        }
-    }
-
-    /// Convert PECOS basis state to `QuEST` basis state by reversing bit order
-    #[inline]
-    fn convert_basis_state(&self, pecos_basis: usize) -> usize {
-        let mut quest_basis = 0;
-        for i in 0..self.num_qubits {
-            if (pecos_basis >> i) & 1 == 1 {
-                // Bit i in PECOS maps to bit (n-1-i) in QuEST
-                quest_basis |= 1 << (self.num_qubits - 1 - i);
-            }
-        }
-        quest_basis
-    }
-
-    pub fn purity(&self) -> f64 {
-        unsafe { ffi::quest_calc_purity(self.qureg.ptr) }
-    }
-
-    /// Prepares the density matrix in the specified computational basis state.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is too large to be converted to `i64`.
-    pub fn prepare_computational_basis(&mut self, index: usize) {
-        let quest_index = self.convert_basis_state(index);
-        unsafe {
-            ffi::quest_init_classical_state(
-                self.qureg.ptr,
-                i64::try_from(quest_index).expect("Index too large"),
-            );
-        }
-    }
-
-    pub fn prepare_plus_state(&mut self) {
-        unsafe {
-            ffi::quest_init_plus_state(self.qureg.ptr);
-        }
-    }
-
-    pub fn num_qubits(&self) -> usize {
-        self.num_qubits
-    }
-
-    /// Get information about the quantum register (for debugging/introspection)
-    pub fn get_info(&self) -> ffi::QuregInfo {
-        unsafe { ffi::quest_get_qureg_info(self.qureg.ptr) }
-    }
-
-    /// Get information about the `QuEST` environment (for debugging/introspection)
-    pub fn get_env_info(&self) -> ffi::QuESTEnvInfo {
-        unsafe { ffi::quest_get_env_info(self.env.ptr) }
-    }
-
-    fn check_qubit_index(&self, qubit: usize) -> Result<()> {
-        if qubit >= self.num_qubits {
-            Err(QuestError::InvalidQubit(qubit))
-        } else {
-            Ok(())
-        }
-    }
-
-    /// Converts from PECOS qubit indexing (qubit 0 is MSB) to `QuEST` indexing (qubit 0 is LSB)
-    fn convert_qubit_index(&self, pecos_qubit: usize) -> i32 {
-        i32::try_from(self.num_qubits - 1 - pecos_qubit).expect("Qubit index out of range")
-    }
-}
-
-impl<R> Clone for QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug + Clone,
-{
-    fn clone(&self) -> Self {
-        // Create a new independent instance with same parameters
-        let env = QuestEnvWrapper::new().expect("Failed to create QuEST environment");
-        let _qureg = QuregWrapper::new(
-            &env,
-            i32::try_from(self.num_qubits).expect("Too many qubits"),
-            true,
-        )
-        .expect("Failed to create density matrix");
-
-        // Clone the quantum state - quest_clone_qureg creates a new qureg with cloned state
-        let cloned_qureg_ptr = unsafe { ffi::quest_clone_qureg(self.qureg.ptr) };
-
-        // Replace the qureg pointer
-        let qureg = QuregWrapper {
-            ptr: cloned_qureg_ptr,
-        };
-
-        Self {
-            num_qubits: self.num_qubits,
-            env,
-            qureg,
-            rng: self.rng.clone(),
-        }
-    }
-}
-
-// Implement traits for QuestDensityMatrix (same as QuestStateVec for compatibility)
-impl<R> QuantumSimulator for QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn reset(&mut self) -> &mut Self {
-        unsafe {
-            ffi::quest_init_zero_state(self.qureg.ptr);
-        }
-        self
-    }
-}
-
-impl<R> CliffordGateable for QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_hadamard(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_s_gate(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q0, q1) in pairs {
-            let control = q0.index();
-            let target = q1.index();
-            self.check_qubit_index(control)
-                .expect("Invalid control qubit");
-            self.check_qubit_index(target)
-                .expect("Invalid target qubit");
-            let quest_control = self.convert_qubit_index(control);
-            let quest_target = self.convert_qubit_index(target);
-            unsafe {
-                ffi::quest_apply_cnot(self.qureg.ptr, quest_control, quest_target);
-            }
-        }
-        self
-    }
-
-    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
-        use rand::RngExt;
-
-        let mut results = Vec::with_capacity(qubits.len());
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-
-            // Get probability of measuring |0⟩ (deterministic calculation)
-            let prob_0 = unsafe { ffi::quest_calc_prob_of_outcome(self.qureg.ptr, quest_qubit, 0) };
-
-            // Sample outcome using our seeded Rust RNG
-            let outcome = i32::from(self.rng.random::<f64>() >= prob_0);
-
-            // Collapse state to the sampled outcome
-            let actual_prob = unsafe {
-                ffi::quest_apply_forced_measurement(self.qureg.ptr, quest_qubit, outcome)
-            };
-
-            results.push(MeasurementResult {
-                outcome: outcome != 0,
-                is_deterministic: (actual_prob - 1.0).abs() < f64::EPSILON,
-            });
-        }
-        results
-    }
-
-    // Override with native QuEST implementations for better performance
-
-    fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_pauli_x(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_pauli_y(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_pauli_z(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q0, q1) in pairs {
-            let control = q0.index();
-            let target = q1.index();
-            self.check_qubit_index(control)
-                .expect("Invalid control qubit");
-            self.check_qubit_index(target)
-                .expect("Invalid target qubit");
-            let quest_control = self.convert_qubit_index(control);
-            let quest_target = self.convert_qubit_index(target);
-            unsafe {
-                ffi::quest_apply_cz(self.qureg.ptr, quest_control, quest_target);
-            }
-        }
-        self
-    }
-
-    // SWAP gate - using trait default implementation
-    // The native QuEST swap has GPU dependencies that cause linking issues
-}
-
-impl<R> ArbitraryRotationGateable for QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_rotation_x(self.qureg.ptr, quest_qubit, theta);
-            }
-        }
-        self
-    }
-
-    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_rotation_z(self.qureg.ptr, quest_qubit, theta);
-            }
-        }
-        self
-    }
-
-    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        let half_angle = theta / 2.0;
-        for &(q1, q2) in pairs {
-            self.check_qubit_index(q1.index())
-                .expect("Invalid qubit1 index");
-            self.check_qubit_index(q2.index())
-                .expect("Invalid qubit2 index");
-
-            let half_angle_a = Angle64::from_radians(half_angle);
-            let neg_half_angle_a = Angle64::from_radians(-half_angle);
-            self.rz(half_angle_a, &[q1]).rz(half_angle_a, &[q2]);
-            self.cz(&[(q1, q2)]);
-            self.rz(neg_half_angle_a, &[q1]).rz(neg_half_angle_a, &[q2]);
-        }
-        self
-    }
-
-    // Override with native QuEST implementations
-
-    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_rotation_y(self.qureg.ptr, quest_qubit, theta);
-            }
-        }
-        self
-    }
-
-    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_t_gate(self.qureg.ptr, quest_qubit);
-            }
-        }
-        self
-    }
-
-    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            self.check_qubit_index(q.index())
-                .expect("Invalid qubit index");
-            let quest_qubit = self.convert_qubit_index(q.index());
-            unsafe {
-                ffi::quest_apply_phase_shift(self.qureg.ptr, quest_qubit, -FRAC_PI_4);
-            }
-        }
-        self
-    }
-}
-
-impl<R> RngManageable for QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    type Rng = R;
-
-    fn set_rng(&mut self, rng: Self::Rng) {
-        self.rng = rng;
-    }
-
-    fn rng(&self) -> &Self::Rng {
-        &self.rng
-    }
-
-    fn rng_mut(&mut self) -> &mut Self::Rng {
-        &mut self.rng
-    }
-}
-
-// Additional methods for QuestDensityMatrix
-impl<R> QuestDensityMatrix<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    /// Returns the complex density matrix element at the specified index.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the index is too large to be converted to `i64`.
-    pub fn get_density_element(&self, index: usize) -> Complex64 {
-        let complex_amp = unsafe {
-            ffi::quest_get_complex_amp(
-                self.qureg.ptr,
-                i64::try_from(index).expect("Index too large"),
-            )
-        };
-        Complex64::new(complex_amp.real, complex_amp.imag)
-    }
-}
-
-// SAFETY: QuestDensityMatrix owns all its fields exclusively.
-// WARNING: QuEST's global env is not thread-safe. Sync is required by the Engine trait
-// but callers must ensure single-threaded access.
-unsafe impl<R> Send for QuestDensityMatrix<R> where R: Rng + SeedableRng + Debug + Send {}
-unsafe impl<R> Sync for QuestDensityMatrix<R> where R: Rng + SeedableRng + Debug + Sync {}
-
-#[cfg(test)]
-mod tests;
diff --git a/crates/pecos-quest/src/quantum_engine.rs b/crates/pecos-quest/src/quantum_engine.rs
deleted file mode 100644
index 325e4e12e..000000000
--- a/crates/pecos-quest/src/quantum_engine.rs
+++ /dev/null
@@ -1,1596 +0,0 @@
-//! Quest quantum engine integration with PECOS engine system
-//!
-//! This module provides wrappers and builders to integrate `QuEST` simulators
-//! with the PECOS engine system, allowing them to be used with the `sim()` API.
-
-use crate::{QuestDensityMatrix, QuestStateVec};
-use pecos_core::Angle64;
-use pecos_core::QubitId;
-use pecos_core::RngManageable;
-use pecos_core::errors::PecosError;
-use pecos_engines::{
-    Engine, IntoQuantumEngineBuilder, QuantumEngine, QuantumEngineBuilder,
-    byte_message::{ByteMessage, GateType},
-};
-use pecos_simulators::{
-    ArbitraryRotationGateable, CliffordGateable, MeasurementResult, QuantumSimulator,
-};
-use std::any::Any;
-use std::fmt::Debug;
-
-/// Convert a flat slice of `QubitId` into pairs for two-qubit gate calls.
-fn to_pairs(qubits: &[QubitId]) -> Vec<(QubitId, QubitId)> {
-    qubits
-        .chunks_exact(2)
-        .map(|pair| (pair[0], pair[1]))
-        .collect()
-}
-
-/// Quest state vector quantum engine wrapper
-#[derive(Debug, Clone)]
-pub struct QuestStateVecEngine {
-    simulator: QuestStateVec,
-}
-
-impl QuestStateVecEngine {
-    /// Create a new Quest state vector engine with the specified number of qubits
-    #[must_use]
-    pub fn new(num_qubits: usize) -> Self {
-        Self {
-            simulator: QuestStateVec::new(num_qubits),
-        }
-    }
-
-    /// Create a new Quest state vector engine with a specific seed
-    #[must_use]
-    pub fn with_seed(num_qubits: usize, seed: u64) -> Self {
-        Self {
-            simulator: QuestStateVec::with_seed(num_qubits, seed),
-        }
-    }
-}
-
-impl Engine for QuestStateVecEngine {
-    type Input = ByteMessage;
-    type Output = ByteMessage;
-
-    #[allow(clippy::too_many_lines)]
-    fn process(&mut self, message: Self::Input) -> Result<Self::Output, PecosError> {
-        // Parse commands from the message
-        let batch = message.quantum_ops()?;
-        let mut measurements = Vec::new();
-
-        for cmd in &batch {
-            match cmd.gate_type {
-                GateType::X => {
-                    self.simulator.x(&cmd.qubits);
-                }
-                GateType::Y => {
-                    self.simulator.y(&cmd.qubits);
-                }
-                GateType::Z => {
-                    self.simulator.z(&cmd.qubits);
-                }
-                GateType::H => {
-                    self.simulator.h(&cmd.qubits);
-                }
-                GateType::SZ => {
-                    self.simulator.sz(&cmd.qubits);
-                }
-                GateType::SZdg => {
-                    self.simulator.szdg(&cmd.qubits);
-                }
-                GateType::T => {
-                    self.simulator.t(&cmd.qubits);
-                }
-                GateType::Tdg => {
-                    self.simulator.tdg(&cmd.qubits);
-                }
-                GateType::CX => {
-                    self.simulator.cx(&to_pairs(&cmd.qubits));
-                }
-                GateType::CY => {
-                    self.simulator.cy(&to_pairs(&cmd.qubits));
-                }
-                GateType::CZ => {
-                    self.simulator.cz(&to_pairs(&cmd.qubits));
-                }
-                // CH = Ry(π/4)_target, CX(control, target), Ry(-π/4)_target
-                GateType::CH => {
-                    for pair in cmd.qubits.chunks_exact(2) {
-                        let target_slice = &[pair[1]];
-                        self.simulator.ry(
-                            Angle64::from_radians(std::f64::consts::FRAC_PI_4),
-                            target_slice,
-                        );
-                        self.simulator.cx(&[(pair[0], pair[1])]);
-                        self.simulator.ry(
-                            Angle64::from_radians(-std::f64::consts::FRAC_PI_4),
-                            target_slice,
-                        );
-                    }
-                }
-                GateType::RZZ => {
-                    self.simulator.rzz(cmd.angles[0], &to_pairs(&cmd.qubits));
-                }
-                GateType::SZZ => {
-                    self.simulator.szz(&to_pairs(&cmd.qubits));
-                }
-                GateType::SZZdg => {
-                    self.simulator.szzdg(&to_pairs(&cmd.qubits));
-                }
-                GateType::F => {
-                    self.simulator.f(&cmd.qubits);
-                }
-                GateType::Fdg => {
-                    self.simulator.fdg(&cmd.qubits);
-                }
-                GateType::SY => {
-                    self.simulator.sy(&cmd.qubits);
-                }
-                GateType::SYdg => {
-                    self.simulator.sydg(&cmd.qubits);
-                }
-                GateType::SXX => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SXX gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.sxx(&to_pairs(&cmd.qubits));
-                }
-                GateType::SXXdg => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SXXdg gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.sxxdg(&to_pairs(&cmd.qubits));
-                }
-                GateType::SYY => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SYY gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.syy(&to_pairs(&cmd.qubits));
-                }
-                GateType::SYYdg => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SYYdg gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.syydg(&to_pairs(&cmd.qubits));
-                }
-                GateType::SWAP => {
-                    self.simulator.swap(&to_pairs(&cmd.qubits));
-                }
-                GateType::CRZ => {
-                    if !cmd.angles.is_empty() {
-                        let angle = cmd.angles[0];
-                        let half_angle = angle / 2u64;
-                        for pair in cmd.qubits.chunks_exact(2) {
-                            // CRZ(θ) = Rz(θ/2) on target, CX, Rz(-θ/2) on target, CX
-                            self.simulator.rz(half_angle, &[pair[1]]);
-                            self.simulator.cx(&[(pair[0], pair[1])]);
-                            self.simulator.rz(-half_angle, &[pair[1]]);
-                            self.simulator.cx(&[(pair[0], pair[1])]);
-                        }
-                    }
-                }
-                GateType::CCX => {
-                    for qubits in cmd.qubits.chunks_exact(3) {
-                        // Toffoli decomposition into Clifford+T gates
-                        let c0 = qubits[0];
-                        let c1 = qubits[1];
-                        let target = qubits[2];
-                        self.simulator.h(&[target]);
-                        self.simulator.cx(&[(c1, target)]);
-                        self.simulator.tdg(&[target]);
-                        self.simulator.cx(&[(c0, target)]);
-                        self.simulator.t(&[target]);
-                        self.simulator.cx(&[(c1, target)]);
-                        self.simulator.tdg(&[target]);
-                        self.simulator.cx(&[(c0, target)]);
-                        self.simulator.t(&[c1]);
-                        self.simulator.t(&[target]);
-                        self.simulator.cx(&[(c0, c1)]);
-                        self.simulator.h(&[target]);
-                        self.simulator.t(&[c0]);
-                        self.simulator.tdg(&[c1]);
-                        self.simulator.cx(&[(c0, c1)]);
-                    }
-                }
-                GateType::SX => {
-                    self.simulator.sx(&cmd.qubits);
-                }
-                GateType::SXdg => {
-                    self.simulator.sxdg(&cmd.qubits);
-                }
-                GateType::RX => {
-                    if !cmd.angles.is_empty() {
-                        self.simulator.rx(cmd.angles[0], &cmd.qubits);
-                    }
-                }
-                GateType::RY => {
-                    if !cmd.angles.is_empty() {
-                        self.simulator.ry(cmd.angles[0], &cmd.qubits);
-                    }
-                }
-                GateType::RZ => {
-                    if !cmd.angles.is_empty() {
-                        self.simulator.rz(cmd.angles[0], &cmd.qubits);
-                    }
-                }
-                GateType::R1XY => {
-                    if cmd.angles.len() >= 2 {
-                        self.simulator
-                            .r1xy(cmd.angles[0], cmd.angles[1], &cmd.qubits);
-                    }
-                }
-                GateType::MZ | GateType::MeasureLeaked | GateType::MeasureFree => {
-                    let meas_results = self.simulator.mz(&cmd.qubits);
-                    for meas_result in meas_results {
-                        let outcome = u32::from(meas_result.outcome);
-                        measurements.push(outcome);
-                    }
-                }
-                GateType::PZ | GateType::QAlloc => {
-                    self.simulator.pz(&cmd.qubits);
-                }
-                GateType::I
-                | GateType::Idle
-                | GateType::MeasCrosstalkLocalPayload
-                | GateType::MeasCrosstalkGlobalPayload
-                | GateType::QFree
-                | GateType::Custom => {
-                    // No operation needed (QFree is just a marker for qubit lifecycle)
-                }
-                GateType::U => {
-                    if cmd.angles.len() >= 3 {
-                        self.simulator
-                            .u(cmd.angles[0], cmd.angles[1], cmd.angles[2], &cmd.qubits);
-                    }
-                }
-                GateType::RXX => {
-                    if cmd.angles.is_empty() {
-                        return Err(PecosError::Processing(
-                            "RXX gate requires at least one angle".to_string(),
-                        ));
-                    }
-                    self.simulator.rxx(cmd.angles[0], &to_pairs(&cmd.qubits));
-                }
-                GateType::RYY => {
-                    if cmd.angles.is_empty() {
-                        return Err(PecosError::Processing(
-                            "RYY gate requires at least one angle".to_string(),
-                        ));
-                    }
-                    self.simulator.ryy(cmd.angles[0], &to_pairs(&cmd.qubits));
-                }
-                GateType::RXXRYYRZZ | GateType::U2q => {
-                    if cmd.angles.len() < 3 {
-                        return Err(PecosError::Processing(
-                            "RXXRYYRZZ gate requires three angles".to_string(),
-                        ));
-                    }
-                    self.simulator.rxx(cmd.angles[0], &to_pairs(&cmd.qubits));
-                    self.simulator.ryy(cmd.angles[1], &to_pairs(&cmd.qubits));
-                    self.simulator.rzz(cmd.angles[2], &to_pairs(&cmd.qubits));
-                }
-            }
-        }
-
-        // Create a message with the measurement results
-        let mut builder = ByteMessage::outcomes_builder();
-        let outcomes: Vec<usize> = measurements.iter().map(|&m| m as usize).collect();
-        builder.add_outcomes(&outcomes);
-
-        Ok(builder.build())
-    }
-
-    fn reset(&mut self) -> Result<(), PecosError> {
-        self.simulator.reset();
-        Ok(())
-    }
-}
-
-impl QuantumEngine for QuestStateVecEngine {
-    fn set_seed(&mut self, seed: u64) {
-        let rng = <QuestStateVec as RngManageable>::Rng::seed_from_u64(seed);
-        self.simulator.set_rng(rng);
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_any_mut(&mut self) -> &mut dyn Any {
-        self
-    }
-}
-
-/// Quest density matrix quantum engine wrapper
-#[derive(Debug, Clone)]
-pub struct QuestDensityMatrixEngine {
-    simulator: QuestDensityMatrix,
-}
-
-impl QuestDensityMatrixEngine {
-    /// Create a new Quest density matrix engine with the specified number of qubits
-    #[must_use]
-    pub fn new(num_qubits: usize) -> Self {
-        Self {
-            simulator: QuestDensityMatrix::new(num_qubits),
-        }
-    }
-
-    /// Create a new Quest density matrix engine with a specific seed
-    #[must_use]
-    pub fn with_seed(num_qubits: usize, seed: u64) -> Self {
-        Self {
-            simulator: QuestDensityMatrix::with_seed(num_qubits, seed),
-        }
-    }
-}
-
-impl Engine for QuestDensityMatrixEngine {
-    type Input = ByteMessage;
-    type Output = ByteMessage;
-
-    #[allow(clippy::too_many_lines)]
-    fn process(&mut self, message: Self::Input) -> Result<Self::Output, PecosError> {
-        // Parse commands from the message
-        let batch = message.quantum_ops()?;
-        let mut measurements = Vec::new();
-
-        for cmd in &batch {
-            match cmd.gate_type {
-                GateType::X => {
-                    self.simulator.x(&cmd.qubits);
-                }
-                GateType::Y => {
-                    self.simulator.y(&cmd.qubits);
-                }
-                GateType::Z => {
-                    self.simulator.z(&cmd.qubits);
-                }
-                GateType::H => {
-                    self.simulator.h(&cmd.qubits);
-                }
-                GateType::SZ => {
-                    self.simulator.sz(&cmd.qubits);
-                }
-                GateType::SZdg => {
-                    self.simulator.szdg(&cmd.qubits);
-                }
-                GateType::T => {
-                    self.simulator.t(&cmd.qubits);
-                }
-                GateType::Tdg => {
-                    self.simulator.tdg(&cmd.qubits);
-                }
-                GateType::CX => {
-                    self.simulator.cx(&to_pairs(&cmd.qubits));
-                }
-                GateType::CY => {
-                    self.simulator.cy(&to_pairs(&cmd.qubits));
-                }
-                GateType::CZ => {
-                    self.simulator.cz(&to_pairs(&cmd.qubits));
-                }
-                // CH = Ry(π/4)_target, CX(control, target), Ry(-π/4)_target
-                GateType::CH => {
-                    for pair in cmd.qubits.chunks_exact(2) {
-                        let target_slice = &[pair[1]];
-                        self.simulator.ry(
-                            Angle64::from_radians(std::f64::consts::FRAC_PI_4),
-                            target_slice,
-                        );
-                        self.simulator.cx(&[(pair[0], pair[1])]);
-                        self.simulator.ry(
-                            Angle64::from_radians(-std::f64::consts::FRAC_PI_4),
-                            target_slice,
-                        );
-                    }
-                }
-                GateType::RZZ => {
-                    self.simulator.rzz(cmd.angles[0], &to_pairs(&cmd.qubits));
-                }
-                GateType::SZZ => {
-                    self.simulator.szz(&to_pairs(&cmd.qubits));
-                }
-                GateType::SZZdg => {
-                    self.simulator.szzdg(&to_pairs(&cmd.qubits));
-                }
-                GateType::F => {
-                    self.simulator.f(&cmd.qubits);
-                }
-                GateType::Fdg => {
-                    self.simulator.fdg(&cmd.qubits);
-                }
-                GateType::SY => {
-                    self.simulator.sy(&cmd.qubits);
-                }
-                GateType::SYdg => {
-                    self.simulator.sydg(&cmd.qubits);
-                }
-                GateType::SXX => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SXX gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.sxx(&to_pairs(&cmd.qubits));
-                }
-                GateType::SXXdg => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SXXdg gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.sxxdg(&to_pairs(&cmd.qubits));
-                }
-                GateType::SYY => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SYY gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.syy(&to_pairs(&cmd.qubits));
-                }
-                GateType::SYYdg => {
-                    if cmd.qubits.len() % 2 != 0 {
-                        return Err(PecosError::Processing(format!(
-                            "SYYdg gate requires even number of qubits, got {}",
-                            cmd.qubits.len()
-                        )));
-                    }
-                    self.simulator.syydg(&to_pairs(&cmd.qubits));
-                }
-                GateType::SWAP => {
-                    self.simulator.swap(&to_pairs(&cmd.qubits));
-                }
-                GateType::CRZ => {
-                    if !cmd.angles.is_empty() {
-                        let angle = cmd.angles[0];
-                        let half_angle = angle / 2u64;
-                        for pair in cmd.qubits.chunks_exact(2) {
-                            // CRZ(θ) = Rz(θ/2) on target, CX, Rz(-θ/2) on target, CX
-                            self.simulator.rz(half_angle, &[pair[1]]);
-                            self.simulator.cx(&[(pair[0], pair[1])]);
-                            self.simulator.rz(-half_angle, &[pair[1]]);
-                            self.simulator.cx(&[(pair[0], pair[1])]);
-                        }
-                    }
-                }
-                GateType::CCX => {
-                    for qubits in cmd.qubits.chunks_exact(3) {
-                        // Toffoli decomposition into Clifford+T gates
-                        let c0 = qubits[0];
-                        let c1 = qubits[1];
-                        let target = qubits[2];
-                        self.simulator.h(&[target]);
-                        self.simulator.cx(&[(c1, target)]);
-                        self.simulator.tdg(&[target]);
-                        self.simulator.cx(&[(c0, target)]);
-                        self.simulator.t(&[target]);
-                        self.simulator.cx(&[(c1, target)]);
-                        self.simulator.tdg(&[target]);
-                        self.simulator.cx(&[(c0, target)]);
-                        self.simulator.t(&[c1]);
-                        self.simulator.t(&[target]);
-                        self.simulator.cx(&[(c0, c1)]);
-                        self.simulator.h(&[target]);
-                        self.simulator.t(&[c0]);
-                        self.simulator.tdg(&[c1]);
-                        self.simulator.cx(&[(c0, c1)]);
-                    }
-                }
-                GateType::SX => {
-                    self.simulator.sx(&cmd.qubits);
-                }
-                GateType::SXdg => {
-                    self.simulator.sxdg(&cmd.qubits);
-                }
-                GateType::RX => {
-                    if !cmd.angles.is_empty() {
-                        self.simulator.rx(cmd.angles[0], &cmd.qubits);
-                    }
-                }
-                GateType::RY => {
-                    if !cmd.angles.is_empty() {
-                        self.simulator.ry(cmd.angles[0], &cmd.qubits);
-                    }
-                }
-                GateType::RZ => {
-                    if !cmd.angles.is_empty() {
-                        self.simulator.rz(cmd.angles[0], &cmd.qubits);
-                    }
-                }
-                GateType::R1XY => {
-                    if cmd.angles.len() >= 2 {
-                        self.simulator
-                            .r1xy(cmd.angles[0], cmd.angles[1], &cmd.qubits);
-                    }
-                }
-                GateType::MZ | GateType::MeasureLeaked | GateType::MeasureFree => {
-                    let meas_results = self.simulator.mz(&cmd.qubits);
-                    for meas_result in meas_results {
-                        let outcome = u32::from(meas_result.outcome);
-                        measurements.push(outcome);
-                    }
-                }
-                GateType::PZ | GateType::QAlloc => {
-                    self.simulator.pz(&cmd.qubits);
-                }
-                GateType::I
-                | GateType::Idle
-                | GateType::MeasCrosstalkLocalPayload
-                | GateType::MeasCrosstalkGlobalPayload
-                | GateType::QFree
-                | GateType::Custom => {
-                    // No operation needed (QFree is just a marker for qubit lifecycle)
-                }
-                GateType::U => {
-                    if cmd.angles.len() >= 3 {
-                        self.simulator
-                            .u(cmd.angles[0], cmd.angles[1], cmd.angles[2], &cmd.qubits);
-                    }
-                }
-                GateType::RXX => {
-                    if cmd.angles.is_empty() {
-                        return Err(PecosError::Processing(
-                            "RXX gate requires at least one angle".to_string(),
-                        ));
-                    }
-                    self.simulator.rxx(cmd.angles[0], &to_pairs(&cmd.qubits));
-                }
-                GateType::RYY => {
-                    if cmd.angles.is_empty() {
-                        return Err(PecosError::Processing(
-                            "RYY gate requires at least one angle".to_string(),
-                        ));
-                    }
-                    self.simulator.ryy(cmd.angles[0], &to_pairs(&cmd.qubits));
-                }
-                GateType::RXXRYYRZZ | GateType::U2q => {
-                    if cmd.angles.len() < 3 {
-                        return Err(PecosError::Processing(
-                            "RXXRYYRZZ gate requires three angles".to_string(),
-                        ));
-                    }
-                    self.simulator.rxx(cmd.angles[0], &to_pairs(&cmd.qubits));
-                    self.simulator.ryy(cmd.angles[1], &to_pairs(&cmd.qubits));
-                    self.simulator.rzz(cmd.angles[2], &to_pairs(&cmd.qubits));
-                }
-            }
-        }
-
-        // Create a message with the measurement results
-        let mut builder = ByteMessage::outcomes_builder();
-        let outcomes: Vec<usize> = measurements.iter().map(|&m| m as usize).collect();
-        builder.add_outcomes(&outcomes);
-
-        Ok(builder.build())
-    }
-
-    fn reset(&mut self) -> Result<(), PecosError> {
-        self.simulator.reset();
-        Ok(())
-    }
-}
-
-impl QuantumEngine for QuestDensityMatrixEngine {
-    fn set_seed(&mut self, seed: u64) {
-        let rng = <QuestDensityMatrix as RngManageable>::Rng::seed_from_u64(seed);
-        self.simulator.set_rng(rng);
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_any_mut(&mut self) -> &mut dyn Any {
-        self
-    }
-}
-
-/// Builder for Quest state vector quantum engine
-#[derive(Debug, Clone, Default)]
-pub struct QuestStateVectorEngineBuilder {
-    /// Number of qubits (if explicitly set)
-    num_qubits: Option<usize>,
-    /// CUDA acceleration mode flag
-    use_cuda: bool,
-}
-
-impl QuestStateVectorEngineBuilder {
-    /// Create a new Quest state vector engine builder
-    #[must_use]
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set the number of qubits
-    #[must_use]
-    pub fn qubits(mut self, num_qubits: usize) -> Self {
-        self.num_qubits = Some(num_qubits);
-        self
-    }
-
-    /// Use CPU-only mode (default)
-    #[must_use]
-    pub fn with_cpu(mut self) -> Self {
-        self.use_cuda = false;
-        self
-    }
-
-    /// Use GPU acceleration mode
-    ///
-    /// This enables GPU acceleration using the best available backend.
-    /// Currently supports NVIDIA CUDA via the `QuEST` CUDA backend.
-    /// The backend is loaded at runtime, so systems without GPU support
-    /// can still use the CPU mode.
-    #[must_use]
-    pub fn with_gpu(self) -> Self {
-        Self {
-            use_cuda: true,
-            ..self
-        }
-    }
-}
-
-impl QuantumEngineBuilder for QuestStateVectorEngineBuilder {
-    fn build(&mut self) -> Result<Box<dyn QuantumEngine>, PecosError> {
-        let num_qubits = self.num_qubits.ok_or_else(|| {
-            PecosError::Input("Number of qubits not specified for Quest engine".to_string())
-        })?;
-
-        // Check if CUDA was requested
-        if self.use_cuda {
-            // Create and return CUDA-backed engine (runtime detection)
-            let engine = QuestCudaStateVecEngine::new(num_qubits)?;
-            return Ok(Box::new(engine));
-        }
-
-        // CPU mode - use the standard implementation
-        Ok(Box::new(QuestStateVecEngine::new(num_qubits)))
-    }
-
-    fn set_qubits_if_needed(&mut self, num_qubits: usize) {
-        if self.num_qubits.is_none() {
-            self.num_qubits = Some(num_qubits);
-        }
-    }
-}
-
-impl IntoQuantumEngineBuilder for QuestStateVectorEngineBuilder {
-    type Builder = Self;
-
-    fn into_quantum_engine_builder(self) -> Self::Builder {
-        self
-    }
-}
-
-/// Builder for Quest density matrix quantum engine
-#[derive(Debug, Clone, Default)]
-pub struct QuestDensityMatrixEngineBuilder {
-    /// Number of qubits (if explicitly set)
-    num_qubits: Option<usize>,
-    /// CUDA acceleration mode flag
-    use_cuda: bool,
-}
-
-impl QuestDensityMatrixEngineBuilder {
-    /// Create a new Quest density matrix engine builder
-    #[must_use]
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set the number of qubits
-    #[must_use]
-    pub fn qubits(mut self, num_qubits: usize) -> Self {
-        self.num_qubits = Some(num_qubits);
-        self
-    }
-
-    /// Use CPU-only mode (default)
-    #[must_use]
-    pub fn with_cpu(mut self) -> Self {
-        self.use_cuda = false;
-        self
-    }
-
-    /// Use GPU acceleration mode
-    ///
-    /// This enables GPU acceleration using the best available backend.
-    /// Currently supports NVIDIA CUDA via the `QuEST` CUDA backend.
-    /// The backend is loaded at runtime, so systems without GPU support
-    /// can still use the CPU mode.
-    #[must_use]
-    pub fn with_gpu(self) -> Self {
-        Self {
-            use_cuda: true,
-            ..self
-        }
-    }
-}
-
-impl QuantumEngineBuilder for QuestDensityMatrixEngineBuilder {
-    fn build(&mut self) -> Result<Box<dyn QuantumEngine>, PecosError> {
-        let num_qubits = self.num_qubits.ok_or_else(|| {
-            PecosError::Input("Number of qubits not specified for Quest engine".to_string())
-        })?;
-
-        // Check if CUDA was requested
-        if self.use_cuda {
-            // CUDA density matrix engine not yet implemented
-            return Err(PecosError::Processing(
-                "CUDA acceleration for density matrix simulation is not yet implemented. \
-                 Use QuestStateVectorEngineBuilder for GPU-accelerated state vector simulation, \
-                 or use CPU mode for density matrix simulation."
-                    .to_string(),
-            ));
-        }
-
-        // CPU mode - use the standard implementation
-        Ok(Box::new(QuestDensityMatrixEngine::new(num_qubits)))
-    }
-
-    fn set_qubits_if_needed(&mut self, num_qubits: usize) {
-        if self.num_qubits.is_none() {
-            self.num_qubits = Some(num_qubits);
-        }
-    }
-}
-
-impl IntoQuantumEngineBuilder for QuestDensityMatrixEngineBuilder {
-    type Builder = Self;
-
-    fn into_quantum_engine_builder(self) -> Self::Builder {
-        self
-    }
-}
-
-/// Create a Quest state vector quantum engine builder
-#[must_use]
-pub fn quest_state_vec() -> QuestStateVectorEngineBuilder {
-    QuestStateVectorEngineBuilder::new()
-}
-
-/// Create a Quest density matrix quantum engine builder
-#[must_use]
-pub fn quest_density_matrix() -> QuestDensityMatrixEngineBuilder {
-    QuestDensityMatrixEngineBuilder::new()
-}
-
-// ============================================================================
-// CUDA-backed quantum engine
-// ============================================================================
-
-/// CUDA-backed `QuEST` state vector quantum engine
-///
-/// This engine uses the dynamically-loaded `QuEST` CUDA backend for GPU-accelerated
-/// quantum simulation. The CUDA backend is loaded at runtime via dlopen, allowing
-/// the same binary to work on systems with and without CUDA installed.
-///
-/// The engine uses a shared CUDA environment that persists for the lifetime of the
-/// process, avoiding `QuEST` CUDA recreation issues. Only the quantum register (qureg)
-/// is created/destroyed per engine instance.
-pub struct QuestCudaStateVecEngine {
-    /// Opaque handle to the quantum register (owned by this instance)
-    qureg_handle: *mut u8,
-    /// Reference to the CUDA backend (static lifetime, lazily loaded)
-    backend: &'static crate::cuda_loader::CudaBackend,
-    /// Number of qubits
-    num_qubits: usize,
-}
-
-impl QuestCudaStateVecEngine {
-    /// Create a new CUDA-backed state vector engine
-    ///
-    /// # Errors
-    /// Returns `PecosError::Processing` if:
-    /// - The CUDA backend library cannot be loaded
-    /// - The shared CUDA environment cannot be created
-    /// - The quantum register cannot be allocated
-    ///
-    /// # Panics
-    /// Panics if `num_qubits` exceeds `i32::MAX` (extremely unlikely in practice).
-    pub fn new(num_qubits: usize) -> Result<Self, PecosError> {
-        // Get the shared CUDA environment (created once, reused across all engines)
-        let (env_handle, backend) = crate::cuda_loader::get_shared_cuda_env().map_err(|e| {
-            PecosError::Processing(format!(
-                "Failed to get shared CUDA environment: {e}\n\n{}",
-                crate::cuda_loader::cuda_unavailable_error_message()
-            ))
-        })?;
-
-        // Create quantum register using the shared environment
-        let qureg_handle = unsafe {
-            (backend.create_qureg)(
-                env_handle,
-                i32::try_from(num_qubits).expect("num_qubits fits in i32"),
-            )
-        };
-        if qureg_handle.is_null() {
-            return Err(PecosError::Processing(format!(
-                "Failed to create CUDA quantum register with {num_qubits} qubits"
-            )));
-        }
-
-        // Initialize to zero state
-        unsafe {
-            (backend.init_zero_state)(qureg_handle);
-        }
-
-        log::info!("Created CUDA-backed QuEST state vector engine with {num_qubits} qubits");
-
-        Ok(Self {
-            qureg_handle,
-            backend,
-            num_qubits,
-        })
-    }
-}
-
-impl Drop for QuestCudaStateVecEngine {
-    fn drop(&mut self) {
-        // Destroy the qureg to free GPU memory.
-        // NOTE: QuEST's CUDA backend only supports one qureg at a time,
-        // so this must be called before creating a new engine.
-        unsafe {
-            if !self.qureg_handle.is_null() {
-                (self.backend.destroy_qureg)(self.qureg_handle);
-            }
-        }
-    }
-}
-
-impl Debug for QuestCudaStateVecEngine {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("QuestCudaStateVecEngine")
-            .field("num_qubits", &self.num_qubits)
-            .finish_non_exhaustive()
-    }
-}
-
-// Safety: The CUDA backend handles are thread-safe through QuEST's internal synchronization
-unsafe impl Send for QuestCudaStateVecEngine {}
-unsafe impl Sync for QuestCudaStateVecEngine {}
-
-impl Clone for QuestCudaStateVecEngine {
-    /// Clone creates a new CUDA engine with the same qubit count but reset to zero state.
-    ///
-    /// Does NOT preserve the quantum state of the original.
-    ///
-    /// # Panics
-    ///
-    /// Panics if a new CUDA quantum register cannot be allocated (e.g. GPU out of memory).
-    /// The CUDA backend itself is guaranteed to be loaded because `self` exists.
-    fn clone(&self) -> Self {
-        Self::new(self.num_qubits)
-            .expect("CUDA engine clone failed -- GPU register allocation failed")
-    }
-}
-
-impl Engine for QuestCudaStateVecEngine {
-    type Input = ByteMessage;
-    type Output = ByteMessage;
-
-    // Allow cast warnings: qubit indices are always small (quantum computers don't have billions of qubits)
-    #[allow(
-        clippy::too_many_lines,
-        clippy::cast_possible_truncation,
-        clippy::cast_possible_wrap
-    )]
-    fn process(&mut self, message: Self::Input) -> Result<Self::Output, PecosError> {
-        let batch = message.quantum_ops()?;
-        let mut measurements = Vec::new();
-
-        for cmd in &batch {
-            match cmd.gate_type {
-                GateType::X => {
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_pauli_x)(self.qureg_handle, qubit);
-                        }
-                    }
-                }
-                GateType::Y => {
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_pauli_y)(self.qureg_handle, qubit);
-                        }
-                    }
-                }
-                GateType::Z => {
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_pauli_z)(self.qureg_handle, qubit);
-                        }
-                    }
-                }
-                GateType::H => {
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_hadamard)(self.qureg_handle, qubit);
-                        }
-                    }
-                }
-                GateType::SZ => {
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_s_gate)(self.qureg_handle, qubit);
-                        }
-                    }
-                }
-                GateType::SZdg => {
-                    // S-dagger = S^3 = phase(-pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                qubit,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::T => {
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_t_gate)(self.qureg_handle, qubit);
-                        }
-                    }
-                }
-                GateType::Tdg => {
-                    // T-dagger = T^7 = phase(-pi/4)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                qubit,
-                                -std::f64::consts::FRAC_PI_4,
-                            );
-                        }
-                    }
-                }
-                GateType::CX => {
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (ctrl, tgt) =
-                            (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, ctrl, tgt);
-                        }
-                    }
-                }
-                GateType::CY => {
-                    // CY = (I ⊗ S†) · CX · (I ⊗ S) = Controlled-Y
-                    // Decompose as: S†(target) · CX · S(target)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (ctrl, tgt) =
-                            (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            // S†(tgt) = phase(-pi/2)
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                tgt,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, ctrl, tgt);
-                            // S(tgt) = phase(pi/2)
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                tgt,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::CZ => {
-                    // CZ = H(target) · CX · H(target)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (ctrl, tgt) =
-                            (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_hadamard)(self.qureg_handle, tgt);
-                            (self.backend.apply_cnot)(self.qureg_handle, ctrl, tgt);
-                            (self.backend.apply_hadamard)(self.qureg_handle, tgt);
-                        }
-                    }
-                }
-                // CH = Ry(π/4)_target · CX · Ry(-π/4)_target
-                GateType::CH => {
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (ctrl, tgt) =
-                            (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_rotation_y)(
-                                self.qureg_handle,
-                                tgt,
-                                std::f64::consts::FRAC_PI_4,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, ctrl, tgt);
-                            (self.backend.apply_rotation_y)(
-                                self.qureg_handle,
-                                tgt,
-                                -std::f64::consts::FRAC_PI_4,
-                            );
-                        }
-                    }
-                }
-                GateType::RX => {
-                    if !cmd.angles.is_empty() {
-                        let theta = cmd.angles[0].to_radians();
-                        for q in &cmd.qubits {
-                            let qubit = **q as i32;
-                            unsafe {
-                                (self.backend.apply_rotation_x)(self.qureg_handle, qubit, theta);
-                            }
-                        }
-                    }
-                }
-                GateType::RY => {
-                    if !cmd.angles.is_empty() {
-                        let theta = cmd.angles[0].to_radians();
-                        for q in &cmd.qubits {
-                            let qubit = **q as i32;
-                            unsafe {
-                                (self.backend.apply_rotation_y)(self.qureg_handle, qubit, theta);
-                            }
-                        }
-                    }
-                }
-                GateType::RZ => {
-                    if !cmd.angles.is_empty() {
-                        let theta = cmd.angles[0].to_radians();
-                        for q in &cmd.qubits {
-                            let qubit = **q as i32;
-                            unsafe {
-                                (self.backend.apply_rotation_z)(self.qureg_handle, qubit, theta);
-                            }
-                        }
-                    }
-                }
-                GateType::RZZ => {
-                    // RZZ(theta) = exp(-i * theta/2 * Z_a Z_b)
-                    // Decompose as: CNOT(a,b) - RZ(theta, b) - CNOT(a,b)
-                    let theta = cmd.angles[0].to_radians();
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (*qubits[0] as i32, *qubits[1] as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_z)(self.qureg_handle, b, theta);
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::SZZ => {
-                    // SZZ = RZZ(pi/2) = exp(-i * pi/4 * Z_a Z_b)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_z)(
-                                self.qureg_handle,
-                                b,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::SZZdg => {
-                    // SZZdg = RZZ(-pi/2) = exp(i * pi/4 * Z_a Z_b)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_z)(
-                                self.qureg_handle,
-                                b,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::R1XY => {
-                    // R1XY(theta, phi) gate
-                    // Decompose as: RZ(-phi) - RX(theta) - RZ(phi)
-                    if cmd.angles.len() >= 2 {
-                        let theta = cmd.angles[0].to_radians();
-                        let phi = cmd.angles[1].to_radians();
-                        for q in &cmd.qubits {
-                            let qubit = **q as i32;
-                            unsafe {
-                                (self.backend.apply_rotation_z)(self.qureg_handle, qubit, -phi);
-                                (self.backend.apply_rotation_x)(self.qureg_handle, qubit, theta);
-                                (self.backend.apply_rotation_z)(self.qureg_handle, qubit, phi);
-                            }
-                        }
-                    }
-                }
-                GateType::U => {
-                    // U(theta, phi, lambda) = RZ(phi) - RY(theta) - RZ(lambda)
-                    if cmd.angles.len() >= 3 {
-                        let theta = cmd.angles[0].to_radians();
-                        let phi = cmd.angles[1].to_radians();
-                        let lambda = cmd.angles[2].to_radians();
-                        for q in &cmd.qubits {
-                            let qubit = **q as i32;
-                            unsafe {
-                                (self.backend.apply_rotation_z)(self.qureg_handle, qubit, lambda);
-                                (self.backend.apply_rotation_y)(self.qureg_handle, qubit, theta);
-                                (self.backend.apply_rotation_z)(self.qureg_handle, qubit, phi);
-                            }
-                        }
-                    }
-                }
-                GateType::MZ | GateType::MeasureLeaked | GateType::MeasureFree => {
-                    for q in &cmd.qubits {
-                        let qubit = **q as i32;
-                        let outcome = unsafe { (self.backend.measure)(self.qureg_handle, qubit) };
-                        measurements
-                            .push(u32::try_from(outcome).expect("measurement outcome fits in u32"));
-                    }
-                }
-                GateType::PZ | GateType::QAlloc => {
-                    // Prepare in |0> state: measure and flip if result is 1
-                    for q in &cmd.qubits {
-                        let qubit = **q as i32;
-                        let outcome = unsafe { (self.backend.measure)(self.qureg_handle, qubit) };
-                        if outcome == 1 {
-                            unsafe {
-                                (self.backend.apply_pauli_x)(self.qureg_handle, qubit);
-                            }
-                        }
-                    }
-                }
-                GateType::SWAP => {
-                    // SWAP = CX(0,1) CX(1,0) CX(0,1)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (q0, q1) =
-                            (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, q0, q1);
-                            (self.backend.apply_cnot)(self.qureg_handle, q1, q0);
-                            (self.backend.apply_cnot)(self.qureg_handle, q0, q1);
-                        }
-                    }
-                }
-                GateType::CRZ => {
-                    // CRZ(θ) = Rz(θ/2) on target, CX, Rz(-θ/2) on target, CX
-                    if !cmd.angles.is_empty() {
-                        let angle = cmd.angles[0].to_radians();
-                        let half_angle = angle / 2.0;
-                        for qubits in cmd.qubits.chunks_exact(2) {
-                            let (control, target) =
-                                (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                            unsafe {
-                                (self.backend.apply_rotation_z)(
-                                    self.qureg_handle,
-                                    target,
-                                    half_angle,
-                                );
-                                (self.backend.apply_cnot)(self.qureg_handle, control, target);
-                                (self.backend.apply_rotation_z)(
-                                    self.qureg_handle,
-                                    target,
-                                    -half_angle,
-                                );
-                                (self.backend.apply_cnot)(self.qureg_handle, control, target);
-                            }
-                        }
-                    }
-                }
-                GateType::CCX => {
-                    // Toffoli decomposition into Clifford+T gates
-                    for qubits in cmd.qubits.chunks_exact(3) {
-                        let c0 = usize::from(qubits[0]) as i32;
-                        let c1 = usize::from(qubits[1]) as i32;
-                        let target = usize::from(qubits[2]) as i32;
-                        unsafe {
-                            (self.backend.apply_hadamard)(self.qureg_handle, target);
-                            (self.backend.apply_cnot)(self.qureg_handle, c1, target);
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                target,
-                                -std::f64::consts::FRAC_PI_4,
-                            ); // Tdg
-                            (self.backend.apply_cnot)(self.qureg_handle, c0, target);
-                            (self.backend.apply_t_gate)(self.qureg_handle, target);
-                            (self.backend.apply_cnot)(self.qureg_handle, c1, target);
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                target,
-                                -std::f64::consts::FRAC_PI_4,
-                            ); // Tdg
-                            (self.backend.apply_cnot)(self.qureg_handle, c0, target);
-                            (self.backend.apply_t_gate)(self.qureg_handle, c1);
-                            (self.backend.apply_t_gate)(self.qureg_handle, target);
-                            (self.backend.apply_cnot)(self.qureg_handle, c0, c1);
-                            (self.backend.apply_hadamard)(self.qureg_handle, target);
-                            (self.backend.apply_t_gate)(self.qureg_handle, c0);
-                            (self.backend.apply_phase_shift)(
-                                self.qureg_handle,
-                                c1,
-                                -std::f64::consts::FRAC_PI_4,
-                            ); // Tdg
-                            (self.backend.apply_cnot)(self.qureg_handle, c0, c1);
-                        }
-                    }
-                }
-                GateType::SX => {
-                    // SX = RX(pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_rotation_x)(
-                                self.qureg_handle,
-                                qubit,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::SXdg => {
-                    // SXdg = RX(-pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_rotation_x)(
-                                self.qureg_handle,
-                                qubit,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::F => {
-                    // F = SX · SZ = RX(pi/2) · RZ(pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_rotation_z)(
-                                self.qureg_handle,
-                                qubit,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_rotation_x)(
-                                self.qureg_handle,
-                                qubit,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::Fdg => {
-                    // Fdg = F† = SZ† · SX† = RZ(-pi/2) · RX(-pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_rotation_x)(
-                                self.qureg_handle,
-                                qubit,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_rotation_z)(
-                                self.qureg_handle,
-                                qubit,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::SY => {
-                    // SY = RY(pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_rotation_y)(
-                                self.qureg_handle,
-                                qubit,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::SYdg => {
-                    // SYdg = RY(-pi/2)
-                    for q in &cmd.qubits {
-                        let qubit = usize::from(*q) as i32;
-                        unsafe {
-                            (self.backend.apply_rotation_y)(
-                                self.qureg_handle,
-                                qubit,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                        }
-                    }
-                }
-                GateType::SXX => {
-                    // SXX = RXX(pi/2): decompose as H⊗H · SZZ · H⊗H
-                    // Or equivalently: CNOT(a,b) · RX(pi/2, b) · CNOT(a,b)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_x)(
-                                self.qureg_handle,
-                                b,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::SXXdg => {
-                    // SXXdg = RXX(-pi/2)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_x)(
-                                self.qureg_handle,
-                                b,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::SYY => {
-                    // SYY = RYY(pi/2): decompose as CNOT(a,b) · RY(pi/2, b) · CNOT(a,b)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_y)(
-                                self.qureg_handle,
-                                b,
-                                std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::SYYdg => {
-                    // SYYdg = RYY(-pi/2)
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_y)(
-                                self.qureg_handle,
-                                b,
-                                -std::f64::consts::FRAC_PI_2,
-                            );
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::I
-                | GateType::Idle
-                | GateType::Custom
-                | GateType::MeasCrosstalkLocalPayload
-                | GateType::MeasCrosstalkGlobalPayload
-                | GateType::QFree => {
-                    // No operation needed (Custom is a placeholder whose actual gate name is in metadata)
-                }
-                GateType::RXX => {
-                    // RXX(theta) = CNOT(a,b) · RX(theta, b) · CNOT(a,b)
-                    if cmd.angles.is_empty() {
-                        return Err(PecosError::Processing(
-                            "RXX gate requires at least one angle".to_string(),
-                        ));
-                    }
-                    let theta = cmd.angles[0].to_radians();
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_x)(self.qureg_handle, b, theta);
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::RYY => {
-                    // RYY(theta) = CNOT(a,b) · RY(theta, b) · CNOT(a,b)
-                    if cmd.angles.is_empty() {
-                        return Err(PecosError::Processing(
-                            "RYY gate requires at least one angle".to_string(),
-                        ));
-                    }
-                    let theta = cmd.angles[0].to_radians();
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_y)(self.qureg_handle, b, theta);
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-                GateType::RXXRYYRZZ | GateType::U2q => {
-                    // RXXRYYRZZ(a,b,c) = RXX(a) · RYY(b) · RZZ(c)
-                    if cmd.angles.len() < 3 {
-                        return Err(PecosError::Processing(
-                            "RXXRYYRZZ gate requires three angles".to_string(),
-                        ));
-                    }
-                    let theta_xx = cmd.angles[0].to_radians();
-                    let theta_yy = cmd.angles[1].to_radians();
-                    let theta_zz = cmd.angles[2].to_radians();
-                    for qubits in cmd.qubits.chunks_exact(2) {
-                        let (a, b) = (usize::from(qubits[0]) as i32, usize::from(qubits[1]) as i32);
-                        unsafe {
-                            // RXX
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_x)(self.qureg_handle, b, theta_xx);
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            // RYY
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_y)(self.qureg_handle, b, theta_yy);
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            // RZZ
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                            (self.backend.apply_rotation_z)(self.qureg_handle, b, theta_zz);
-                            (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                        }
-                    }
-                }
-            }
-        }
-
-        // Create a message with the measurement results
-        let mut builder = ByteMessage::outcomes_builder();
-        let outcomes: Vec<usize> = measurements.iter().map(|&m| m as usize).collect();
-        builder.add_outcomes(&outcomes);
-
-        Ok(builder.build())
-    }
-
-    fn reset(&mut self) -> Result<(), PecosError> {
-        unsafe {
-            (self.backend.init_zero_state)(self.qureg_handle);
-        }
-        Ok(())
-    }
-}
-
-impl QuantumEngine for QuestCudaStateVecEngine {
-    fn set_seed(&mut self, _seed: u64) {
-        // CUDA backend doesn't currently support seeding via the loaded library
-        // The seed would need to be passed to QuEST's internal RNG
-        log::warn!("set_seed not yet implemented for CUDA backend");
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_any_mut(&mut self) -> &mut dyn Any {
-        self
-    }
-}
-
-// ============================================================================
-// CliffordGateable and ArbitraryRotationGateable implementations for CUDA engine
-// ============================================================================
-
-impl QuantumSimulator for QuestCudaStateVecEngine {
-    fn reset(&mut self) -> &mut Self {
-        unsafe {
-            (self.backend.init_zero_state)(self.qureg_handle);
-        }
-        self
-    }
-}
-
-impl CliffordGateable for QuestCudaStateVecEngine {
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            unsafe {
-                (self.backend.apply_s_gate)(self.qureg_handle, q.index() as i32);
-            }
-        }
-        self
-    }
-
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            unsafe {
-                (self.backend.apply_hadamard)(self.qureg_handle, q.index() as i32);
-            }
-        }
-        self
-    }
-
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q0, q1) in pairs {
-            unsafe {
-                (self.backend.apply_cnot)(self.qureg_handle, q0.index() as i32, q1.index() as i32);
-            }
-        }
-        self
-    }
-
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
-        qubits
-            .iter()
-            .map(|&q| {
-                let outcome =
-                    unsafe { (self.backend.measure)(self.qureg_handle, q.index() as i32) };
-                MeasurementResult {
-                    outcome: outcome != 0,
-                    is_deterministic: false, // CUDA backend doesn't report determinism
-                }
-            })
-            .collect()
-    }
-}
-
-impl ArbitraryRotationGateable for QuestCudaStateVecEngine {
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            unsafe {
-                (self.backend.apply_rotation_x)(self.qureg_handle, q.index() as i32, theta);
-            }
-        }
-        self
-    }
-
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            unsafe {
-                (self.backend.apply_rotation_z)(self.qureg_handle, q.index() as i32, theta);
-            }
-        }
-        self
-    }
-
-    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
-    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        // RZZ(theta) = exp(-i * theta/2 * Z⊗Z)
-        // Decomposition: CNOT(q1,q2) . RZ(theta, q2) . CNOT(q1,q2)
-        for &(q0, q1) in pairs {
-            let a = q0.index() as i32;
-            let b = q1.index() as i32;
-            unsafe {
-                (self.backend.apply_cnot)(self.qureg_handle, a, b);
-                (self.backend.apply_rotation_z)(self.qureg_handle, b, theta);
-                (self.backend.apply_cnot)(self.qureg_handle, a, b);
-            }
-        }
-        self
-    }
-}
diff --git a/crates/pecos-quest/src/tests.rs b/crates/pecos-quest/src/tests.rs
deleted file mode 100644
index 9207500ae..000000000
--- a/crates/pecos-quest/src/tests.rs
+++ /dev/null
@@ -1,547 +0,0 @@
-//! Tests for `QuEST` quantum simulator wrapper
-
-#[cfg(test)]
-use crate::{QuestDensityMatrix, QuestStateVec};
-#[cfg(test)]
-use num_complex::Complex64;
-#[cfg(test)]
-use pecos_core::{Angle64, QubitId, qid};
-#[cfg(test)]
-use pecos_num::assert_relative_eq;
-#[cfg(test)]
-use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable, QuantumSimulator};
-#[cfg(test)]
-use std::f64::consts::{FRAC_PI_2, FRAC_PI_4, PI};
-
-const EPSILON: f64 = 1e-10;
-
-// Helper function to check if complex numbers are approximately equal
-#[cfg(test)]
-fn assert_complex_eq(a: Complex64, b: Complex64, epsilon: f64) {
-    assert_relative_eq!(a.re, b.re, epsilon = epsilon);
-    assert_relative_eq!(a.im, b.im, epsilon = epsilon);
-}
-
-#[test]
-fn test_statevec_creation() {
-    let sim = QuestStateVec::new(4);
-    assert_eq!(sim.num_qubits(), 4);
-}
-
-#[test]
-fn test_statevec_with_seed() {
-    let sim: QuestStateVec = QuestStateVec::with_seed(3, 42);
-    assert_eq!(sim.num_qubits(), 3);
-}
-
-#[test]
-fn test_initial_state_is_zero() {
-    let sim = QuestStateVec::new(2);
-    // |00⟩ state should have amplitude 1 at index 0
-    let amp = sim.get_amplitude(0);
-    assert_complex_eq(amp, Complex64::new(1.0, 0.0), EPSILON);
-
-    // All other amplitudes should be 0
-    for i in 1..4 {
-        let amp = sim.get_amplitude(i);
-        assert_complex_eq(amp, Complex64::new(0.0, 0.0), EPSILON);
-    }
-}
-
-#[test]
-fn test_reset() {
-    let mut sim = QuestStateVec::new(2);
-
-    // Apply some gates
-    sim.h(&qid(0)).x(&qid(1));
-
-    // Reset should return to |00⟩
-    sim.reset();
-
-    let amp = sim.get_amplitude(0);
-    assert_complex_eq(amp, Complex64::new(1.0, 0.0), EPSILON);
-}
-
-#[test]
-fn test_pauli_x_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Apply X gate: |0⟩ -> |1⟩
-    sim.x(&qid(0));
-
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(1.0, 0.0), EPSILON);
-}
-
-#[test]
-fn test_pauli_y_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Apply Y gate: |0⟩ -> i|1⟩
-    sim.y(&qid(0));
-
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(0.0, 1.0), EPSILON);
-}
-
-#[test]
-fn test_pauli_z_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Prepare |1⟩ state
-    sim.x(&qid(0));
-    // Apply Z gate: |1⟩ -> -|1⟩
-    sim.z(&qid(0));
-
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(-1.0, 0.0), EPSILON);
-}
-
-#[test]
-fn test_hadamard_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Apply H gate: |0⟩ -> (|0⟩ + |1⟩)/√2
-    sim.h(&qid(0));
-
-    let sqrt2_inv = 1.0 / 2.0_f64.sqrt();
-    assert_complex_eq(
-        sim.get_amplitude(0),
-        Complex64::new(sqrt2_inv, 0.0),
-        EPSILON,
-    );
-    assert_complex_eq(
-        sim.get_amplitude(1),
-        Complex64::new(sqrt2_inv, 0.0),
-        EPSILON,
-    );
-}
-
-#[test]
-fn test_s_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Prepare |1⟩ state
-    sim.x(&qid(0));
-    // Apply S gate: |1⟩ -> i|1⟩
-    sim.sz(&qid(0));
-
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(0.0, 1.0), EPSILON);
-}
-
-#[test]
-fn test_t_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Prepare |1⟩ state
-    sim.x(&qid(0));
-    // Apply T gate: |1⟩ -> e^(iπ/4)|1⟩
-    sim.t(&qid(0));
-
-    let expected = Complex64::from_polar(1.0, FRAC_PI_4);
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(1), expected, EPSILON);
-}
-
-#[test]
-fn test_cnot_gate() {
-    let mut sim = QuestStateVec::new(2);
-
-    // Test CNOT with control=0, target=1
-    // |00⟩ -> |00⟩
-    sim.cx(&[(QubitId(0), QubitId(1))]);
-    assert_complex_eq(sim.get_amplitude(0b00), Complex64::new(1.0, 0.0), EPSILON);
-
-    sim.reset();
-
-    // |10⟩ -> |11⟩
-    sim.x(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-    assert_complex_eq(sim.get_amplitude(0b11), Complex64::new(1.0, 0.0), EPSILON);
-}
-
-#[test]
-fn test_cz_gate() {
-    let mut sim = QuestStateVec::new(2);
-
-    // Prepare |11⟩ state
-    sim.x(&qid(0)).x(&qid(1));
-    // Apply CZ: |11⟩ -> -|11⟩
-    sim.cz(&[(QubitId(0), QubitId(1))]);
-
-    assert_complex_eq(sim.get_amplitude(0b11), Complex64::new(-1.0, 0.0), EPSILON);
-}
-
-#[test]
-fn test_bell_state_preparation() {
-    let mut sim = QuestStateVec::new(2);
-
-    // Create Bell state (|00⟩ + |11⟩)/√2
-    sim.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-    let sqrt2_inv = 1.0 / 2.0_f64.sqrt();
-    assert_complex_eq(
-        sim.get_amplitude(0b00),
-        Complex64::new(sqrt2_inv, 0.0),
-        EPSILON,
-    );
-    assert_complex_eq(sim.get_amplitude(0b01), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(0b10), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(
-        sim.get_amplitude(0b11),
-        Complex64::new(sqrt2_inv, 0.0),
-        EPSILON,
-    );
-}
-
-#[test]
-fn test_rotation_gates() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Test Rx(π) = X
-    sim.rx(Angle64::from_radians(PI), &qid(0));
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), 1e-9);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(0.0, -1.0), 1e-9); // Note: -i|1⟩ due to phase
-
-    sim.reset();
-
-    // Test Ry(π) = Y (up to global phase)
-    sim.ry(Angle64::from_radians(PI), &qid(0));
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(0.0, 0.0), 1e-9);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(1.0, 0.0), 1e-9);
-
-    sim.reset();
-
-    // Test Rz(π) on |+⟩ state
-    sim.h(&qid(0)).rz(Angle64::from_radians(PI), &qid(0));
-    // QuEST uses the convention RZ(θ) = diag(e^(-iθ/2), e^(iθ/2))
-    // So RZ(π) on |+⟩ gives (e^(-iπ/2)|0⟩ + e^(iπ/2)|1⟩)/√2 = (-i|0⟩ + i|1⟩)/√2
-    let sqrt2_inv = 1.0 / 2.0_f64.sqrt();
-    assert_relative_eq!(sim.get_amplitude(0).im, -sqrt2_inv, epsilon = 1e-9);
-    assert_relative_eq!(sim.get_amplitude(1).im, sqrt2_inv, epsilon = 1e-9);
-    assert_relative_eq!(sim.get_amplitude(0).re, 0.0, epsilon = 1e-9);
-    assert_relative_eq!(sim.get_amplitude(1).re, 0.0, epsilon = 1e-9);
-}
-
-#[test]
-fn test_measurement() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Measure |0⟩ state - should always give 0
-    let result = sim.mz(&qid(0));
-    assert!(!result[0].outcome); // 0 outcome
-    assert!(result[0].is_deterministic);
-
-    // After measurement, state should still be |0⟩
-    assert_complex_eq(sim.get_amplitude(0), Complex64::new(1.0, 0.0), EPSILON);
-    assert_complex_eq(sim.get_amplitude(1), Complex64::new(0.0, 0.0), EPSILON);
-}
-
-#[test]
-fn test_measurement_after_x() {
-    let mut sim = QuestStateVec::new(1);
-    sim.x(&qid(0));
-
-    // Measure |1⟩ state - should always give 1
-    let result = sim.mz(&qid(0));
-    assert!(result[0].outcome); // 1 outcome
-    assert!(result[0].is_deterministic);
-}
-
-#[test]
-fn test_method_chaining() {
-    let mut sim = QuestStateVec::new(3);
-
-    // Test that method chaining works
-    sim.h(&qid(0))
-        .cx(&[(QubitId(0), QubitId(1))])
-        .cx(&[(QubitId(1), QubitId(2))])
-        .h(&qid(2))
-        .z(&qid(1))
-        .y(&qid(0));
-
-    // Just check it doesn't crash and returns valid amplitudes
-    let _ = sim.get_amplitude(0);
-}
-
-// Density matrix tests
-#[test]
-fn test_density_matrix_creation() {
-    let sim = QuestDensityMatrix::new(3);
-    assert_eq!(sim.num_qubits(), 3);
-}
-
-#[test]
-fn test_density_matrix_purity() {
-    let sim = QuestDensityMatrix::new(1);
-    // Pure state should have purity = 1
-    assert_relative_eq!(sim.purity(), 1.0, epsilon = EPSILON);
-}
-
-#[test]
-fn test_density_matrix_operations() {
-    let mut sim = QuestDensityMatrix::new(2);
-
-    // Apply gates
-    sim.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-    // Check probabilities (diagonal elements)
-    let p0 = sim.probability(0);
-    let p3 = sim.probability(3);
-
-    // For Bell state, should have equal probabilities for |00⟩ and |11⟩
-    assert_relative_eq!(p0, 0.5, epsilon = 1e-9);
-    assert_relative_eq!(p3, 0.5, epsilon = 1e-9);
-}
-
-#[test]
-fn test_density_matrix_reset() {
-    let mut sim = QuestDensityMatrix::new(1);
-
-    sim.x(&qid(0));
-    sim.reset();
-
-    // After reset, should be in |0⟩⟨0| state
-    assert_relative_eq!(sim.probability(0), 1.0, epsilon = EPSILON);
-    assert_relative_eq!(sim.probability(1), 0.0, epsilon = EPSILON);
-}
-
-// Thread safety tests
-#[test]
-fn test_send_sync() {
-    fn assert_send_sync<T: Send + Sync>() {}
-    assert_send_sync::<QuestStateVec>();
-    assert_send_sync::<QuestDensityMatrix>();
-}
-
-#[test]
-fn test_parallel_simulators() {
-    use std::thread;
-
-    let handles: Vec<_> = (0..4)
-        .map(|i| {
-            thread::spawn(move || {
-                let mut sim: QuestStateVec = QuestStateVec::with_seed(2, i);
-                sim.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-                // Each thread should create a valid Bell state
-                let amp00 = sim.get_amplitude(0);
-                let amp11 = sim.get_amplitude(3);
-
-                let sqrt2_inv = 1.0 / 2.0_f64.sqrt();
-                assert_relative_eq!(amp00.norm(), sqrt2_inv, epsilon = 1e-9);
-                assert_relative_eq!(amp11.norm(), sqrt2_inv, epsilon = 1e-9);
-            })
-        })
-        .collect();
-
-    // All threads should complete successfully
-    for handle in handles {
-        handle.join().unwrap();
-    }
-}
-
-#[test]
-fn test_clone_independence() {
-    let mut sim1 = QuestStateVec::new(2);
-    let sim2 = sim1.clone();
-
-    // Modify sim1 - X on qubit 0 should flip |00⟩ to |10⟩
-    sim1.x(&qid(0));
-
-    // sim2 should be unaffected (still in |00⟩)
-    assert_complex_eq(sim2.get_amplitude(0), Complex64::new(1.0, 0.0), EPSILON);
-    assert_complex_eq(sim2.get_amplitude(1), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim2.get_amplitude(2), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim2.get_amplitude(3), Complex64::new(0.0, 0.0), EPSILON);
-
-    // sim1 should be modified (now in |10⟩)
-    assert_complex_eq(sim1.get_amplitude(0), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim1.get_amplitude(1), Complex64::new(0.0, 0.0), EPSILON);
-    assert_complex_eq(sim1.get_amplitude(2), Complex64::new(1.0, 0.0), EPSILON);
-    assert_complex_eq(sim1.get_amplitude(3), Complex64::new(0.0, 0.0), EPSILON);
-}
-
-#[test]
-#[should_panic(expected = "Invalid qubit index")]
-fn test_invalid_qubit_index() {
-    let mut sim = QuestStateVec::new(2);
-    sim.x(&qid(2)); // Should panic - only qubits 0 and 1 exist
-}
-
-#[test]
-fn test_tdg_gate() {
-    let mut sim = QuestStateVec::new(1);
-
-    // Prepare |1⟩ state
-    sim.x(&qid(0));
-    // Apply T† gate: |1⟩ -> e^(-iπ/4)|1⟩
-    sim.tdg(&qid(0));
-
-    let expected = Complex64::from_polar(1.0, -FRAC_PI_4);
-    assert_complex_eq(sim.get_amplitude(1), expected, EPSILON);
-}
-
-#[test]
-fn test_rzz_gate() {
-    let mut sim = QuestStateVec::new(2);
-
-    // Prepare |11⟩ state
-    sim.x(&qid(0)).x(&qid(1));
-
-    // Apply RZZ(π/2)
-    sim.rzz(
-        Angle64::from_radians(FRAC_PI_2),
-        &[(QubitId(0), QubitId(1))],
-    );
-
-    // QuEST's RZZ appears to apply a different scaling
-    // RZZ(π/2) on |11⟩ gives phase -π instead of -π/4
-    let expected = Complex64::new(-1.0, 0.0); // e^(-iπ) = -1
-    assert_complex_eq(sim.get_amplitude(0b11), expected, 1e-9);
-}
-
-// RNG management tests
-#[test]
-fn test_rng_management() {
-    use pecos_core::rng::RngManageable;
-    use pecos_random::PecosRng;
-
-    let mut sim = QuestStateVec::new(2);
-
-    // Set a new RNG
-    let new_rng = PecosRng::seed_from_u64(12345);
-    sim.set_rng(new_rng);
-
-    // Should be able to get RNG reference
-    let _ = sim.rng();
-    let _ = sim.rng_mut();
-}
-
-#[test]
-fn test_set_seed() {
-    use pecos_core::rng::RngManageable;
-
-    let mut sim = QuestStateVec::new(2);
-    sim.set_seed(9999);
-
-    // Subsequent random operations should be deterministic
-    // (though we don't have random operations in basic gates)
-}
-
-#[test]
-fn test_measurement_determinism_with_seed() {
-    // Test that measurements are deterministic when using the same seed
-    let seed = 42;
-    let num_measurements = 100;
-
-    // Run first simulation - repeatedly prepare and measure
-    let mut sim1: QuestStateVec = QuestStateVec::with_seed(2, seed);
-    let mut results1 = Vec::new();
-    for _ in 0..num_measurements {
-        sim1.reset();
-        sim1.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]); // Create Bell state
-        results1.push(sim1.mz(&qid(0))[0].outcome);
-    }
-
-    // Run second simulation with same seed - repeatedly prepare and measure
-    let mut sim2: QuestStateVec = QuestStateVec::with_seed(2, seed);
-    let mut results2 = Vec::new();
-    for _ in 0..num_measurements {
-        sim2.reset();
-        sim2.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]); // Create same Bell state
-        results2.push(sim2.mz(&qid(0))[0].outcome);
-    }
-
-    // Results should be identical
-    assert_eq!(
-        results1, results2,
-        "Measurements with same seed should produce identical results"
-    );
-}
-
-#[test]
-fn test_measurement_randomness_with_different_seeds() {
-    // Test that measurements show randomness when using different seeds
-    // This is deterministic because we control the seeds
-    let num_trials = 30;
-
-    let mut all_results = Vec::new();
-
-    for i in 0_u64..num_trials {
-        // Use different seeds for each trial to ensure different random streams
-        let mut sim: QuestStateVec = QuestStateVec::with_seed(1, 12345 + i);
-        sim.h(&qid(0)); // Create superposition
-        all_results.push(sim.mz(&qid(0))[0].outcome);
-    }
-
-    // With 30 different seeds measuring a superposition, we expect variation
-    // This test is deterministic given the seeds
-    let all_same = all_results.iter().all(|&x| x == all_results[0]);
-    assert!(
-        !all_same,
-        "Measurements with different seeds should show variation in outcomes"
-    );
-}
-
-#[test]
-fn test_different_seeds_produce_different_results() {
-    // Test that different seeds produce different measurement sequences
-    let num_measurements = 50;
-
-    let mut results_seed1 = Vec::new();
-    let mut results_seed2 = Vec::new();
-
-    // Seed 1 - repeatedly prepare and measure
-    let mut sim1: QuestStateVec = QuestStateVec::with_seed(1, 12345);
-    for _ in 0..num_measurements {
-        sim1.reset(); // Reset to |0⟩
-        sim1.h(&qid(0)); // Create superposition
-        results_seed1.push(sim1.mz(&qid(0))[0].outcome);
-    }
-
-    // Seed 2 (different) - repeatedly prepare and measure
-    let mut sim2: QuestStateVec = QuestStateVec::with_seed(1, 67890);
-    for _ in 0..num_measurements {
-        sim2.reset(); // Reset to |0⟩
-        sim2.h(&qid(0)); // Create superposition
-        results_seed2.push(sim2.mz(&qid(0))[0].outcome);
-    }
-
-    // Different seeds should produce different sequences
-    assert_ne!(
-        results_seed1, results_seed2,
-        "Different seeds should produce different measurement sequences"
-    );
-}
-
-#[test]
-fn test_density_matrix_measurement_determinism_with_seed() {
-    // Same test for QuestDensityMatrix
-    let seed = 123;
-    let num_measurements = 100;
-
-    // Run first simulation - repeatedly prepare and measure
-    let mut sim1: QuestDensityMatrix = QuestDensityMatrix::with_seed(2, seed);
-    let mut results1 = Vec::new();
-    for _ in 0..num_measurements {
-        sim1.reset();
-        sim1.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]); // Create Bell state
-        results1.push(sim1.mz(&qid(0))[0].outcome);
-    }
-
-    // Run second simulation with same seed - repeatedly prepare and measure
-    let mut sim2: QuestDensityMatrix = QuestDensityMatrix::with_seed(2, seed);
-    let mut results2 = Vec::new();
-    for _ in 0..num_measurements {
-        sim2.reset();
-        sim2.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]); // Create same Bell state
-        results2.push(sim2.mz(&qid(0))[0].outcome);
-    }
-
-    // Results should be identical
-    assert_eq!(
-        results1, results2,
-        "Density matrix measurements with same seed should produce identical results"
-    );
-}
diff --git a/crates/pecos-quest/tests/basic_test.rs b/crates/pecos-quest/tests/basic_test.rs
deleted file mode 100644
index d09aed699..000000000
--- a/crates/pecos-quest/tests/basic_test.rs
+++ /dev/null
@@ -1,386 +0,0 @@
-//! Basic tests for the `QuEST` wrapper using PECOS-style API
-
-use pecos_core::{Angle64, QubitId, qid};
-use pecos_num::assert_relative_eq;
-use pecos_quest::{ArbitraryRotationGateable, CliffordGateable, QuantumSimulator, QuestStateVec};
-use pecos_random::PecosRng;
-
-#[test]
-fn test_state_creation() {
-    let state = QuestStateVec::new(5);
-    assert_eq!(state.num_qubits(), 5);
-
-    // Check that initial state is |00000>
-    let prob = state.probability(0);
-    assert_relative_eq!(prob, 1.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_state_with_seed() {
-    let state1: QuestStateVec<PecosRng> = QuestStateVec::with_seed(3, 42);
-    let state2: QuestStateVec<PecosRng> = QuestStateVec::with_seed(3, 42);
-
-    assert_eq!(state1.num_qubits(), 3);
-    assert_eq!(state2.num_qubits(), 3);
-
-    // Both should be in the same initial state
-    assert_relative_eq!(
-        state1.probability(0),
-        state2.probability(0),
-        epsilon = 1e-10
-    );
-}
-
-#[test]
-fn test_computational_basis_preparation() {
-    let mut state = QuestStateVec::new(2);
-
-    // Prepare |01> (binary 10 = decimal 2)
-    state.prepare_computational_basis(0b10);
-
-    assert_relative_eq!(state.probability(0b00), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(0b01), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(0b10), 1.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(0b11), 0.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_plus_state_preparation() {
-    let mut state = QuestStateVec::new(2);
-    state.prepare_plus_state();
-
-    // Each basis state should have probability 1/4
-    let expected_prob = 0.25;
-    for i in 0..4 {
-        assert_relative_eq!(state.probability(i), expected_prob, epsilon = 1e-10);
-    }
-}
-
-#[test]
-fn test_state_access() {
-    let state = QuestStateVec::new(2);
-
-    // Initially |00>
-    // Check amplitude of |00>
-    let amp0 = state.get_amplitude(0);
-    assert_relative_eq!(amp0.re, 1.0, epsilon = 1e-10);
-    assert_relative_eq!(amp0.im, 0.0, epsilon = 1e-10);
-
-    // Check other amplitudes are zero
-    for i in 1..4 {
-        let amp = state.get_amplitude(i);
-        assert_relative_eq!(amp.re, 0.0, epsilon = 1e-10);
-        assert_relative_eq!(amp.im, 0.0, epsilon = 1e-10);
-    }
-}
-
-#[test]
-fn test_reset() {
-    let mut state = QuestStateVec::new(2);
-
-    // Change the state
-    state.prepare_computational_basis(3);
-    assert_relative_eq!(state.probability(3), 1.0, epsilon = 1e-10);
-
-    // Reset should bring back to |00>
-    state.reset();
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(3), 0.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_pauli_gates() {
-    let mut state = QuestStateVec::new(1);
-
-    // Test Pauli-X: |0> -> |1>
-    state.reset();
-    state.x(&qid(0));
-    assert_relative_eq!(state.probability(0), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(1), 1.0, epsilon = 1e-10);
-
-    // Test Pauli-Z on |1>: should add phase but not change probabilities
-    state.z(&qid(0));
-    assert_relative_eq!(state.probability(0), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(1), 1.0, epsilon = 1e-10);
-
-    // Test Pauli-Y: X*Z = iY, so after X then Z, we should have i|1>
-    // Probability should still be 1 for |1>
-    state.reset().x(&qid(0)).y(&qid(0));
-    // Y|1> = -i|0>, so we should be in |0>
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(1), 0.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_hadamard_gate() {
-    let mut state = QuestStateVec::new(1);
-
-    // H|0> = |+> = (|0> + |1>)/sqrt(2)
-    state.h(&qid(0));
-
-    let expected_prob = 0.5;
-    assert_relative_eq!(state.probability(0), expected_prob, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(1), expected_prob, epsilon = 1e-10);
-}
-
-#[test]
-fn test_s_gates() {
-    let mut state = QuestStateVec::new(1);
-
-    // S|0> = |0>, probability unchanged
-    state.sz(&qid(0));
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-
-    // S†S = I, so applying S then S† should be identity
-    state.szdg(&qid(0));
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_cnot_gate() {
-    let mut state = QuestStateVec::new(2);
-
-    // CNOT|00> = |00>
-    state.cx(&[(QubitId(0), QubitId(1))]);
-    assert_relative_eq!(state.probability(0b00), 1.0, epsilon = 1e-10);
-
-    // In PECOS convention (qubit 0 = LSB):
-    // - State 0b01 has qubit 0 = 1 (control set), qubit 1 = 0
-    // - State 0b10 has qubit 0 = 0 (control clear), qubit 1 = 1
-
-    // Prepare state with control qubit 0 = 1, apply CNOT(0,1) -> target flips
-    state.prepare_computational_basis(0b01); // qubit 0 = 1, qubit 1 = 0
-    state.cx(&[(QubitId(0), QubitId(1))]); // control=0 is set, so target=1 flips: 0->1
-    assert_relative_eq!(state.probability(0b11), 1.0, epsilon = 1e-10); // qubit 0 = 1, qubit 1 = 1
-
-    // Prepare state with control qubit 0 = 0, apply CNOT(0,1) -> no change
-    state.prepare_computational_basis(0b10); // qubit 0 = 0, qubit 1 = 1
-    state.cx(&[(QubitId(0), QubitId(1))]); // control=0 is clear, target doesn't flip
-    assert_relative_eq!(state.probability(0b10), 1.0, epsilon = 1e-10); // unchanged
-}
-
-#[test]
-fn test_cz_gate() {
-    let mut state = QuestStateVec::new(2);
-
-    // CZ|00> = |00>
-    state.cz(&[(QubitId(0), QubitId(1))]);
-    assert_relative_eq!(state.probability(0b00), 1.0, epsilon = 1e-10);
-
-    // CZ|11> = -|11> (same probability)
-    state.prepare_computational_basis(0b11);
-    state.cz(&[(QubitId(0), QubitId(1))]);
-    assert_relative_eq!(state.probability(0b11), 1.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_bell_state_creation() {
-    let mut state = QuestStateVec::new(2);
-
-    // Create Bell state: H(0) then CNOT(0,1)
-    state.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-    // Should have equal probability for |00> and |11>
-    assert_relative_eq!(state.probability(0b00), 0.5, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(0b01), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(0b10), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(0b11), 0.5, epsilon = 1e-10);
-}
-
-#[test]
-fn test_measurement() {
-    let mut state = QuestStateVec::new(1);
-
-    // Measure |0>
-    let result = state.mz(&qid(0));
-    assert!(!result[0].outcome); // |0> corresponds to false
-    assert!(result[0].is_deterministic);
-
-    // Measure |1>
-    state.prepare_computational_basis(1);
-    let result = state.mz(&qid(0));
-    assert!(result[0].outcome); // |1> corresponds to true
-    assert!(result[0].is_deterministic);
-
-    // Measure superposition state
-    state.reset().h(&qid(0));
-    let result = state.mz(&qid(0));
-    // Should not be deterministic (though this is probabilistic)
-    // For a superposition state, measurement is non-deterministic
-    assert!(!result[0].is_deterministic);
-}
-
-#[test]
-fn test_rotation_gates() {
-    use std::f64::consts::PI;
-
-    let mut state = QuestStateVec::new(1);
-
-    // RX(π) = -iX, so RX(π)|0> should give |1>
-    state.rx(Angle64::from_radians(PI), &qid(0));
-    assert_relative_eq!(state.probability(0), 0.0, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(1), 1.0, epsilon = 1e-10);
-
-    // RZ doesn't change computational basis probabilities
-    state.reset();
-    state.rz(Angle64::from_radians(PI / 2.0), &qid(0));
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-
-    // RY(π/2) should create superposition
-    state.reset();
-    state.ry(Angle64::from_radians(PI / 2.0), &qid(0));
-    assert_relative_eq!(state.probability(0), 0.5, epsilon = 1e-10);
-    assert_relative_eq!(state.probability(1), 0.5, epsilon = 1e-10);
-}
-
-#[test]
-fn test_t_gates() {
-    let mut state = QuestStateVec::new(1);
-
-    // T|0> = |0>, probability unchanged
-    state.t(&qid(0));
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-
-    // T†T = I, so applying T then T† should be identity
-    state.tdg(&qid(0));
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_rzz_gate() {
-    use std::f64::consts::PI;
-
-    let mut state = QuestStateVec::new(2);
-
-    // RZZ doesn't change computational basis probabilities
-    state.rzz(Angle64::from_radians(PI / 2.0), &[(QubitId(0), QubitId(1))]);
-    assert_relative_eq!(state.probability(0), 1.0, epsilon = 1e-10);
-
-    // Test on |11> state
-    state.prepare_computational_basis(0b11);
-    state.rzz(Angle64::from_radians(PI / 2.0), &[(QubitId(0), QubitId(1))]);
-    assert_relative_eq!(state.probability(0b11), 1.0, epsilon = 1e-10);
-}
-
-#[test]
-fn test_method_chaining() {
-    let mut state = QuestStateVec::new(2);
-
-    // Test that all methods return &mut Self for chaining
-    state
-        .reset()
-        .h(&qid(0))
-        .cx(&[(QubitId(0), QubitId(1))])
-        .z(&qid(1))
-        .rx(Angle64::from_radians(std::f64::consts::PI / 4.0), &qid(0));
-
-    // Just verify it compiles and runs
-    assert_eq!(state.num_qubits(), 2);
-}
-
-#[test]
-fn test_gpu_acceleration_status() {
-    let state = QuestStateVec::new(2);
-    let qureg_info = state.get_info();
-    let env_info = state.get_env_info();
-
-    // Print environment status for visibility
-    println!("QuEST Environment Info:");
-    println!("  Multithreaded: {}", env_info.is_multithreaded);
-    println!("  GPU accelerated: {}", env_info.is_gpu_accelerated);
-    println!("  Distributed: {}", env_info.is_distributed);
-    println!("  Rank: {}", env_info.rank);
-    println!("  Num nodes: {}", env_info.num_nodes);
-
-    println!("\nQureg Info:");
-    println!("  Number of qubits: {}", qureg_info.num_qubits);
-    println!("  Number of amplitudes: {}", qureg_info.num_amps);
-    println!("  Is density matrix: {}", qureg_info.is_density_matrix);
-
-    // The direct QuestStateVec wrapper always uses CPU mode.
-    // For GPU acceleration, use the engine builder with .with_gpu().
-    // This is because the CUDA backend is loaded at runtime via dlopen,
-    // allowing a single binary to work on systems with and without CUDA.
-    assert!(
-        !env_info.is_gpu_accelerated,
-        "QuestStateVec should use CPU mode. GPU acceleration is only available \
-         via the engine builder with .with_gpu()."
-    );
-    println!("\nINFO: QuestStateVec uses CPU mode (as expected)");
-    println!("      For GPU acceleration, use quest_state_vec().with_gpu()");
-}
-
-/// Test the CUDA engine through the builder interface
-#[test]
-fn test_cuda_engine_builder() {
-    use pecos_engines::{Engine, QuantumEngineBuilder, byte_message::ByteMessage};
-    use pecos_quest::quest_state_vec;
-
-    println!("\n=== Testing CUDA engine builder ===");
-
-    // Test CPU mode first
-    let mut cpu_builder = quest_state_vec().qubits(2);
-    let mut cpu_engine = cpu_builder.build().expect("Failed to build CPU engine");
-    println!("CPU engine created successfully");
-
-    // Create a Bell state circuit: H(0), CNOT(0,1), measure both
-    let mut msg_builder = ByteMessage::quantum_operations_builder();
-    msg_builder.h(&[0]);
-    msg_builder.cx(&[(0, 1)]);
-    msg_builder.mz(&[0, 1]);
-    let msg = msg_builder.build();
-
-    let result = cpu_engine.process(msg.clone()).expect("CPU process failed");
-    let outcomes = result.outcomes().expect("Failed to get outcomes");
-    println!("CPU measurement outcomes: {outcomes:?}");
-
-    // Verify Bell state outcomes (both qubits should match)
-    assert!(
-        outcomes.len() == 2,
-        "Expected 2 measurement outcomes, got {}",
-        outcomes.len()
-    );
-    assert_eq!(
-        outcomes[0], outcomes[1],
-        "Bell state outcomes should match: got {outcomes:?}"
-    );
-
-    // Now test GPU mode
-    println!("\n=== Testing GPU mode ===");
-    let mut gpu_builder = quest_state_vec().qubits(2).with_gpu();
-    match gpu_builder.build() {
-        Ok(mut gpu_engine) => {
-            println!("GPU engine created successfully!");
-
-            // Reset and run the same circuit
-            gpu_engine.reset().expect("Reset failed");
-
-            let mut msg_builder = ByteMessage::quantum_operations_builder();
-            msg_builder.h(&[0]);
-            msg_builder.cx(&[(0, 1)]);
-            msg_builder.mz(&[0, 1]);
-            let msg = msg_builder.build();
-
-            let result = gpu_engine.process(msg).expect("GPU process failed");
-            let outcomes = result.outcomes().expect("Failed to get outcomes");
-            println!("GPU measurement outcomes: {outcomes:?}");
-
-            // Verify Bell state outcomes
-            assert!(
-                outcomes.len() == 2,
-                "Expected 2 measurement outcomes, got {}",
-                outcomes.len()
-            );
-            assert_eq!(
-                outcomes[0], outcomes[1],
-                "Bell state outcomes should match: got {outcomes:?}"
-            );
-
-            println!("\nSUCCESS: CUDA engine works correctly!");
-        }
-        Err(e) => {
-            println!("GPU engine build failed (expected if CUDA not available): {e}");
-            // Not a failure - CUDA may not be available at runtime
-        }
-    }
-}
diff --git a/crates/pecos-quest/tests/thread_safety.rs b/crates/pecos-quest/tests/thread_safety.rs
deleted file mode 100644
index 994d14736..000000000
--- a/crates/pecos-quest/tests/thread_safety.rs
+++ /dev/null
@@ -1,315 +0,0 @@
-//! Thread safety tests for `QuEST` wrapper
-//! These tests verify that multiple `QuestStateVec` instances can work in parallel
-//! without interfering with each other, which is essential for Monte Carlo simulations.
-
-use pecos_core::{Angle64, QubitId, qid};
-use pecos_num::assert_relative_eq;
-use pecos_quest::{ArbitraryRotationGateable, CliffordGateable, QuantumSimulator, QuestStateVec};
-use pecos_random::PecosRng;
-use std::sync::{Arc, Barrier};
-use std::thread;
-
-#[test]
-fn test_send_sync_traits() {
-    // Compile-time check that QuestStateVec implements Send + Sync
-    fn assert_send_sync<T: Send + Sync>() {}
-    assert_send_sync::<QuestStateVec>();
-}
-
-#[test]
-fn test_parallel_independent_instances() {
-    const NUM_THREADS: usize = 4;
-    const NUM_QUBITS: usize = 3;
-
-    let barrier = Arc::new(Barrier::new(NUM_THREADS));
-    let handles: Vec<_> = (0..NUM_THREADS)
-        .map(|thread_id| {
-            let barrier = Arc::clone(&barrier);
-            thread::spawn(move || {
-                // Each thread gets its own completely independent state
-                let mut state: QuestStateVec<PecosRng> =
-                    QuestStateVec::with_seed(NUM_QUBITS, thread_id as u64 + 42);
-
-                // Wait for all threads to be ready
-                barrier.wait();
-
-                // Each thread performs different operations
-                match thread_id {
-                    0 => {
-                        // Thread 0: Create |000>
-                        state.reset();
-                        let prob = state.probability(0);
-                        assert_relative_eq!(prob, 1.0, epsilon = 1e-10);
-                        prob
-                    }
-                    1 => {
-                        // Thread 1: Create |111>
-                        state.prepare_computational_basis(0b111);
-                        let prob = state.probability(0b111);
-                        assert_relative_eq!(prob, 1.0, epsilon = 1e-10);
-                        prob
-                    }
-                    2 => {
-                        // Thread 2: Create Bell-like state on 3 qubits
-                        // H(0) puts qubit 0 in superposition, CX(0,1) entangles qubits 0 and 1
-                        // Result: (|000> + |011>)/sqrt(2) in |q2 q1 q0> notation
-                        // In PECOS (qubit 0 = LSB): states 0b000 = 0 and 0b011 = 3
-                        state.reset();
-                        state.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-                        let prob_000 = state.probability(0b000);
-                        let prob_011 = state.probability(0b011);
-                        assert_relative_eq!(prob_000, 0.5, epsilon = 1e-10);
-                        assert_relative_eq!(prob_011, 0.5, epsilon = 1e-10);
-                        prob_000 + prob_011
-                    }
-                    3 => {
-                        // Thread 3: Create uniform superposition
-                        state.prepare_plus_state();
-                        let mut total_prob = 0.0;
-                        for i in 0..(1 << NUM_QUBITS) {
-                            let prob = state.probability(i);
-                            assert_relative_eq!(prob, 1.0 / 8.0, epsilon = 1e-10);
-                            total_prob += prob;
-                        }
-                        total_prob
-                    }
-                    _ => unreachable!(),
-                }
-            })
-        })
-        .collect();
-
-    // Collect results from all threads
-    let results: Vec<f64> = handles
-        .into_iter()
-        .map(|handle| handle.join().unwrap())
-        .collect();
-
-    // Verify all threads completed successfully with expected results
-    assert_relative_eq!(results[0], 1.0, epsilon = 1e-10); // |000>
-    assert_relative_eq!(results[1], 1.0, epsilon = 1e-10); // |111>
-    assert_relative_eq!(results[2], 1.0, epsilon = 1e-10); // Bell state total
-    assert_relative_eq!(results[3], 1.0, epsilon = 1e-10); // Plus state total
-}
-
-#[test]
-fn test_parallel_bell_state_measurements() {
-    const NUM_THREADS: usize = 8;
-
-    let handles: Vec<_> = (0..NUM_THREADS)
-        .map(|thread_id| {
-            thread::spawn(move || {
-                let mut state: QuestStateVec<PecosRng> =
-                    QuestStateVec::with_seed(2, thread_id as u64 * 1000);
-
-                // Create Bell state
-                state.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-                // Perform many measurements to verify correlation
-                let mut correlations = Vec::new();
-                for _measurement in 0..20 {
-                    // Reset to Bell state for each measurement
-                    state.reset().h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]);
-
-                    let outcome0 = state.mz(&qid(0))[0].outcome;
-                    let outcome1 = state.mz(&qid(1))[0].outcome;
-
-                    // In Bell state, measurements should be perfectly correlated
-                    correlations.push(outcome0 == outcome1);
-                }
-
-                // Return correlation statistics
-                let correlation_count = correlations.iter().filter(|&&x| x).count();
-                (thread_id, correlation_count, correlations.len())
-            })
-        })
-        .collect();
-
-    let results: Vec<_> = handles
-        .into_iter()
-        .map(|handle| handle.join().unwrap())
-        .collect();
-
-    // Verify all threads completed and got reasonable correlation
-    for (thread_id, correlation_count, total_measurements) in results {
-        println!(
-            "Thread {thread_id}: {correlation_count}/{total_measurements} correlated measurements"
-        );
-
-        // Bell state measurements should be perfectly correlated
-        // (allowing for potential QuEST measurement implementation details)
-        assert_eq!(
-            correlation_count, total_measurements,
-            "Thread {thread_id} had imperfect Bell state correlation"
-        );
-    }
-}
-
-#[test]
-fn test_parallel_rotation_gates() {
-    const NUM_THREADS: usize = 6;
-
-    let handles: Vec<_> = (0..NUM_THREADS)
-        .map(|thread_id| {
-            thread::spawn(move || {
-                use std::f64::consts::PI;
-
-                let mut state = QuestStateVec::new(1);
-
-                match thread_id % 3 {
-                    0 => {
-                        // Test RX rotation
-                        state.rx(Angle64::from_radians(PI), &qid(0)); // RX(π)|0> = i|1>
-                        let prob_1 = state.probability(1);
-                        assert_relative_eq!(prob_1, 1.0, epsilon = 1e-10);
-                        prob_1
-                    }
-                    1 => {
-                        // Test RY rotation
-                        state.ry(Angle64::from_radians(PI / 2.0), &qid(0)); // RY(π/2)|0> = (|0> + |1>)/√2
-                        let prob_0 = state.probability(0);
-                        let prob_1 = state.probability(1);
-                        assert_relative_eq!(prob_0, 0.5, epsilon = 1e-10);
-                        assert_relative_eq!(prob_1, 0.5, epsilon = 1e-10);
-                        prob_0 + prob_1
-                    }
-                    2 => {
-                        // Test RZ rotation (doesn't change computational probabilities)
-                        state.rz(Angle64::from_radians(PI / 4.0), &qid(0)); // RZ only adds phase
-                        let prob_0 = state.probability(0);
-                        assert_relative_eq!(prob_0, 1.0, epsilon = 1e-10);
-                        prob_0
-                    }
-                    _ => unreachable!(),
-                }
-            })
-        })
-        .collect();
-
-    let results: Vec<f64> = handles
-        .into_iter()
-        .map(|handle| handle.join().unwrap())
-        .collect();
-
-    // Verify all rotations worked as expected
-    for result in results {
-        assert_relative_eq!(result, 1.0, epsilon = 1e-10);
-    }
-}
-
-#[test]
-fn test_parallel_cloning_and_states() {
-    const NUM_THREADS: usize = 4;
-
-    let handles: Vec<_> = (0..NUM_THREADS)
-        .map(|thread_id| {
-            thread::spawn(move || {
-                // Create template state
-                let mut template: QuestStateVec<PecosRng> = QuestStateVec::with_seed(2, 12345); // Same seed
-                template.h(&qid(0)).cx(&[(QubitId(0), QubitId(1))]); // Bell state
-
-                // Verify template probabilities
-                let template_00 = template.probability(0b00);
-                let template_11 = template.probability(0b11);
-                assert_relative_eq!(template_00, 0.5, epsilon = 1e-10);
-                assert_relative_eq!(template_11, 0.5, epsilon = 1e-10);
-
-                // Each thread modifies its own copy
-                match thread_id {
-                    0 => template.x(&qid(0)), // Should flip to |10> + |01>
-                    1 => template.z(&qid(0)), // Should add phase
-                    2 => template.h(&qid(1)), // Should create different superposition
-                    3 => template.reset(),    // Should go back to |00>
-                    _ => &mut template,
-                };
-
-                // Return final probabilities to verify independence
-                let mut probs = Vec::new();
-                for i in 0..4 {
-                    probs.push(template.probability(i));
-                }
-                (thread_id, probs)
-            })
-        })
-        .collect();
-
-    let results: Vec<_> = handles
-        .into_iter()
-        .map(|handle| handle.join().unwrap())
-        .collect();
-
-    // Verify that each thread produced different results
-    for (thread_id, probs) in &results {
-        println!("Thread {thread_id}: probabilities = {probs:?}");
-
-        // Each thread should have different probability distributions
-        let total_prob: f64 = probs.iter().sum();
-        assert_relative_eq!(total_prob, 1.0, epsilon = 1e-10);
-    }
-
-    // Verify threads didn't interfere with each other
-    // (Results should be deterministic given same operations)
-    let (_, thread0_probs) = &results[0];
-    let (_, thread3_probs) = &results[3]; // Thread 3 did reset()
-
-    // Thread 3 should be in |00> state
-    assert_relative_eq!(thread3_probs[0], 1.0, epsilon = 1e-10);
-
-    // Thread 0 should be different from thread 3
-    assert!((thread0_probs[0] - thread3_probs[0]).abs() > 1e-5);
-}
-
-#[test]
-fn test_many_parallel_instances() {
-    // Stress test with many threads to catch race conditions
-    const NUM_THREADS: usize = 16;
-
-    let handles: Vec<_> = (0..NUM_THREADS)
-        .map(|thread_id| {
-            thread::spawn(move || {
-                let mut state: QuestStateVec<PecosRng> =
-                    QuestStateVec::with_seed(1, thread_id as u64);
-
-                // Perform a series of operations
-                for i in 0..10 {
-                    match (thread_id + i) % 4 {
-                        0 => {
-                            state.reset();
-                        }
-                        1 => {
-                            state.x(&qid(0));
-                        }
-                        2 => {
-                            state.h(&qid(0));
-                        }
-                        3 => {
-                            state.z(&qid(0));
-                        }
-                        _ => unreachable!(),
-                    }
-                }
-
-                // Final measurement
-                let outcome = state.mz(&qid(0))[0].outcome;
-                (thread_id, outcome)
-            })
-        })
-        .collect();
-
-    let results: Vec<_> = handles
-        .into_iter()
-        .map(|handle| handle.join().unwrap())
-        .collect();
-
-    // Just verify all threads completed successfully
-    assert_eq!(results.len(), NUM_THREADS);
-
-    println!("All {NUM_THREADS} threads completed successfully");
-    for (thread_id, outcome) in results {
-        println!(
-            "Thread {}: final measurement = {}",
-            thread_id,
-            if outcome { "1" } else { "0" }
-        );
-    }
-}
diff --git a/crates/pecos-qulacs/Cargo.toml b/crates/pecos-qulacs/Cargo.toml
deleted file mode 100644
index c91da1f18..000000000
--- a/crates/pecos-qulacs/Cargo.toml
+++ /dev/null
@@ -1,34 +0,0 @@
-[package]
-name = "pecos-qulacs"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
-homepage.workspace = true
-repository.workspace = true
-license.workspace = true
-keywords.workspace = true
-categories.workspace = true
-description = "Qulacs quantum simulator bindings for PECOS"
-readme = "README.md"
-
-[dependencies]
-pecos-core.workspace = true
-pecos-simulators.workspace = true
-pecos-random.workspace = true
-num-complex.workspace = true
-rand.workspace = true
-rand_core.workspace = true
-cxx.workspace = true
-
-[dev-dependencies]
-rand.workspace = true
-
-[build-dependencies]
-cxx-build.workspace = true
-cc.workspace = true
-pecos-build.workspace = true
-log.workspace = true
-env_logger.workspace = true
-
-[lints]
-workspace = true
diff --git a/crates/pecos-qulacs/README.md b/crates/pecos-qulacs/README.md
deleted file mode 100644
index 423e427d8..000000000
--- a/crates/pecos-qulacs/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# pecos-qulacs
-
-Qulacs quantum backend for PECOS.
-
-## Purpose
-
-Wraps the Qulacs C++ state vector simulator for use as a PECOS quantum engine. Provides high-performance quantum circuit simulation.
-
-## Key Types
-
-- `QulacsStateVec` - State vector simulator using Qulacs backend
-
-## Features
-
-- Full Clifford gate set
-- Arbitrary rotation gates (Rx, Ry, Rz, etc.)
-- GPU acceleration (optional)
-- Implements `QuantumSimulator`, `CliffordGateable`, `ArbitraryRotationGateable` traits
-
-## Acknowledgements
-
-This crate wraps [Qulacs](https://github.com/qulacs/qulacs), a high-performance quantum circuit simulator developed by the Qulacs team at Osaka University and QunaSys.
-
-**Paper:**
-- Suzuki, Y., Kawase, Y., Masumura, Y., Hiraga, Y., Nakadai, M., Chen, J., Narasimhan, K., Okada, M., Sugiyama, K., Tan, Y.-Y., Takeshita, T., Yamashita, T., Yoshida, K., Shibasaki, Y., & Yamamoto, N. (2021). "Qulacs: a fast and versatile quantum circuit simulator for research purpose." Quantum, 5, 559. [arXiv:2011.13524](https://arxiv.org/abs/2011.13524)
diff --git a/crates/pecos-qulacs/build.rs b/crates/pecos-qulacs/build.rs
deleted file mode 100644
index 8a45bbeb0..000000000
--- a/crates/pecos-qulacs/build.rs
+++ /dev/null
@@ -1,352 +0,0 @@
-use log::warn;
-use pecos_build::{Manifest, ensure_dep_ready};
-use std::env;
-use std::path::{Path, PathBuf};
-
-fn main() {
-    // Initialize logger for build script
-    env_logger::init();
-
-    setup_rerun_conditions();
-
-    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
-    let target = env::var("TARGET").unwrap_or_default();
-    let is_windows = target.contains("windows");
-
-    // Ensure dependencies are downloaded and extracted to ~/.pecos/deps/
-    let (qulacs_path, eigen_path, boost_path) = download_and_extract_dependencies();
-
-    // Build our wrapper with actual Qulacs
-    let mut build = cxx_build::bridge("src/bridge.rs");
-
-    // Add our wrapper
-    build.file("src/qulacs_wrapper.cpp");
-
-    // Add essential Qulacs source files
-    let qulacs_src = qulacs_path.join("src");
-    add_qulacs_source_files(&mut build, &qulacs_src);
-
-    // Configure includes and compiler flags
-    configure_build(
-        &mut build,
-        &eigen_path,
-        &boost_path,
-        &qulacs_src,
-        &out_dir,
-        is_windows,
-        &target,
-    );
-
-    // Compile everything
-    build.compile("qulacs_wrapper");
-
-    // Add Windows-specific boost exception stub if needed
-    if is_windows {
-        create_windows_boost_stub(&out_dir);
-    }
-
-    // On macOS, link against the system C++ library from dyld shared cache
-    if target.contains("darwin") {
-        println!("cargo:rustc-link-search=native=/usr/lib");
-        println!("cargo:rustc-link-lib=c++");
-        println!("cargo:rustc-link-arg=-Wl,-search_paths_first");
-    }
-}
-
-fn setup_rerun_conditions() {
-    println!("cargo:rerun-if-changed=build.rs");
-    println!("cargo:rerun-if-changed=src/bridge.rs");
-    println!("cargo:rerun-if-changed=src/qulacs_wrapper.cpp");
-    println!("cargo:rerun-if-changed=src/qulacs_wrapper.h");
-}
-
-/// Get the build profile from Cargo's environment
-/// Returns "debug", "release", or "native"
-///
-/// Note: Cargo's PROFILE env var only reports "debug" or "release" even for custom profiles
-/// (due to backward compatibility - see RFC 2678). Custom profiles inherit from these base
-/// profiles, so PROFILE reflects the parent. To detect custom profiles like "native", we
-/// check the `OUT_DIR` path which contains the actual profile directory name.
-///
-/// Profile behavior:
-/// - "debug" -> no C++ optimization, fast compile
-/// - "release" -> full optimization (-O3)
-/// - "native" -> full optimization + CPU-specific (-O3 -march=native)
-fn get_build_profile() -> String {
-    // First check OUT_DIR for custom profile name (e.g., target/native/build/...)
-    // Custom profiles get their own directory under target/
-    if let Ok(out_dir) = env::var("OUT_DIR") {
-        // OUT_DIR looks like: .../target/<profile>/build/<crate>-<hash>/out
-        // We want to extract <profile>
-        let parts: Vec<&str> = out_dir.split(std::path::MAIN_SEPARATOR).collect();
-        if let Some(target_idx) = parts.iter().position(|&p| p == "target")
-            && let Some(profile_name) = parts.get(target_idx + 1)
-        {
-            return match *profile_name {
-                "native" => "native",
-                "release" => "release",
-                "debug" => "debug",
-                _ => {
-                    // Unknown profile, fall back to PROFILE env var
-                    if env::var("PROFILE").as_deref() == Ok("release") {
-                        "release"
-                    } else {
-                        "debug"
-                    }
-                }
-            }
-            .to_string();
-        }
-    }
-
-    // Fallback to PROFILE env var (will be "debug" or "release")
-    match env::var("PROFILE").as_deref() {
-        Ok("release") => "release".to_string(),
-        _ => "debug".to_string(),
-    }
-}
-
-fn download_and_extract_dependencies() -> (PathBuf, PathBuf, PathBuf) {
-    // Load manifest (crate-local or workspace-level, with validation)
-    let manifest =
-        Manifest::find_and_load_validated().expect("pecos.toml not found or validation failed");
-
-    // Ensure dependencies are downloaded and extracted to ~/.pecos/deps/
-    // This persists across `cargo clean` for faster rebuilds
-    let qulacs_path = ensure_dep_ready("qulacs", &manifest).expect("Failed to get Qulacs");
-    let eigen_path = ensure_dep_ready("eigen", &manifest).expect("Failed to get Eigen");
-    let boost_path = ensure_dep_ready("boost", &manifest).expect("Failed to get Boost");
-
-    (qulacs_path, eigen_path, boost_path)
-}
-
-fn add_qulacs_source_files(build: &mut cc::Build, qulacs_src: &Path) {
-    // Core cppsim files - only add files that exist
-    let cppsim_files = vec![
-        "state.cpp",
-        "state_dm.cpp", // Added: contains state::from_ptree implementation
-        "gate.cpp",
-        "gate_factory.cpp",
-        "gate_matrix.cpp",
-        "gate_named_one.cpp",
-        "utility.cpp",
-        "circuit.cpp",
-        "qubit_info.cpp",
-        "gate_matrix_sparse.cpp",
-        "gate_matrix_diagonal.cpp",
-        "gate_merge.cpp",
-        "pauli_operator.cpp",
-        "general_quantum_operator.cpp",
-        "observable.cpp",
-        "gate_noisy_evolution.cpp",
-    ];
-
-    for file in &cppsim_files {
-        let path = qulacs_src.join("cppsim").join(file);
-        if path.exists() {
-            build.file(path);
-        } else {
-            warn!("Skipping missing file: cppsim/{file}");
-        }
-    }
-
-    // Core csim files - these are the actual files present in Qulacs 0.6.12
-    let csim_files = vec![
-        "memory_ops.cpp",
-        "stat_ops.cpp",
-        "update_ops_named.cpp",
-        "update_ops_named_X.cpp",
-        "update_ops_named_Y.cpp",
-        "update_ops_named_Z.cpp",
-        "update_ops_named_H.cpp",
-        "update_ops_named_CNOT.cpp",
-        "update_ops_named_CZ.cpp",
-        "update_ops_named_SWAP.cpp",
-        "update_ops_named_state.cpp",
-        "update_ops_matrix_dense_single.cpp",
-        "update_ops_pauli_single.cpp",
-        "stat_ops_probability.cpp",
-        "utility.cpp",
-        "init_ops_fill.cpp",
-        "init_ops_random.cpp",
-        "update_ops_matrix_dense_double.cpp",
-        "update_ops_matrix_diagonal_single.cpp",
-        "update_ops_matrix_phase_single.cpp",
-        "update_ops_matrix_dense_multi.cpp",
-        "update_ops_matrix_diagonal_multi.cpp",
-        "update_ops_pauli_multi.cpp",
-        "stat_ops_expectation_value.cpp",
-        "stat_ops_transition_amplitude.cpp",
-        "update_ops_dm.cpp",
-        "memory_ops_dm.cpp",
-        "stat_ops_dm.cpp",
-        "constant.cpp",
-        // Files that were missing but actually exist in Qulacs 0.6.12
-        "update_ops_control_single_target_single.cpp",
-        "update_ops_control_single_target_multi.cpp",
-        "update_ops_control_multi_target_single.cpp",
-        "update_ops_control_multi_target_multi.cpp",
-        "update_ops_named_FusedSWAP.cpp",
-        "update_ops_reflection.cpp",
-        "update_ops_reversible_boolean.cpp",
-        "update_ops_qft.cpp",
-        "update_ops_named_projection.cpp",
-        "update_ops_matrix_dense_double_eigen.cpp",
-        "update_ops_matrix_dense_multi_eigen.cpp",
-    ];
-
-    for file in &csim_files {
-        let path = qulacs_src.join("csim").join(file);
-        if path.exists() {
-            build.file(path);
-        } else {
-            warn!("Skipping missing file: csim/{file}");
-        }
-    }
-}
-
-fn configure_build(
-    build: &mut cc::Build,
-    eigen_path: &Path,
-    boost_path: &Path,
-    qulacs_src: &Path,
-    out_dir: &Path,
-    is_windows: bool,
-    target: &str,
-) {
-    // Include directories
-    build.include(eigen_path);
-    build.include(boost_path);
-    build.include(qulacs_src);
-    build.include(qulacs_src.join("cppsim"));
-    build.include(qulacs_src.join("csim"));
-    build.include("src");
-    build.include(out_dir);
-
-    // Configure the C++ compiler based on platform.
-    // - macOS: MUST use system clang (/usr/bin/clang++) which has proper SDK paths.
-    //   PECOS's bundled clang doesn't have macOS SDK headers configured (missing math.h, etc.)
-    //   and the cc crate will find PECOS clang first if it's in PATH.
-    // - Windows: Use MSVC (default). PECOS's bundled clang-cl is LLVM 14, but MSVC 2022's STL
-    //   requires Clang 19.0.0+ when using clang-cl, causing "STL1000: Unexpected compiler version".
-    // - Linux: Use system GCC (PECOS clang can't find system GCC headers for libstdc++)
-    // Only override if CXX/CC env vars are not already set (allow user override).
-    if env::var("CXX").is_err() && env::var("CC").is_err() && target.contains("darwin") {
-        // On macOS, explicitly use system clang to ensure SDK paths are correct.
-        // The PECOS LLVM clang may be in PATH but doesn't have SDK headers.
-        build.compiler("/usr/bin/clang++");
-    }
-    // On Windows and Linux, use the default compiler (MSVC on Windows, GCC on Linux)
-
-    // Get the build profile for optimization decisions
-    let profile = get_build_profile();
-    let is_release = profile == "release" || profile == "native";
-
-    // Set compiler flags based on platform and compiler
-    if is_windows {
-        // MSVC-specific settings
-        build.std("c++14");
-        // Define Boost exception handling for Windows
-        build.define("BOOST_NO_EXCEPTIONS", None);
-        build.define("_USE_MATH_DEFINES", None);
-        // Windows needs these for proper linking
-        build.define("_WINDOWS", None);
-        build.define("NOMINMAX", None);
-
-        // Fix MSVC compiler crash with Eigen templates
-        build.flag("/bigobj"); // Allow larger object files
-        build.flag("/EHsc"); // Enable exception handling
-        build.flag("/Z7"); // Embed debug info in .obj files (no PDB) - required for parallel builds
-
-        // Suppress warnings from external headers (Eigen, Boost, Qulacs)
-        build.flag_if_supported("/external:anglebrackets"); // Treat angle-bracket includes as external
-        build.flag_if_supported("/external:W0"); // Disable warnings for external headers
-
-        // Use optimization level based on Cargo profile
-        if is_release {
-            build.opt_level(2); // Maximize speed optimization (/O2)
-        } else {
-            build.opt_level(0); // No optimization for debug builds
-        }
-    } else {
-        build.flag_if_supported("-std=c++14");
-
-        // Use profile-based optimization settings
-        match profile.as_str() {
-            "native" => {
-                // Native profile: release optimizations + CPU-specific optimizations
-                build.flag_if_supported("-O3");
-                build.flag_if_supported("-march=native"); // CPU-specific optimizations
-            }
-            "release" => {
-                // Release profile: optimized build
-                build.flag_if_supported("-O3");
-            }
-            _ => {
-                // Dev profile: no optimization flags for fastest compile times
-            }
-        }
-        // Debug builds use cc crate's default (no optimization flags)
-
-        // Safe math optimizations (don't cause ICEs, provide modest speedup)
-        // Applied to all profiles
-        build.flag_if_supported("-fno-math-errno");
-        build.flag_if_supported("-fno-trapping-math");
-
-        // Suppress all warnings from third-party C++ code (Qulacs, Eigen, Boost)
-        build.warnings(false);
-
-        // On macOS, use libc++ (the system default and what PECOS clang expects)
-        if target.contains("darwin") {
-            build.flag("-stdlib=libc++");
-            // Note: Linker flags are passed via cargo:rustc-link-arg below, not here
-        }
-        // On Linux, use system default (libstdc++) - no flag needed
-    }
-
-    // Define preprocessor macros - only disable Eigen debug checks in release mode
-    if is_release {
-        build.define("EIGEN_NO_DEBUG", None);
-    }
-
-    // Enable SIMD-optimized gate kernels in Qulacs (matches Qulacs CMake USE_SIMD=Yes).
-    // _USE_SIMD activates hand-written SIMD intrinsics for gates like H, X, CNOT, RZ, etc.
-    // On x86/x86_64, Qulacs's type.hpp will #undef _USE_SIMD if the compiler doesn't define
-    // __AVX2__, so this is safe even when -march=native isn't used.
-    if target.contains("x86_64")
-        || target.contains("x86")
-        || target.contains("i686")
-        || target.contains("aarch64")
-    {
-        build.define("_USE_SIMD", None);
-    }
-}
-
-fn create_windows_boost_stub(out_dir: &Path) {
-    println!("cargo:rustc-link-lib=static=qulacs_wrapper");
-    // Create a simple boost exception handler stub
-    std::fs::write(
-        out_dir.join("boost_exception_stub.cpp"),
-        r#"
-        #include <exception>
-        namespace boost {
-            struct source_location {
-                const char* file_name() const { return ""; }
-                const char* function_name() const { return ""; }
-                int line() const { return 0; }
-            };
-            void throw_exception(std::exception const& e, source_location const&) {
-                throw e;
-            }
-        }
-        "#,
-    )
-    .expect("Failed to write boost exception stub");
-
-    // Compile the stub
-    cc::Build::new()
-        .cpp(true)
-        .file(out_dir.join("boost_exception_stub.cpp"))
-        .std("c++14")
-        .compile("boost_exception_stub");
-}
diff --git a/crates/pecos-qulacs/pecos.toml b/crates/pecos-qulacs/pecos.toml
deleted file mode 100644
index ea7dc6c1d..000000000
--- a/crates/pecos-qulacs/pecos.toml
+++ /dev/null
@@ -1,21 +0,0 @@
-# This file is included in the published crate package
-
-version = 1
-
-[dependencies.qulacs]
-version = "0.6.13"
-url = "https://github.com/qulacs/qulacs/archive/v0.6.13.tar.gz"
-sha256 = "9ef25a988b9f483b97ea9501554a1ce5ee23ffaf89e7ca89969f0d03fcf94af0"
-description = "Qulacs quantum simulator"
-
-[dependencies.eigen]
-version = "3.4.0"
-url = "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz"
-sha256 = "8586084f71f9bde545ee7fa6d00288b264a2b7ac3607b974e54d13e7162c1c72"
-description = "C++ linear algebra library"
-
-[dependencies.boost]
-version = "1.83.0"
-url = "https://archives.boost.io/release/1.83.0/source/boost_1_83_0.tar.bz2"
-sha256 = "6478edfe2f3305127cffe8caf73ea0176c53769f4bf1585be237eb30798c3b8e"
-description = "C++ Boost libraries"
diff --git a/crates/pecos-qulacs/src/bridge.rs b/crates/pecos-qulacs/src/bridge.rs
deleted file mode 100644
index 2f5284e18..000000000
--- a/crates/pecos-qulacs/src/bridge.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-//! CXX bridge for Qulacs C++ library bindings.
-
-#[cxx::bridge]
-pub mod ffi {
-    unsafe extern "C++" {
-        include!("qulacs_wrapper.h");
-
-        type QulacsState;
-
-        // Constructor and destructor
-        #[must_use]
-        fn create_quantum_state(num_qubits: usize) -> UniquePtr<QulacsState>;
-        fn clone_quantum_state(state: &QulacsState) -> UniquePtr<QulacsState>;
-
-        // RNG management
-        fn set_seed(state: Pin<&mut QulacsState>, seed: u32);
-
-        // State operations
-        fn reset(state: Pin<&mut QulacsState>);
-        #[allow(dead_code)]
-        fn set_zero_state(state: Pin<&mut QulacsState>);
-        fn set_computational_basis(state: Pin<&mut QulacsState>, basis: u64);
-
-        // Get state information
-        #[allow(dead_code)]
-        fn get_num_qubits(state: &QulacsState) -> usize;
-        #[allow(dead_code)]
-        fn get_squared_norm(state: &QulacsState) -> f64;
-        fn get_vector_size(state: &QulacsState) -> usize;
-        fn get_amplitude(state: &QulacsState, index: u64) -> [f64; 2];
-        fn get_marginal_probability(state: &QulacsState, qubit: usize) -> f64;
-
-        // Single-qubit gates
-        fn apply_x(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_y(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_z(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_h(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_s(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_sdag(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_t(state: Pin<&mut QulacsState>, qubit: usize);
-        fn apply_tdag(state: Pin<&mut QulacsState>, qubit: usize);
-
-        // NOTE: sqrt_x, sqrt_xdag, sqrt_y, sqrt_ydag removed - we use trait
-        // decompositions instead for consistency with StateVec.
-
-        // Rotation gates
-        fn apply_rx(state: Pin<&mut QulacsState>, qubit: usize, angle: f64);
-        fn apply_ry(state: Pin<&mut QulacsState>, qubit: usize, angle: f64);
-        fn apply_rz(state: Pin<&mut QulacsState>, qubit: usize, angle: f64);
-
-        // Global phase
-        #[allow(dead_code)]
-        fn apply_global_phase(state: Pin<&mut QulacsState>, angle: f64);
-
-        // Two-qubit gates
-        fn apply_cnot(state: Pin<&mut QulacsState>, control: usize, target: usize);
-        fn apply_cz(state: Pin<&mut QulacsState>, control: usize, target: usize);
-        fn apply_swap(state: Pin<&mut QulacsState>, qubit1: usize, qubit2: usize);
-
-        // Measurement
-        #[must_use]
-        fn measure_z(state: Pin<&mut QulacsState>, qubit: usize) -> u8;
-
-        // Direct csim-level gate functions (bypass gate object allocation)
-        fn csim_x(state: Pin<&mut QulacsState>, qubit: usize);
-        fn csim_h(state: Pin<&mut QulacsState>, qubit: usize);
-        fn csim_rz(state: Pin<&mut QulacsState>, qubit: usize, angle: f64);
-        fn csim_cnot(state: Pin<&mut QulacsState>, control: usize, target: usize);
-    }
-}
diff --git a/crates/pecos-qulacs/src/lib.rs b/crates/pecos-qulacs/src/lib.rs
deleted file mode 100644
index 0b4ed8bfa..000000000
--- a/crates/pecos-qulacs/src/lib.rs
+++ /dev/null
@@ -1,452 +0,0 @@
-// Copyright 2025 The PECOS Developers
-//
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-// in compliance with the License.You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software distributed under the License
-// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-// or implied. See the License for the specific language governing permissions and limitations under
-// the License.
-
-//! Qulacs quantum simulator bindings for PECOS.
-//!
-//! Rust bindings to the Qulacs quantum simulator C++ library,
-//! enabling high-performance quantum circuit simulation.
-
-pub mod bridge;
-
-use bridge::ffi;
-use num_complex::Complex64;
-use pecos_core::{Angle64, QubitId, RngManageable};
-use pecos_random::PecosRng;
-use pecos_simulators::{
-    ArbitraryRotationGateable, CliffordGateable, MeasurementResult, QuantumSimulator,
-};
-use rand_core::{Rng, SeedableRng};
-use std::fmt::Debug;
-
-/// A quantum state simulator using Qulacs C++ backend.
-///
-/// `QulacsStateVec` maintains the full quantum state as a complex vector with 2ⁿ amplitudes
-/// for n qubits using the high-performance Qulacs C++ library.
-///
-/// # Type Parameters
-/// * `R` - Random number generator type implementing `Rng + SeedableRng` traits
-pub struct QulacsStateVec<R = PecosRng>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    state: cxx::UniquePtr<ffi::QulacsState>,
-    num_qubits: usize,
-    rng: R,
-}
-
-// Implement Clone for QulacsStateVec
-impl<R> Clone for QulacsStateVec<R>
-where
-    R: Rng + SeedableRng + Debug + Clone,
-{
-    fn clone(&self) -> Self {
-        let mut new_rng = self.rng.clone();
-        let mut new_state = ffi::clone_quantum_state(&self.state);
-        // Seed the cloned state's C++ RNG with a new value
-        let seed = new_rng.next_u32();
-        ffi::set_seed(new_state.pin_mut(), seed);
-        Self {
-            state: new_state,
-            num_qubits: self.num_qubits,
-            rng: new_rng,
-        }
-    }
-}
-
-impl QulacsStateVec {
-    /// Create a new state initialized to |0...0⟩
-    #[inline]
-    #[must_use]
-    pub fn new(num_qubits: usize) -> QulacsStateVec<PecosRng> {
-        let rng: PecosRng = rand::make_rng();
-        QulacsStateVec::with_rng(num_qubits, rng)
-    }
-
-    /// Create a new state vector simulator with a specific seed for the random number generator
-    #[inline]
-    #[must_use]
-    pub fn with_seed(num_qubits: usize, seed: u64) -> QulacsStateVec<PecosRng> {
-        let rng = PecosRng::seed_from_u64(seed);
-        QulacsStateVec::with_rng(num_qubits, rng)
-    }
-}
-
-impl<R> QulacsStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    /// Create a new state vector with a custom random number generator.
-    #[inline]
-    #[must_use]
-    pub fn with_rng(num_qubits: usize, mut rng: R) -> Self {
-        let mut state = ffi::create_quantum_state(num_qubits);
-        // Seed the C++ RNG with a value from our Rust RNG
-        let seed = rng.next_u32();
-        ffi::set_seed(state.pin_mut(), seed);
-        Self {
-            state,
-            num_qubits,
-            rng,
-        }
-    }
-
-    /// Returns the number of qubits in the system
-    #[inline]
-    #[must_use]
-    pub fn num_qubits(&self) -> usize {
-        self.num_qubits
-    }
-
-    /// Convert PECOS qubit index to Qulacs qubit index
-    /// PECOS uses MSB-first ordering (q0 is leftmost/most significant)
-    /// Qulacs uses LSB-first ordering (q0 is rightmost/least significant)
-    #[inline]
-    fn convert_qubit_index(&self, pecos_qubit: usize) -> usize {
-        if pecos_qubit >= self.num_qubits {
-            // Return the same index to let Qulacs handle the error
-            // This prevents panic in Rust and allows proper error propagation
-            return pecos_qubit;
-        }
-        self.num_qubits
-            .saturating_sub(1)
-            .saturating_sub(pecos_qubit)
-    }
-
-    /// Convert PECOS basis state to Qulacs basis state by reversing bit order
-    #[inline]
-    fn convert_basis_state(&self, pecos_basis: usize) -> usize {
-        let mut qulacs_basis = 0;
-        for i in 0..self.num_qubits {
-            if (pecos_basis >> i) & 1 == 1 {
-                // Bit i in PECOS maps to bit (n-1-i) in Qulacs
-                qulacs_basis |= 1 << (self.num_qubits - 1 - i);
-            }
-        }
-        qulacs_basis
-    }
-
-    /// Prepare the state as a specific computational basis state
-    ///
-    /// # Panics
-    /// Panics if `basis_state` is greater than or equal to 2^n where n is the number of qubits.
-    #[inline]
-    pub fn prepare_computational_basis(&mut self, basis_state: usize) -> &mut Self {
-        assert!(basis_state < 1 << self.num_qubits);
-        let qulacs_basis = self.convert_basis_state(basis_state);
-        ffi::set_computational_basis(self.state.pin_mut(), qulacs_basis as u64);
-        self
-    }
-
-    /// Prepare all qubits in the |+⟩ state, creating an equal superposition of all basis states
-    #[inline]
-    pub fn prepare_plus_state(&mut self) -> &mut Self {
-        ffi::reset(self.state.pin_mut());
-        for i in 0..self.num_qubits {
-            self.h(&[QubitId(i)]);
-        }
-        self
-    }
-
-    /// Returns the state vector
-    #[inline]
-    #[must_use]
-    pub fn state(&self) -> Vec<Complex64> {
-        let size = ffi::get_vector_size(&self.state);
-        let mut vector = Vec::with_capacity(size);
-
-        // Since we convert qubit indices when applying gates,
-        // the state vector is already in the correct ordering for PECOS
-        // We just need to retrieve it directly
-        for idx in 0..size {
-            let amp = ffi::get_amplitude(&self.state, idx as u64);
-            vector.push(Complex64::new(amp[0], amp[1]));
-        }
-
-        vector
-    }
-
-    /// Returns the probability of measuring a specific basis state
-    ///
-    /// # Panics
-    /// Panics if `basis_state` is greater than or equal to 2^n where n is the number of qubits.
-    #[inline]
-    #[must_use]
-    pub fn probability(&self, basis_state: usize) -> f64 {
-        assert!(basis_state < 1 << self.num_qubits);
-        let qulacs_basis = self.convert_basis_state(basis_state);
-        let amp = ffi::get_amplitude(&self.state, qulacs_basis as u64);
-        amp[0] * amp[0] + amp[1] * amp[1]
-    }
-
-    /// Apply a general single-qubit unitary gate
-    #[inline]
-    pub fn single_qubit_rotation(
-        &mut self,
-        _qubit: usize,
-        _u00: Complex64,
-        _u01: Complex64,
-        _u10: Complex64,
-        _u11: Complex64,
-    ) -> &mut Self {
-        unimplemented!("QulacsStateVec::single_qubit_rotation requires C++ wrapper support")
-    }
-
-    /// Apply a general two-qubit unitary given by a 4x4 complex matrix
-    pub fn two_qubit_unitary(
-        &mut self,
-        _qubit1: usize,
-        _qubit2: usize,
-        _matrix: [[Complex64; 4]; 4],
-    ) -> &mut Self {
-        unimplemented!("QulacsStateVec::two_qubit_unitary requires C++ wrapper support")
-    }
-}
-
-// Implement QuantumSimulator trait
-impl<R> QuantumSimulator for QulacsStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn reset(&mut self) -> &mut Self {
-        ffi::reset(self.state.pin_mut());
-        self
-    }
-}
-
-// Implement CliffordGateable trait
-impl<R> CliffordGateable for QulacsStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn h(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_h(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    fn sz(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_s(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    fn cx(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q1, q2) in pairs {
-            let qulacs_q1 = self.convert_qubit_index(q1.index());
-            let qulacs_q2 = self.convert_qubit_index(q2.index());
-            ffi::apply_cnot(self.state.pin_mut(), qulacs_q1, qulacs_q2);
-        }
-        self
-    }
-
-    fn mz(&mut self, qubits: &[QubitId]) -> Vec<MeasurementResult> {
-        let mut results = Vec::with_capacity(qubits.len());
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            let prob_zero = ffi::get_marginal_probability(&self.state, qulacs_qubit);
-            let is_deterministic = prob_zero.abs() < 1e-10 || (prob_zero - 1.0).abs() < 1e-10;
-
-            // The C++ measure_z function uses its own RNG (which we've seeded)
-            // and properly collapses the state
-            let outcome_bit = ffi::measure_z(self.state.pin_mut(), qulacs_qubit);
-            let outcome = outcome_bit != 0;
-
-            results.push(MeasurementResult {
-                outcome,
-                is_deterministic,
-            });
-        }
-        results
-    }
-
-    // Override with native Qulacs implementations for better performance
-
-    fn x(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_x(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    fn y(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_y(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    fn z(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_z(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    fn szdg(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_sdag(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    // sx, sxdg, sy, sydg use trait default decompositions for consistency with StateVec
-
-    fn cz(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q1, q2) in pairs {
-            let qulacs_q1 = self.convert_qubit_index(q1.index());
-            let qulacs_q2 = self.convert_qubit_index(q2.index());
-            ffi::apply_cz(self.state.pin_mut(), qulacs_q1, qulacs_q2);
-        }
-        self
-    }
-
-    fn swap(&mut self, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        for &(q1, q2) in pairs {
-            let qulacs_q1 = self.convert_qubit_index(q1.index());
-            let qulacs_q2 = self.convert_qubit_index(q2.index());
-            ffi::apply_swap(self.state.pin_mut(), qulacs_q1, qulacs_q2);
-        }
-        self
-    }
-
-    // Override the f() gate - the default implementation in the trait has the wrong order
-    // The F gate matrix is [[1+i, 1-i], [1+i, -1+i]]/2 which equals SZ @ SX as a matrix
-    // But when applying gates sequentially, we need SX first then SZ
-    fn f(&mut self, qubits: &[QubitId]) -> &mut Self {
-        // Apply SX then SZ to get F = SZ @ SX matrix
-        // This is because applying gates sequentially means the rightmost gate is applied first
-        self.sx(qubits);
-        self.sz(qubits);
-        self
-    }
-
-    // Similarly for fdg - F† = (SZ @ SX)† = SX† @ SZ†
-    // But when applying gates sequentially, we apply SZ† first then SX†
-    fn fdg(&mut self, qubits: &[QubitId]) -> &mut Self {
-        self.szdg(qubits);
-        self.sxdg(qubits);
-        self
-    }
-}
-
-// Implement ArbitraryRotationGateable trait
-impl<R> ArbitraryRotationGateable for QulacsStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    fn rx(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_rx(self.state.pin_mut(), qulacs_qubit, theta);
-        }
-        self
-    }
-
-    fn rz(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            // Both Qulacs and PECOS StateVec use the same convention: diag(e^(-iθ/2), e^(iθ/2))
-            // No phase correction needed
-            ffi::apply_rz(self.state.pin_mut(), qulacs_qubit, theta);
-        }
-        self
-    }
-
-    fn rzz(&mut self, theta: Angle64, pairs: &[(QubitId, QubitId)]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        // RZZ(θ) = exp(-i θ/2 Z⊗Z)
-        // Decomposition: CNOT(q1,q2), RZ(θ, q2), CNOT(q1,q2)
-        for &(q1, q2) in pairs {
-            let q1_conv = self.convert_qubit_index(q1.index());
-            let q2_conv = self.convert_qubit_index(q2.index());
-            ffi::apply_cnot(self.state.pin_mut(), q1_conv, q2_conv);
-            ffi::apply_rz(self.state.pin_mut(), q2_conv, theta);
-            ffi::apply_cnot(self.state.pin_mut(), q1_conv, q2_conv);
-        }
-        self
-    }
-
-    // Override with native Qulacs implementations
-
-    fn ry(&mut self, theta: Angle64, qubits: &[QubitId]) -> &mut Self {
-        let theta = theta.to_radians_signed();
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_ry(self.state.pin_mut(), qulacs_qubit, theta);
-        }
-        self
-    }
-
-    fn t(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_t(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-
-    fn tdg(&mut self, qubits: &[QubitId]) -> &mut Self {
-        for &q in qubits {
-            let qulacs_qubit = self.convert_qubit_index(q.index());
-            ffi::apply_tdag(self.state.pin_mut(), qulacs_qubit);
-        }
-        self
-    }
-}
-
-// Implement RngManageable trait
-impl<R> RngManageable for QulacsStateVec<R>
-where
-    R: Rng + SeedableRng + Debug,
-{
-    type Rng = R;
-
-    fn rng(&self) -> &Self::Rng {
-        &self.rng
-    }
-
-    fn rng_mut(&mut self) -> &mut Self::Rng {
-        &mut self.rng
-    }
-
-    fn set_rng(&mut self, mut rng: Self::Rng) {
-        // Re-seed the C++ RNG when setting a new Rust RNG
-        let seed = rng.next_u32();
-        ffi::set_seed(self.state.pin_mut(), seed);
-        self.rng = rng;
-    }
-}
-
-// SAFETY: QulacsStateVec is Send + Sync because:
-// 1. Each QulacsState instance in C++ is completely independent (no shared global state)
-// 2. UniquePtr provides exclusive ownership
-// 3. The RNG is required to be Send + Sync
-// 4. All operations on QulacsState are self-contained
-unsafe impl<R> Send for QulacsStateVec<R> where R: Rng + SeedableRng + Debug + Send {}
-
-unsafe impl<R> Sync for QulacsStateVec<R> where R: Rng + SeedableRng + Debug + Sync {}
-
-#[cfg(test)]
-mod tests;
-
-#[cfg(test)]
-mod thread_test;
diff --git a/crates/pecos-qulacs/src/qulacs_wrapper.cpp b/crates/pecos-qulacs/src/qulacs_wrapper.cpp
deleted file mode 100644
index 946e828a3..000000000
--- a/crates/pecos-qulacs/src/qulacs_wrapper.cpp
+++ /dev/null
@@ -1,260 +0,0 @@
-#include "qulacs_wrapper.h"
-#include "cppsim/state.hpp"
-#include "cppsim/gate_factory.hpp"
-#include "csim/update_ops.hpp"
-#include <complex>
-#include <array>
-
-// Constructor and destructor
-QulacsState::QulacsState(size_t n_qubits)
-    : state(std::make_unique<QuantumStateCpu>(n_qubits)), rng_seed(0) {
-}
-
-QulacsState::~QulacsState() = default;
-
-// Factory functions
-std::unique_ptr<QulacsState> create_quantum_state(size_t n_qubits) {
-    return std::make_unique<QulacsState>(n_qubits);
-}
-
-std::unique_ptr<QulacsState> clone_quantum_state(const QulacsState& state) {
-    size_t n_qubits = state.get_state()->qubit_count;
-    auto new_state = std::make_unique<QulacsState>(n_qubits);
-
-    // Copy the quantum state using Qulacs' copy functionality
-    new_state->get_state()->load(state.get_state());
-
-    // Copy the RNG seed as well
-    new_state->set_rng_seed(state.get_rng_seed());
-
-    return new_state;
-}
-
-// State operations
-void reset(QulacsState& state) {
-    state.get_state()->set_zero_state();
-}
-
-void set_zero_state(QulacsState& state) {
-    state.get_state()->set_zero_state();
-}
-
-void set_computational_basis(QulacsState& state, uint64_t basis) {
-    state.get_state()->set_computational_basis(basis);
-}
-
-// Get state information
-size_t get_num_qubits(const QulacsState& state) {
-    return state.get_state()->qubit_count;
-}
-
-double get_squared_norm(const QulacsState& state) {
-    return state.get_state()->get_squared_norm();
-}
-
-size_t get_vector_size(const QulacsState& state) {
-    return state.get_state()->dim;
-}
-
-std::array<double, 2> get_amplitude(const QulacsState& state, uint64_t index) {
-    // Access the raw data and get the amplitude directly
-    auto* data = state.get_state()->data_cpp();
-    auto amp = data[index];
-    return {amp.real(), amp.imag()};
-}
-
-double get_marginal_probability(const QulacsState& state, size_t qubit) {
-    return state.get_state()->get_zero_probability((UINT)qubit);
-}
-
-// Single-qubit gates - using Qulacs gate functions
-void apply_x(QulacsState& state, size_t qubit) {
-    auto gate = gate::X(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_y(QulacsState& state, size_t qubit) {
-    auto gate = gate::Y(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_z(QulacsState& state, size_t qubit) {
-    auto gate = gate::Z(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_h(QulacsState& state, size_t qubit) {
-    auto gate = gate::H(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_s(QulacsState& state, size_t qubit) {
-    auto gate = gate::S(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_sdag(QulacsState& state, size_t qubit) {
-    auto gate = gate::Sdag(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_t(QulacsState& state, size_t qubit) {
-    auto gate = gate::T(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_tdag(QulacsState& state, size_t qubit) {
-    auto gate = gate::Tdag(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_sqrt_x(QulacsState& state, size_t qubit) {
-    auto gate = gate::sqrtX(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_sqrt_xdag(QulacsState& state, size_t qubit) {
-    auto gate = gate::sqrtXdag(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_sqrt_y(QulacsState& state, size_t qubit) {
-    auto gate = gate::sqrtY(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_sqrt_ydag(QulacsState& state, size_t qubit) {
-    auto gate = gate::sqrtYdag(qubit);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-// Rotation gates
-// Note: Qulacs uses opposite sign convention, so we negate the angle
-void apply_rx(QulacsState& state, size_t qubit, double angle) {
-    auto gate = gate::RX(qubit, -angle);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_ry(QulacsState& state, size_t qubit, double angle) {
-    auto gate = gate::RY(qubit, -angle);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_rz(QulacsState& state, size_t qubit, double angle) {
-    auto gate = gate::RZ(qubit, -angle);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_global_phase(QulacsState& state, double angle) {
-    // Apply a global phase e^(i*angle) to all amplitudes
-    auto* data = state.get_state()->data_cpp();
-    size_t dim = state.get_state()->dim;
-    std::complex<double> phase = std::exp(std::complex<double>(0, angle));
-
-    for (size_t i = 0; i < dim; ++i) {
-        data[i] *= phase;
-    }
-}
-
-// Two-qubit gates
-void apply_cnot(QulacsState& state, size_t control, size_t target) {
-    auto gate = gate::CNOT(control, target);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_cz(QulacsState& state, size_t control, size_t target) {
-    auto gate = gate::CZ(control, target);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-void apply_swap(QulacsState& state, size_t qubit1, size_t qubit2) {
-    auto gate = gate::SWAP(qubit1, qubit2);
-    gate->update_quantum_state(state.get_state());
-    delete gate;
-}
-
-// RNG management
-void set_seed(QulacsState& state, uint32_t seed) {
-    // Store the seed to use when sampling
-    state.set_rng_seed(seed);
-}
-
-// Measurement
-uint8_t measure_z(QulacsState& state, size_t qubit) {
-
-    // Use Qulacs' built-in sampling to get a measurement outcome
-    auto* cpu_state = dynamic_cast<QuantumStateCpu*>(state.get_state());
-    if (cpu_state) {
-        // Sample one outcome using Qulacs' sampling with our stored seed
-        // Note: We increment the seed after each use to get different results
-        uint32_t current_seed = state.get_rng_seed();
-        state.set_rng_seed(current_seed + 1);  // Increment for next measurement
-
-        auto samples = cpu_state->sampling(1, current_seed);
-        bool outcome = (samples[0] >> qubit) & 1;
-
-        // Manually collapse the state by zeroing out incompatible amplitudes
-        auto* data = cpu_state->data_cpp();
-        double norm_factor = 0.0;
-
-        // First pass: zero out incompatible amplitudes and calculate normalization
-        for (ITYPE i = 0; i < cpu_state->dim; ++i) {
-            bool state_bit = (i >> qubit) & 1;
-            if (state_bit != outcome) {
-                data[i] = CPPCTYPE(0.0, 0.0);
-            } else {
-                norm_factor += std::norm(data[i]);
-            }
-        }
-
-        // Second pass: normalize remaining amplitudes
-        if (norm_factor > 1e-15) {
-            double inv_norm = 1.0 / std::sqrt(norm_factor);
-            for (ITYPE i = 0; i < cpu_state->dim; ++i) {
-                bool state_bit = (i >> qubit) & 1;
-                if (state_bit == outcome) {
-                    data[i] *= inv_norm;
-                }
-            }
-        }
-
-        return outcome ? 1 : 0;
-    }
-
-    // Fallback: just return 0
-    return 0;
-}
-
-// Direct csim-level gate functions (bypass gate object allocation)
-void csim_x(QulacsState& state, size_t qubit) {
-    X_gate(static_cast<UINT>(qubit), state.get_state()->data_c(), state.get_state()->dim);
-}
-
-void csim_h(QulacsState& state, size_t qubit) {
-    H_gate(static_cast<UINT>(qubit), state.get_state()->data_c(), state.get_state()->dim);
-}
-
-void csim_rz(QulacsState& state, size_t qubit, double angle) {
-    RZ_gate(static_cast<UINT>(qubit), -angle, state.get_state()->data_c(), state.get_state()->dim);
-}
-
-void csim_cnot(QulacsState& state, size_t control, size_t target) {
-    CNOT_gate(static_cast<UINT>(control), static_cast<UINT>(target), state.get_state()->data_c(), state.get_state()->dim);
-}
diff --git a/crates/pecos-qulacs/src/qulacs_wrapper.h b/crates/pecos-qulacs/src/qulacs_wrapper.h
deleted file mode 100644
index acfc025e3..000000000
--- a/crates/pecos-qulacs/src/qulacs_wrapper.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#pragma once
-#include <memory>
-#include <cstdint>
-#include <array>
-
-// Forward declaration of Qulacs QuantumStateCpu
-class QuantumStateCpu;
-
-// Wrapper class for C++/Rust interop
-class QulacsState {
-private:
-    std::unique_ptr<QuantumStateCpu> state;
-    uint32_t rng_seed;  // Store seed for measurements
-
-public:
-    QulacsState(size_t n_qubits);
-    ~QulacsState();
-
-    QuantumStateCpu* get_state() { return state.get(); }
-    const QuantumStateCpu* get_state() const { return state.get(); }
-
-    void set_rng_seed(uint32_t seed) { rng_seed = seed; }
-    uint32_t get_rng_seed() const { return rng_seed; }
-};
-
-// Factory functions
-std::unique_ptr<QulacsState> create_quantum_state(size_t n_qubits);
-std::unique_ptr<QulacsState> clone_quantum_state(const QulacsState& state);
-
-// RNG management
-void set_seed(QulacsState& state, uint32_t seed);
-
-// State operations
-void reset(QulacsState& state);
-void set_zero_state(QulacsState& state);
-void set_computational_basis(QulacsState& state, uint64_t basis);
-
-// Get state information
-size_t get_num_qubits(const QulacsState& state);
-double get_squared_norm(const QulacsState& state);
-size_t get_vector_size(const QulacsState& state);
-std::array<double, 2> get_amplitude(const QulacsState& state, uint64_t index);
-double get_marginal_probability(const QulacsState& state, size_t qubit);
-
-// Single-qubit gates
-void apply_x(QulacsState& state, size_t qubit);
-void apply_y(QulacsState& state, size_t qubit);
-void apply_z(QulacsState& state, size_t qubit);
-void apply_h(QulacsState& state, size_t qubit);
-void apply_s(QulacsState& state, size_t qubit);
-void apply_sdag(QulacsState& state, size_t qubit);
-void apply_t(QulacsState& state, size_t qubit);
-void apply_tdag(QulacsState& state, size_t qubit);
-void apply_sqrt_x(QulacsState& state, size_t qubit);
-void apply_sqrt_xdag(QulacsState& state, size_t qubit);
-void apply_sqrt_y(QulacsState& state, size_t qubit);
-void apply_sqrt_ydag(QulacsState& state, size_t qubit);
-
-// Rotation gates
-void apply_rx(QulacsState& state, size_t qubit, double angle);
-void apply_ry(QulacsState& state, size_t qubit, double angle);
-void apply_rz(QulacsState& state, size_t qubit, double angle);
-
-// Global phase
-void apply_global_phase(QulacsState& state, double angle);
-
-// Two-qubit gates
-void apply_cnot(QulacsState& state, size_t control, size_t target);
-void apply_cz(QulacsState& state, size_t control, size_t target);
-void apply_swap(QulacsState& state, size_t qubit1, size_t qubit2);
-
-// Measurement
-uint8_t measure_z(QulacsState& state, size_t qubit);
-
-// Direct csim-level gate functions (bypass gate object allocation)
-void csim_x(QulacsState& state, size_t qubit);
-void csim_h(QulacsState& state, size_t qubit);
-void csim_rz(QulacsState& state, size_t qubit, double angle);
-void csim_cnot(QulacsState& state, size_t control, size_t target);
diff --git a/crates/pecos-qulacs/src/tests.rs b/crates/pecos-qulacs/src/tests.rs
deleted file mode 100644
index f52e2d6f7..000000000
--- a/crates/pecos-qulacs/src/tests.rs
+++ /dev/null
@@ -1,542 +0,0 @@
-// Copyright 2025 The PECOS Developers
-//
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-// in compliance with the License.You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software distributed under the License
-// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-// or implied. See the License for the specific language governing permissions and limitations under
-// the License.
-
-#[cfg(test)]
-mod qulacs_tests {
-    use crate::QulacsStateVec;
-    use num_complex::Complex64;
-    use pecos_core::{Angle64, QubitId, RngManageable, qid};
-    use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable, QuantumSimulator};
-    use std::f64::consts::{FRAC_1_SQRT_2, FRAC_PI_2, FRAC_PI_4, PI};
-
-    /// Helper function to check if two states are equal within tolerance
-    fn assert_states_equal(state1: &[Complex64], state2: &[Complex64], tolerance: f64) {
-        assert_eq!(
-            state1.len(),
-            state2.len(),
-            "State vectors have different lengths"
-        );
-        for (i, (a, b)) in state1.iter().zip(state2.iter()).enumerate() {
-            let diff = (a - b).norm();
-            assert!(
-                diff < tolerance,
-                "States differ at index {i}: |{a:?} - {b:?}| = {diff} >= {tolerance}"
-            );
-        }
-    }
-
-    #[test]
-    fn test_initialization() {
-        let sim = QulacsStateVec::new(3);
-        assert_eq!(sim.num_qubits(), 3);
-
-        // Check initial state is |000⟩
-        let state = sim.state();
-        assert_eq!(state.len(), 8);
-        assert!((state[0].norm() - 1.0).abs() < 1e-10);
-        for amp in &state[1..8] {
-            assert!(amp.norm() < 1e-10);
-        }
-    }
-
-    #[test]
-    fn test_bell_state() {
-        let mut sim = QulacsStateVec::new(2);
-
-        // Create Bell state |Φ+⟩ = (|00⟩ + |11⟩)/√2
-        sim.h(&qid(0));
-        sim.cx(&[(QubitId(0), QubitId(1))]);
-
-        let state = sim.state();
-        assert_eq!(state.len(), 4);
-
-        // Check amplitudes
-        assert!((state[0].norm() - FRAC_1_SQRT_2).abs() < 1e-10);
-        assert!(state[1].norm() < 1e-10);
-        assert!(state[2].norm() < 1e-10);
-        assert!((state[3].norm() - FRAC_1_SQRT_2).abs() < 1e-10);
-    }
-
-    #[test]
-    fn test_ghz_state() {
-        let mut sim = QulacsStateVec::new(3);
-
-        // Create GHZ state |GHZ⟩ = (|000⟩ + |111⟩)/√2
-        sim.h(&qid(0));
-        sim.cx(&[(QubitId(0), QubitId(1))]);
-        sim.cx(&[(QubitId(1), QubitId(2))]);
-
-        let state = sim.state();
-        assert_eq!(state.len(), 8);
-
-        // Check amplitudes
-        assert!((state[0].norm() - FRAC_1_SQRT_2).abs() < 1e-10);
-        for amp in &state[1..7] {
-            assert!(amp.norm() < 1e-10);
-        }
-        assert!((state[7].norm() - FRAC_1_SQRT_2).abs() < 1e-10);
-    }
-
-    #[test]
-    fn test_single_qubit_gates() {
-        let mut sim = QulacsStateVec::new(1);
-
-        // Test X gate: X|0⟩ = |1⟩
-        sim.x(&qid(0));
-        assert!(sim.probability(0) < 1e-10);
-        assert!((sim.probability(1) - 1.0).abs() < 1e-10);
-
-        // Test X again: X|1⟩ = |0⟩
-        sim.x(&qid(0));
-        assert!((sim.probability(0) - 1.0).abs() < 1e-10);
-        assert!(sim.probability(1) < 1e-10);
-
-        // Test Y gate
-        sim.reset();
-        sim.y(&qid(0));
-        let state = sim.state();
-        assert!(state[0].norm() < 1e-10);
-        assert!((state[1] - Complex64::new(0.0, 1.0)).norm() < 1e-10);
-
-        // Test Z gate: Z|+⟩ = |−⟩
-        sim.reset();
-        sim.h(&qid(0)); // Create |+⟩
-        sim.z(&qid(0));
-        sim.h(&qid(0)); // H|−⟩ = |1⟩
-        assert!(sim.probability(0) < 1e-10);
-        assert!((sim.probability(1) - 1.0).abs() < 1e-10);
-    }
-
-    #[test]
-    fn test_phase_gates() {
-        let mut sim = QulacsStateVec::new(1);
-
-        // Test S gate: S = √Z
-        sim.h(&qid(0)); // |+⟩
-        sim.sz(&qid(0));
-        let state = sim.state();
-        let expected_phase = Complex64::new(0.0, 1.0);
-        assert!((state[1] / state[0] - expected_phase).norm() < 1e-10);
-
-        // Test T gate: T = ⁴√Z
-        sim.reset();
-        sim.h(&qid(0));
-        sim.t(&qid(0));
-        let state = sim.state();
-        let expected_t_phase = Complex64::from_polar(1.0, PI / 4.0);
-        assert!((state[1] / state[0] - expected_t_phase).norm() < 1e-10);
-    }
-
-    #[test]
-    fn test_rotation_gates() {
-        let mut sim = QulacsStateVec::new(1);
-
-        // Test RX(π) - Qulacs may use a different phase convention
-        sim.rx(Angle64::from_radians(PI), &qid(0));
-        let state = sim.state();
-        assert!(state[0].norm() < 1e-10);
-        // Check that we're in |1⟩ state (phase may differ between implementations)
-        assert!((state[1].norm() - 1.0).abs() < 1e-10);
-
-        // Test RY(π/2) rotation
-        sim.reset();
-        sim.ry(Angle64::from_radians(FRAC_PI_2), &qid(0));
-        let state = sim.state();
-        assert!((state[0].norm() - FRAC_1_SQRT_2).abs() < 1e-10);
-        assert!((state[1].norm() - FRAC_1_SQRT_2).abs() < 1e-10);
-
-        // Test RZ(π) = -Z
-        sim.reset();
-        sim.h(&qid(0)); // Create |+⟩
-        sim.rz(Angle64::from_radians(PI), &qid(0));
-        sim.h(&qid(0)); // Should give |1⟩
-        assert!(sim.probability(0) < 1e-10);
-        assert!((sim.probability(1) - 1.0).abs() < 1e-10);
-    }
-
-    #[test]
-    fn test_two_qubit_gates() {
-        // Test CZ gate
-        let mut sim = QulacsStateVec::new(2);
-        sim.h(&qid(0));
-        sim.h(&qid(1));
-        sim.cz(&[(QubitId(0), QubitId(1))]);
-        let state = sim.state();
-        // CZ on |++⟩ gives (|00⟩ + |01⟩ + |10⟩ - |11⟩)/2
-        assert!((state[0].norm() - 0.5).abs() < 1e-10);
-        assert!((state[1].norm() - 0.5).abs() < 1e-10);
-        assert!((state[2].norm() - 0.5).abs() < 1e-10);
-        assert!((state[3].norm() - 0.5).abs() < 1e-10);
-        assert!((state[3].re + 0.5).abs() < 1e-10); // Negative phase
-
-        // Test SWAP gate
-        sim.reset();
-        sim.x(&qid(0)); // |10⟩ in quantum notation, which is state 1 in computational basis
-        let initial_state = sim.state();
-        println!("Before SWAP: {initial_state:?}");
-
-        sim.swap(&[(QubitId(0), QubitId(1))]); // Should become |01⟩
-        let final_state = sim.state();
-        println!("After SWAP: {final_state:?}");
-
-        // Check which state has probability 1
-        for i in 0..4 {
-            if sim.probability(i) > 0.5 {
-                println!("State {} has probability {}", i, sim.probability(i));
-            }
-        }
-
-        // The SWAP should work - let's be more flexible about which state we expect
-        let mut found_one_state = false;
-        for i in 0..4 {
-            if (sim.probability(i) - 1.0).abs() < 1e-10 {
-                found_one_state = true;
-                break;
-            }
-        }
-        assert!(
-            found_one_state,
-            "SWAP gate should result in exactly one basis state"
-        );
-    }
-
-    #[test]
-    fn test_computational_basis_preparation() {
-        let mut sim = QulacsStateVec::new(3);
-
-        // Test preparing |101⟩ (binary 0b101 = 5)
-        sim.prepare_computational_basis(0b101);
-        assert!((sim.probability(0b101) - 1.0).abs() < 1e-10);
-
-        // Check all other states have zero probability
-        for i in 0..8 {
-            if i != 0b101 {
-                assert!(sim.probability(i) < 1e-10);
-            }
-        }
-    }
-
-    #[test]
-    fn test_plus_state_preparation() {
-        let mut sim = QulacsStateVec::new(2);
-        sim.prepare_plus_state();
-
-        // All basis states should have equal probability
-        for i in 0..4 {
-            assert!((sim.probability(i) - 0.25).abs() < 1e-10);
-        }
-    }
-
-    #[test]
-    fn test_reset() {
-        let mut sim = QulacsStateVec::new(2);
-
-        // Create some non-trivial state
-        sim.h(&qid(0));
-        sim.cx(&[(QubitId(0), QubitId(1))]);
-
-        // Reset should return to |00⟩
-        sim.reset();
-        assert!((sim.probability(0) - 1.0).abs() < 1e-10);
-        for i in 1..4 {
-            assert!(sim.probability(i) < 1e-10);
-        }
-    }
-
-    #[test]
-    fn test_seed_determinism() {
-        // Create two simulators with the same seed
-        let mut sim1 = QulacsStateVec::with_seed(2, 42);
-        let mut sim2 = QulacsStateVec::with_seed(2, 42);
-
-        // Prepare same state
-        sim1.h(&qid(0));
-        sim2.h(&qid(0));
-
-        // Perform measurements - should get same results
-        let mut results1 = Vec::new();
-        let mut results2 = Vec::new();
-
-        for _ in 0..10 {
-            // Reset to same state each time
-            sim1.reset().h(&qid(0));
-            sim2.reset().h(&qid(0));
-
-            results1.push(sim1.mz(&qid(0))[0].outcome);
-            results2.push(sim2.mz(&qid(0))[0].outcome);
-        }
-
-        // Results should be identical
-        assert_eq!(
-            results1, results2,
-            "Same seed should produce same measurement results"
-        );
-    }
-
-    #[test]
-    fn test_different_seeds_give_different_results() {
-        let mut sim1 = QulacsStateVec::with_seed(2, 42);
-        let mut sim2 = QulacsStateVec::with_seed(2, 43);
-
-        let mut results1 = Vec::new();
-        let mut results2 = Vec::new();
-
-        // Collect measurement results
-        for _ in 0..20 {
-            sim1.reset().h(&qid(0));
-            sim2.reset().h(&qid(0));
-
-            results1.push(sim1.mz(&qid(0))[0].outcome);
-            results2.push(sim2.mz(&qid(0))[0].outcome);
-        }
-
-        // Results should be different (with very high probability)
-        assert_ne!(
-            results1, results2,
-            "Different seeds should produce different results"
-        );
-    }
-
-    #[test]
-    fn test_rng_management() {
-        use pecos_random::PecosRng;
-
-        let mut sim = QulacsStateVec::new(1);
-
-        // Set a specific RNG
-        let new_rng = PecosRng::seed_from_u64(123);
-        sim.set_rng(new_rng);
-
-        // Prepare superposition and measure
-        sim.h(&qid(0));
-        let mut results = Vec::new();
-        for _ in 0..10 {
-            sim.reset().h(&qid(0));
-            results.push(sim.mz(&qid(0))[0].outcome);
-        }
-
-        // Reset RNG with same seed - should get same results
-        let new_rng = PecosRng::seed_from_u64(123);
-        sim.set_rng(new_rng);
-
-        let mut results2 = Vec::new();
-        for _ in 0..10 {
-            sim.reset().h(&qid(0));
-            results2.push(sim.mz(&qid(0))[0].outcome);
-        }
-
-        assert_eq!(
-            results, results2,
-            "Same RNG seed should produce same results"
-        );
-    }
-
-    #[test]
-    fn test_measurement_outcome() {
-        let mut sim = QulacsStateVec::with_seed(1, 100);
-
-        // Test measurement on definite states
-        sim.reset(); // |0⟩
-        let result = sim.mz(&qid(0));
-        assert!(result[0].is_deterministic); // Should be deterministic
-        assert!(!result[0].outcome); // Should measure 0
-
-        sim.x(&qid(0)); // |1⟩
-        let result = sim.mz(&qid(0));
-        assert!(result[0].is_deterministic); // Should be deterministic
-        assert!(result[0].outcome); // Should measure 1
-
-        // Test measurement on superposition gives non-deterministic result
-        sim.reset().h(&qid(0)); // |+⟩
-
-        // Test that probabilities are correct for superposition BEFORE measurement
-        let prob_0 = sim.probability(0);
-        let prob_1 = sim.probability(1);
-        assert!((prob_0 - 0.5).abs() < 1e-10);
-        assert!((prob_1 - 0.5).abs() < 1e-10);
-
-        let result = sim.mz(&qid(0));
-        assert!(!result[0].is_deterministic); // Should be probabilistic
-    }
-
-    #[test]
-    fn test_state_normalization() {
-        let mut sim = QulacsStateVec::new(3);
-
-        // Apply various gates
-        sim.h(&qid(0));
-        sim.cx(&[(QubitId(0), QubitId(1))]);
-        sim.ry(Angle64::from_radians(FRAC_PI_4), &qid(2));
-        sim.cz(&[(QubitId(1), QubitId(2))]);
-        sim.t(&qid(0));
-
-        // Check normalization
-        let state = sim.state();
-        let norm_squared: f64 = state.iter().map(num_complex::Complex::norm_sqr).sum();
-        assert!(
-            (norm_squared - 1.0).abs() < 1e-10,
-            "State should remain normalized"
-        );
-    }
-
-    #[test]
-    fn test_gate_reversibility() {
-        let mut sim = QulacsStateVec::new(2);
-
-        // Save initial state
-        let initial = sim.state();
-
-        // Apply gates and their inverses
-        sim.h(&qid(0));
-        sim.cx(&[(QubitId(0), QubitId(1))]);
-        sim.sz(&qid(1));
-        sim.szdg(&qid(1)); // S†
-        sim.cx(&[(QubitId(0), QubitId(1))]);
-        sim.h(&qid(0));
-
-        // Should be back to initial state
-        let final_state = sim.state();
-        assert_states_equal(&initial, &final_state, 1e-10);
-    }
-
-    #[test]
-    fn test_composite_gates() {
-        let mut sim = QulacsStateVec::new(2);
-
-        // Test CY gate implementation
-        sim.prepare_computational_basis(0b10); // |10⟩
-        sim.cy(&[(QubitId(1), QubitId(0))]); // Control on qubit 1, target on qubit 0
-
-        // CY|10⟩ = i|11⟩
-        let state = sim.state();
-        assert!(state[0b00].norm() < 1e-10);
-        assert!(state[0b01].norm() < 1e-10);
-        assert!(state[0b10].norm() < 1e-10);
-        assert!((state[0b11] - Complex64::new(0.0, 1.0)).norm() < 1e-10);
-    }
-
-    #[test]
-    fn test_qubit_ordering() {
-        // Test that PECOS qubit ordering is properly handled
-        let mut sim = QulacsStateVec::new(4);
-
-        // Apply X to qubit 0 in PECOS convention (MSB)
-        // Should produce state |1000> = index 8
-        sim.x(&qid(0));
-        let state = sim.state();
-
-        // Find non-zero amplitude
-        let mut nonzero_idx = 0;
-        for (i, amp) in state.iter().enumerate() {
-            if amp.norm() > 0.5 {
-                nonzero_idx = i;
-                break;
-            }
-        }
-
-        assert_eq!(
-            nonzero_idx, 8,
-            "X on qubit 0 should produce state |1000> (index 8)"
-        );
-
-        // Reset and test qubit 2
-        sim.reset();
-        sim.x(&qid(2));
-        let state = sim.state();
-
-        let mut nonzero_idx = 0;
-        for (i, amp) in state.iter().enumerate() {
-            if amp.norm() > 0.5 {
-                nonzero_idx = i;
-                break;
-            }
-        }
-
-        assert_eq!(
-            nonzero_idx, 2,
-            "X on qubit 2 should produce state |0010> (index 2)"
-        );
-    }
-
-    #[test]
-    fn test_measurement_statistics() {
-        let mut sim = QulacsStateVec::with_seed(1, 42);
-
-        // Prepare |+⟩ state
-        sim.h(&qid(0));
-
-        // Measure many times and check statistics
-        let n_trials = 1000;
-        let mut count_zero = 0;
-
-        for _ in 0..n_trials {
-            sim.reset().h(&qid(0));
-            if !sim.mz(&qid(0))[0].outcome {
-                count_zero += 1;
-            }
-        }
-
-        // Should be approximately 50/50
-        let ratio = f64::from(count_zero) / f64::from(n_trials);
-        assert!(
-            (ratio - 0.5).abs() < 0.05,
-            "Measurement statistics should be ~50/50 for |+⟩ state"
-        );
-    }
-
-    #[test]
-    fn test_measurement_collapse() {
-        // Test that measurement properly collapses the quantum state
-        let mut sim = QulacsStateVec::with_seed(1, 42);
-
-        // Initial state should be |0⟩
-        let initial_vector = sim.state();
-        assert!((initial_vector[0] - Complex64::new(1.0, 0.0)).norm() < 1e-10);
-        assert!(initial_vector[1].norm() < 1e-10);
-
-        // Apply H gate to create superposition
-        sim.h(&qid(0));
-        let superposition_vector = sim.state();
-        let expected_amp = 1.0 / 2.0_f64.sqrt();
-        assert!((superposition_vector[0].re - expected_amp).abs() < 1e-10);
-        assert!((superposition_vector[1].re - expected_amp).abs() < 1e-10);
-
-        // Measure - should collapse to either |0⟩ or |1⟩
-        let result = sim.mz(&qid(0));
-        let final_vector = sim.state();
-
-        println!("Measurement outcome: {}", result[0].outcome);
-        println!("Final state vector: {final_vector:?}");
-
-        if result[0].outcome {
-            // Should collapse to |1⟩
-            assert!(
-                final_vector[0].norm() < 1e-10,
-                "After measuring |1⟩, amplitude of |0⟩ should be 0"
-            );
-            assert!(
-                (final_vector[1] - Complex64::new(1.0, 0.0)).norm() < 1e-10,
-                "After measuring |1⟩, amplitude of |1⟩ should be 1"
-            );
-        } else {
-            // Should collapse to |0⟩
-            assert!(
-                (final_vector[0] - Complex64::new(1.0, 0.0)).norm() < 1e-10,
-                "After measuring |0⟩, amplitude of |0⟩ should be 1"
-            );
-            assert!(
-                final_vector[1].norm() < 1e-10,
-                "After measuring |0⟩, amplitude of |1⟩ should be 0"
-            );
-        }
-    }
-}
diff --git a/crates/pecos-qulacs/src/thread_test.rs b/crates/pecos-qulacs/src/thread_test.rs
deleted file mode 100644
index a8bc1f46c..000000000
--- a/crates/pecos-qulacs/src/thread_test.rs
+++ /dev/null
@@ -1,154 +0,0 @@
-// Test to verify QulacsStateVec is Send + Sync and works in multi-threaded contexts
-
-#[cfg(test)]
-mod thread_safety_tests {
-    use crate::QulacsStateVec;
-    use pecos_core::{QubitId, RngManageable, qid};
-    use pecos_random::PecosRng;
-    use pecos_simulators::{CliffordGateable, QuantumSimulator};
-    use std::sync::{Arc, Mutex};
-    use std::thread;
-
-    #[test]
-    fn test_send_sync_traits() {
-        fn assert_send<T: Send>() {}
-        fn assert_sync<T: Sync>() {}
-
-        assert_send::<QulacsStateVec>();
-        assert_sync::<QulacsStateVec>();
-    }
-
-    #[test]
-    fn test_clone_and_thread_independence() {
-        // Create a template simulator
-        let template_sim = QulacsStateVec::with_seed(2, 42);
-
-        // Clone it for multiple threads
-        let sim1 = template_sim.clone();
-        let sim2 = template_sim.clone();
-        let sim3 = template_sim.clone();
-
-        // Store results from each thread
-        let results = Arc::new(Mutex::new(Vec::new()));
-        let results1 = Arc::clone(&results);
-        let results2 = Arc::clone(&results);
-        let results3 = Arc::clone(&results);
-
-        // Spawn threads that work on independent simulators
-        let handle1 = thread::spawn(move || {
-            let mut sim = sim1;
-            sim.h(&qid(0));
-            sim.cx(&[(QubitId(0), QubitId(1))]);
-            let state = sim.state();
-            results1
-                .lock()
-                .unwrap()
-                .push(("thread1", state[0], state[3]));
-        });
-
-        let handle2 = thread::spawn(move || {
-            let mut sim = sim2;
-            sim.x(&qid(0));
-            sim.h(&qid(1));
-            let state = sim.state();
-            results2
-                .lock()
-                .unwrap()
-                .push(("thread2", state[1], state[3]));
-        });
-
-        let handle3 = thread::spawn(move || {
-            let mut sim = sim3;
-            sim.h(&qid(0));
-            sim.h(&qid(1));
-            let state = sim.state();
-            results3
-                .lock()
-                .unwrap()
-                .push(("thread3", state[0], state[3]));
-        });
-
-        // Wait for all threads to complete
-        handle1.join().unwrap();
-        handle2.join().unwrap();
-        handle3.join().unwrap();
-
-        // Verify we got results from all threads
-        let final_results = results.lock().unwrap();
-        assert_eq!(final_results.len(), 3);
-
-        // Each thread should have produced different results
-        println!("Thread results: {:?}", *final_results);
-
-        // Check that each thread worked independently
-        for (name, _, _) in final_results.iter() {
-            println!("Got result from {name}");
-        }
-    }
-
-    #[test]
-    #[allow(clippy::cast_precision_loss)]
-    fn test_concurrent_monte_carlo_simulation() {
-        const NUM_THREADS: usize = 4;
-        const TRIALS_PER_THREAD: usize = 100;
-
-        // Template simulator for Monte Carlo
-        let template = QulacsStateVec::with_seed(1, 123);
-
-        let handles: Vec<_> = (0..NUM_THREADS)
-            .map(|thread_id| {
-                let mut sim = template.clone();
-                // Give each thread a different seed to avoid correlation
-                sim.set_rng(PecosRng::seed_from_u64(123 + thread_id as u64 * 1000));
-
-                thread::spawn(move || {
-                    let mut measurement_results = Vec::new();
-
-                    for _trial in 0..TRIALS_PER_THREAD {
-                        sim.reset();
-                        sim.h(&qid(0));
-                        let result = sim.mz(&qid(0));
-                        measurement_results.push(result[0].outcome);
-                    }
-
-                    // Return thread ID and measurement statistics
-                    let ones_count = measurement_results.iter().filter(|&&x| x).count();
-                    (thread_id, ones_count, TRIALS_PER_THREAD)
-                })
-            })
-            .collect();
-
-        // Collect results from all threads
-        let mut total_ones = 0;
-        let mut total_trials = 0;
-
-        for handle in handles {
-            let (thread_id, ones_count, trials) = handle.join().unwrap();
-            println!(
-                "Thread {}: {} ones out of {} trials ({:.1}%)",
-                thread_id,
-                ones_count,
-                trials,
-                (ones_count as f64 / trials as f64) * 100.0
-            );
-            total_ones += ones_count;
-            total_trials += trials;
-        }
-
-        // Overall statistics should be roughly 50/50 for |+⟩ measurements
-        let overall_ratio = total_ones as f64 / total_trials as f64;
-        println!(
-            "Overall: {} ones out of {} trials ({:.1}%)",
-            total_ones,
-            total_trials,
-            overall_ratio * 100.0
-        );
-
-        // Should be approximately 50% (allowing some variance)
-        assert!(
-            (overall_ratio - 0.5).abs() < 0.1,
-            "Expected ~50% measurement outcomes, got {:.1}%",
-            overall_ratio * 100.0
-        );
-    }
-}
diff --git a/crates/pecos-simulators/Cargo.toml b/crates/pecos-simulators/Cargo.toml
index 0643fcc46..794ce21b9 100644
--- a/crates/pecos-simulators/Cargo.toml
+++ b/crates/pecos-simulators/Cargo.toml
@@ -36,7 +36,6 @@ rayon = { version = "1.10", optional = true }
 
 [dev-dependencies]
 rand.workspace = true
-pecos-quest.workspace = true
 paste.workspace = true
 
 [lints]
diff --git a/crates/pecos-simulators/src/density_matrix.rs b/crates/pecos-simulators/src/density_matrix.rs
index 129e71c5b..1b0008f4b 100644
--- a/crates/pecos-simulators/src/density_matrix.rs
+++ b/crates/pecos-simulators/src/density_matrix.rs
@@ -634,67 +634,46 @@ where
     /// * `&mut Self` - Returns self for method chaining
     #[inline]
     pub fn apply_amplitude_damping(&mut self, qubit: usize, gamma: f64) -> &mut Self {
-        // Ensure gamma is in valid range
         let gamma = gamma.clamp(0.0, 1.0);
-
         if gamma < f64::EPSILON {
-            // No damping, return unchanged
             return self;
         }
 
-        // Amplitude damping channel can be implemented using the Kraus operators:
-        // E_0 = |0⟩⟨0| + sqrt(1 - gamma) |1⟩⟨1|
-        // E_1 = sqrt(gamma) |0⟩⟨1|
-
-        // Get the current state vector values
+        // Amplitude damping via Kraus ops
+        //   E_0 = |0><0| + sqrt(1-g)|1><1|,   E_1 = sqrt(g)|0><1|
+        // gives the density-matrix transformation
+        //   rho_{a,b} -> E(rho)_{a,b} =
+        //     (a,b both bit_q=0): rho_{a,b} + g * rho_{a|q, b|q}
+        //     (one of a,b bit_q=1): sqrt(1-g) * rho_{a,b}
+        //     (both bit_q=1):     (1-g) * rho_{a,b}
+        //
+        // Apply on the density matrix, then Cholesky-re-purify the Choi state.
+        // This preserves the invariant that `probability()` reads rho_{k,k} as
+        // sum_i |psi[(k<<n)|i]|^2 -- the direct-Choi shortcut used previously
+        // broke that identity for partial damping.
         let n = self.num_physical_qubits;
-        let original_sv = self.state_vector.state();
-
-        // Reset state first
-        let sv_size = 1 << (2 * n);
-        let mut new_state = vec![num_complex::Complex64::new(0.0, 0.0); sv_size];
-
-        // We need to apply each Kraus operator to the state
-        // We'll implement the amplitude damping channel by its action on the density matrix elements
+        let dim = 1usize << n;
+        let qubit_mask = 1usize << qubit;
 
-        let qubit_mask = 1 << qubit;
-
-        // Iterate through all basis states
-        for i in 0..(1 << n) {
-            for j in 0..(1 << n) {
-                // Find the corresponding index in the state vector
-                let idx_i_j = (i << n) | j;
-
-                // Check if the qubit is in state |1⟩ in basis state i and j
-                let i_has_1 = (i & qubit_mask) != 0;
-                let j_has_1 = (j & qubit_mask) != 0;
-
-                // Calculate the modified state
-                if i_has_1 && j_has_1 {
-                    // Case |1⟩⟨1| -> (1-gamma)|1⟩⟨1| + gamma|0⟩⟨0|
-                    let i_with_0 = i & !qubit_mask; // Flip the qubit to 0
-                    let j_with_0 = j & !qubit_mask;
-
-                    // Apply damping
-                    new_state[idx_i_j] += (1.0 - gamma) * original_sv[idx_i_j];
-                    new_state[(i_with_0 << n) | j_with_0] += gamma * original_sv[idx_i_j];
-                } else if i_has_1 && !j_has_1 {
-                    // Case |1⟩⟨0| -> sqrt(1-gamma)|1⟩⟨0|
-                    new_state[idx_i_j] += (1.0 - gamma).sqrt() * original_sv[idx_i_j];
-                } else if !i_has_1 && j_has_1 {
-                    // Case |0⟩⟨1| -> sqrt(1-gamma)|0⟩⟨1|
-                    new_state[idx_i_j] += (1.0 - gamma).sqrt() * original_sv[idx_i_j];
-                } else {
-                    // Case |0⟩⟨0| -> |0⟩⟨0| + damping from |1⟩ states (added above)
-                    new_state[idx_i_j] += original_sv[idx_i_j];
-                }
+        let rho = self.get_density_matrix();
+        let mut new_rho = vec![vec![Complex64::new(0.0, 0.0); dim]; dim];
+        let sqrt_1mg = (1.0 - gamma).sqrt();
+        for i in 0..dim {
+            let i1 = (i & qubit_mask) != 0;
+            for j in 0..dim {
+                let j1 = (j & qubit_mask) != 0;
+                new_rho[i][j] = match (i1, j1) {
+                    (false, false) => {
+                        let ii = i | qubit_mask;
+                        let jj = j | qubit_mask;
+                        rho[i][j] + gamma * rho[ii][jj]
+                    }
+                    (true, true) => (1.0 - gamma) * rho[i][j],
+                    _ => sqrt_1mg * rho[i][j],
+                };
             }
         }
-
-        // Update the state vector
-        let new_sv = StateVec::from_state(&new_state, self.state_vector.rng().clone());
-        *self.state_vector_mut() = new_sv;
-
+        self.set_from_density_matrix(&new_rho);
         self
     }
 
@@ -712,55 +691,39 @@ where
     /// * `&mut Self` - Returns self for method chaining
     #[inline]
     pub fn apply_phase_damping(&mut self, qubit: usize, lambda: f64) -> &mut Self {
-        // Ensure lambda is in valid range
         let lambda = lambda.clamp(0.0, 1.0);
-
         if lambda < f64::EPSILON {
-            // No damping, return unchanged
             return self;
         }
 
-        // Phase damping channel can be implemented using the Kraus operators:
-        // E_0 = |0⟩⟨0| + sqrt(1 - lambda) |1⟩⟨1|
-        // E_1 = sqrt(lambda) |1⟩⟨1|
-
-        // Get the current state vector values
+        // Phase damping via Kraus ops
+        //   E_0 = |0><0| + sqrt(1-l)|1><1|,   E_1 = sqrt(l)|1><1|
+        // gives
+        //   rho_{a,b} unchanged when bit_q(a) == bit_q(b)
+        //   rho_{a,b} -> sqrt(1-l) * rho_{a,b} when they differ
+        // (for bit_q = 1 the E_0 and E_1 weights, (1-l) and l, sum to 1, so it is preserved).
+        //
+        // Apply on the density matrix, then Cholesky-re-purify so
+        // `probability()` / `purity()` stay consistent with the Choi state.
         let n = self.num_physical_qubits;
-        let original_sv = self.state_vector.state();
-
-        // Reset state first
-        let sv_size = 1 << (2 * n);
-        let mut new_state = vec![num_complex::Complex64::new(0.0, 0.0); sv_size];
-
-        // We need to apply each Kraus operator to the state
-        // Phase damping channel keeps diagonal elements constant,
-        // but reduces off-diagonal elements
-
-        let qubit_mask = 1 << qubit;
-
-        // Iterate through all basis states
-        for i in 0..(1 << n) {
-            for j in 0..(1 << n) {
-                // Find the corresponding index in the state vector
-                let idx_i_j = (i << n) | j;
-
-                // Check if the qubit is in different states in i and j
-                let i_has_1 = (i & qubit_mask) != 0;
-                let j_has_1 = (j & qubit_mask) != 0;
+        let dim = 1usize << n;
+        let qubit_mask = 1usize << qubit;
 
-                if i_has_1 == j_has_1 {
-                    // Diagonal elements are preserved
-                    new_state[idx_i_j] += original_sv[idx_i_j];
+        let rho = self.get_density_matrix();
+        let mut new_rho = vec![vec![Complex64::new(0.0, 0.0); dim]; dim];
+        let sqrt_1ml = (1.0 - lambda).sqrt();
+        for i in 0..dim {
+            let i1 = (i & qubit_mask) != 0;
+            for j in 0..dim {
+                let j1 = (j & qubit_mask) != 0;
+                new_rho[i][j] = if i1 == j1 {
+                    rho[i][j]
                 } else {
-                    // Off-diagonal elements involving the qubit get damped
-                    new_state[idx_i_j] += (1.0 - lambda).sqrt() * original_sv[idx_i_j];
-                }
+                    sqrt_1ml * rho[i][j]
+                };
             }
         }
-
-        // Update the state vector
-        let new_sv = StateVec::from_state(&new_state, self.state_vector.rng().clone());
-        *self.state_vector_mut() = new_sv;
+        self.set_from_density_matrix(&new_rho);
 
         self
     }
diff --git a/crates/pecos-simulators/src/state_vec_soa.rs b/crates/pecos-simulators/src/state_vec_soa.rs
index 0ea3b6043..8d284b329 100644
--- a/crates/pecos-simulators/src/state_vec_soa.rs
+++ b/crates/pecos-simulators/src/state_vec_soa.rs
@@ -48,6 +48,13 @@ unsafe impl Send for SendPtr {}
 #[cfg(feature = "parallel")]
 unsafe impl Sync for SendPtr {}
 
+#[cfg(feature = "parallel")]
+#[derive(Clone, Copy)]
+enum RxxRyyKind {
+    Rxx,
+    Ryy,
+}
+
 // =============================================================================
 // Gate Fusion Support
 // =============================================================================
@@ -579,8 +586,12 @@ where
 
     /// Minimum number of qubits for parallel execution to be beneficial.
     /// Below this threshold, parallelism overhead exceeds benefits.
+    /// Empirical: at N=14-18 (state fits in L2/L3), rayon dispatch cost dominates
+    /// per-gate work and fusion+parallel was 1.3-4.6x slower than fusion alone.
+    /// Parallel is only a net win at N>=21, where the state (>=32MB) overflows cache
+    /// and memory bandwidth becomes the bottleneck (RTX 4090 host, 2026-04-11).
     #[cfg(feature = "parallel")]
-    const PARALLEL_THRESHOLD_QUBITS: usize = 14;
+    const PARALLEL_THRESHOLD_QUBITS: usize = 21;
 
     /// Set the number of threads for parallel execution.
     ///
@@ -1097,6 +1108,303 @@ where
         }
     }
 
+    /// Parallel RXX/RYY, one task per `2 * step_hi` block; `kind` only flips the |00⟩<->|11⟩ sign.
+    #[cfg(feature = "parallel")]
+    fn rxx_ryy_parallel(
+        &mut self,
+        step_lo: usize,
+        step_hi: usize,
+        cos: f64,
+        sin: f64,
+        kind: RxxRyyKind,
+    ) {
+        let n = self.real.len();
+        let outer_stride = step_hi * 2;
+        let num_blocks = n / outer_stride;
+
+        // RXX: |00⟩<->|11⟩ coupling has sign -i (s_0011 = +1, giving "+sin*m11")
+        // RYY: |00⟩<->|11⟩ coupling has sign +i (s_0011 = -1, giving "-sin*m11")
+        let s_0011 = match kind {
+            RxxRyyKind::Rxx => 1.0,
+            RxxRyyKind::Ryy => -1.0,
+        };
+
+        let real_ptr = SendPtr(self.real.as_mut_ptr());
+        let imag_ptr = SendPtr(self.imag.as_mut_ptr());
+
+        let work = || {
+            (0..num_blocks).into_par_iter().for_each(|block_idx| {
+                let outer = block_idx * outer_stride;
+                let rp = real_ptr.ptr();
+                let ip = imag_ptr.ptr();
+
+                for mid in (0..step_hi).step_by(step_lo * 2) {
+                    for inner_idx in 0..step_lo {
+                        let base = outer + mid + inner_idx;
+                        let i00 = base;
+                        let i01 = base + step_lo;
+                        let i10 = base + step_hi;
+                        let i11 = base + step_hi + step_lo;
+
+                        // SAFETY: all four indices lie within [outer, outer+outer_stride),
+                        // a range that no other block_idx touches.
+                        unsafe {
+                            let r00 = *rp.add(i00);
+                            let m00 = *ip.add(i00);
+                            let r01 = *rp.add(i01);
+                            let m01 = *ip.add(i01);
+                            let r10 = *rp.add(i10);
+                            let m10 = *ip.add(i10);
+                            let r11 = *rp.add(i11);
+                            let m11 = *ip.add(i11);
+
+                            *rp.add(i00) = cos * r00 + s_0011 * sin * m11;
+                            *ip.add(i00) = cos * m00 - s_0011 * sin * r11;
+                            *rp.add(i01) = cos * r01 + sin * m10;
+                            *ip.add(i01) = cos * m01 - sin * r10;
+                            *rp.add(i10) = sin * m01 + cos * r10;
+                            *ip.add(i10) = -sin * r01 + cos * m10;
+                            *rp.add(i11) = s_0011 * sin * m00 + cos * r11;
+                            *ip.add(i11) = -s_0011 * sin * r00 + cos * m11;
+                        }
+                    }
+                }
+            });
+        };
+
+        if let Some(num_threads) = self.num_threads {
+            let pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(num_threads)
+                .build()
+                .expect("Failed to build thread pool");
+            pool.install(work);
+        } else {
+            work();
+        }
+    }
+
+    /// Parallel SIMD RZZ: phase-rotate every 4-amp chunk independently.
+    /// Precondition: `q_lo` >= 2 (so 4-amp chunks share the same (`bit_q1`, `bit_q2`)).
+    #[cfg(feature = "parallel")]
+    fn rzz_parallel(
+        &mut self,
+        q1: usize,
+        q2: usize,
+        cos_pos: f64,
+        sin_pos: f64,
+        cos_neg: f64,
+        sin_neg: f64,
+    ) {
+        let n = self.real.len();
+        let num_chunks = n / 4;
+
+        let real_ptr = SendPtr(self.real.as_mut_ptr());
+        let imag_ptr = SendPtr(self.imag.as_mut_ptr());
+
+        let work = || {
+            (0..num_chunks).into_par_iter().for_each(|chunk| {
+                let i = chunk * 4;
+                let bit1 = (i >> q1) & 1;
+                let bit2 = (i >> q2) & 1;
+                let (cos, sin) = if bit1 == bit2 {
+                    (cos_neg, sin_neg)
+                } else {
+                    (cos_pos, sin_pos)
+                };
+                let cos_v = f64x4::splat(cos);
+                let sin_v = f64x4::splat(sin);
+                let rp = real_ptr.ptr();
+                let ip = imag_ptr.ptr();
+
+                // SAFETY: chunks are non-overlapping 4-amp ranges.
+                unsafe {
+                    let re = f64x4::from(std::slice::from_raw_parts(rp.add(i), 4));
+                    let im = f64x4::from(std::slice::from_raw_parts(ip.add(i), 4));
+                    let new_re: [f64; 4] = (cos_v * re - sin_v * im).into();
+                    let new_im: [f64; 4] = (sin_v * re + cos_v * im).into();
+                    std::ptr::copy_nonoverlapping(new_re.as_ptr(), rp.add(i), 4);
+                    std::ptr::copy_nonoverlapping(new_im.as_ptr(), ip.add(i), 4);
+                }
+            });
+        };
+
+        if let Some(num_threads) = self.num_threads {
+            let pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(num_threads)
+                .build()
+                .expect("Failed to build thread pool");
+            pool.install(work);
+        } else {
+            work();
+        }
+    }
+
+    /// Parallel SIMD CZ: negate amplitudes at `mask_11` across outer blocks.
+    #[cfg(feature = "parallel")]
+    fn cz_parallel(&mut self, mask_11: usize, step_lo: usize, step_hi: usize) {
+        let n = self.real.len();
+        let outer_stride = step_hi * 2;
+        let num_blocks = n / outer_stride;
+
+        let real_ptr = SendPtr(self.real.as_mut_ptr());
+        let imag_ptr = SendPtr(self.imag.as_mut_ptr());
+
+        let work = || {
+            (0..num_blocks).into_par_iter().for_each(|block_idx| {
+                let i_hi = block_idx * outer_stride;
+                let rp = real_ptr.ptr();
+                let ip = imag_ptr.ptr();
+
+                for i_lo in (i_hi..i_hi + step_hi).step_by(step_lo * 2) {
+                    let mut offset = 0;
+                    while offset + 4 <= step_lo {
+                        let idx = (i_lo + offset) | mask_11;
+                        // SAFETY: blocks are disjoint; idx lies in this block.
+                        unsafe {
+                            let re = f64x4::from(std::slice::from_raw_parts(rp.add(idx), 4));
+                            let im = f64x4::from(std::slice::from_raw_parts(ip.add(idx), 4));
+                            let neg_re: [f64; 4] = (-re).into();
+                            let neg_im: [f64; 4] = (-im).into();
+                            std::ptr::copy_nonoverlapping(neg_re.as_ptr(), rp.add(idx), 4);
+                            std::ptr::copy_nonoverlapping(neg_im.as_ptr(), ip.add(idx), 4);
+                        }
+                        offset += 4;
+                    }
+                }
+            });
+        };
+
+        if let Some(num_threads) = self.num_threads {
+            let pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(num_threads)
+                .build()
+                .expect("Failed to build thread pool");
+            pool.install(work);
+        } else {
+            work();
+        }
+    }
+
+    /// Parallel scalar CX for any pair with small `step_lo` (< 4). Same outer-block
+    /// disjoint access pattern, scalar swap inside. Covers e.g. CX(0,1), CX(1,2) at large N.
+    #[cfg(feature = "parallel")]
+    fn cx_parallel_scalar(
+        &mut self,
+        control_mask: usize,
+        target_mask: usize,
+        step_lo: usize,
+        step_hi: usize,
+    ) {
+        let n = self.real.len();
+        let outer_stride = step_hi * 2;
+        let num_blocks = n / outer_stride;
+
+        let real_ptr = SendPtr(self.real.as_mut_ptr());
+        let imag_ptr = SendPtr(self.imag.as_mut_ptr());
+
+        let work = || {
+            (0..num_blocks).into_par_iter().for_each(|block_idx| {
+                let i_hi = block_idx * outer_stride;
+                let rp = real_ptr.ptr();
+                let ip = imag_ptr.ptr();
+
+                for i_lo in (i_hi..i_hi + step_hi).step_by(step_lo * 2) {
+                    for offset in 0..step_lo {
+                        let base = i_lo + offset;
+                        let a = base | control_mask;
+                        let b = a | target_mask;
+                        // SAFETY: both a and b lie within [i_hi, i_hi+outer_stride).
+                        unsafe {
+                            let ra = *rp.add(a);
+                            let rb = *rp.add(b);
+                            *rp.add(a) = rb;
+                            *rp.add(b) = ra;
+                            let ia = *ip.add(a);
+                            let ib = *ip.add(b);
+                            *ip.add(a) = ib;
+                            *ip.add(b) = ia;
+                        }
+                    }
+                }
+            });
+        };
+
+        if let Some(num_threads) = self.num_threads {
+            let pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(num_threads)
+                .build()
+                .expect("Failed to build thread pool");
+            pool.install(work);
+        } else {
+            work();
+        }
+    }
+
+    /// Parallel SIMD CX: swap amplitudes where control=1, across outer blocks of
+    /// size `step_hi * 2`. Each block is written disjointly, so blocks run independently.
+    #[cfg(feature = "parallel")]
+    fn cx_parallel(
+        &mut self,
+        control_mask: usize,
+        target_mask: usize,
+        step_lo: usize,
+        step_hi: usize,
+    ) {
+        let n = self.real.len();
+        let outer_stride = step_hi * 2;
+        let num_blocks = n / outer_stride;
+
+        let real_ptr = SendPtr(self.real.as_mut_ptr());
+        let imag_ptr = SendPtr(self.imag.as_mut_ptr());
+
+        let work = || {
+            (0..num_blocks).into_par_iter().for_each(|block_idx| {
+                let i_hi = block_idx * outer_stride;
+                let rp = real_ptr.ptr();
+                let ip = imag_ptr.ptr();
+
+                for i_lo in (i_hi..i_hi + step_hi).step_by(step_lo * 2) {
+                    let mut offset = 0;
+                    while offset + 4 <= step_lo {
+                        let base = i_lo + offset;
+                        let idx0 = base | control_mask;
+                        let idx1 = idx0 | target_mask;
+
+                        // SAFETY: block_idx * outer_stride .. (block_idx+1) * outer_stride
+                        // is disjoint across blocks; idx0/idx1 lie within this block.
+                        unsafe {
+                            let re0 = f64x4::from(std::slice::from_raw_parts(rp.add(idx0), 4));
+                            let im0 = f64x4::from(std::slice::from_raw_parts(ip.add(idx0), 4));
+                            let re1 = f64x4::from(std::slice::from_raw_parts(rp.add(idx1), 4));
+                            let im1 = f64x4::from(std::slice::from_raw_parts(ip.add(idx1), 4));
+
+                            let arr_re0: [f64; 4] = re1.into();
+                            let arr_im0: [f64; 4] = im1.into();
+                            let arr_re1: [f64; 4] = re0.into();
+                            let arr_im1: [f64; 4] = im0.into();
+
+                            std::ptr::copy_nonoverlapping(arr_re0.as_ptr(), rp.add(idx0), 4);
+                            std::ptr::copy_nonoverlapping(arr_im0.as_ptr(), ip.add(idx0), 4);
+                            std::ptr::copy_nonoverlapping(arr_re1.as_ptr(), rp.add(idx1), 4);
+                            std::ptr::copy_nonoverlapping(arr_im1.as_ptr(), ip.add(idx1), 4);
+                        }
+                        offset += 4;
+                    }
+                }
+            });
+        };
+
+        if let Some(num_threads) = self.num_threads {
+            let pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(num_threads)
+                .build()
+                .expect("Failed to build thread pool");
+            pool.install(work);
+        } else {
+            work();
+        }
+    }
+
     // =========================================================================
     // Specialized Gate Implementations (used when fusion is disabled)
     // =========================================================================
@@ -2869,6 +3177,19 @@ where
             let target_mask = 1 << target;
 
             // When q_lo >= 2, indices are contiguous and we can use SIMD
+            #[cfg(feature = "parallel")]
+            if self.parallel_enabled
+                && self.num_qubits >= Self::PARALLEL_THRESHOLD_QUBITS
+                && n / (step_hi * 2) >= 4
+            {
+                if step_lo >= 4 {
+                    self.cx_parallel(control_mask, target_mask, step_lo, step_hi);
+                } else {
+                    self.cx_parallel_scalar(control_mask, target_mask, step_lo, step_hi);
+                }
+                continue;
+            }
+
             if step_lo >= 4 {
                 for i_hi in (0..n).step_by(step_hi * 2) {
                     for i_lo in (i_hi..i_hi + step_hi).step_by(step_lo * 2) {
@@ -2933,6 +3254,16 @@ where
             let step_hi = 1 << q_hi;
             let mask_11 = (1 << q1) | (1 << q2);
 
+            #[cfg(feature = "parallel")]
+            if self.parallel_enabled
+                && self.num_qubits >= Self::PARALLEL_THRESHOLD_QUBITS
+                && step_lo >= 4
+                && n / (step_hi * 2) >= 4
+            {
+                self.cz_parallel(mask_11, step_lo, step_hi);
+                continue;
+            }
+
             // When q_lo >= 2, indices are contiguous and we can use SIMD
             if step_lo >= 4 {
                 for i_hi in (0..n).step_by(step_hi * 2) {
@@ -4307,6 +4638,15 @@ where
 
             let q_lo = q1.min(q2);
 
+            #[cfg(feature = "parallel")]
+            if self.parallel_enabled
+                && self.num_qubits >= Self::PARALLEL_THRESHOLD_QUBITS
+                && q_lo >= 2
+            {
+                self.rzz_parallel(q1, q2, cos_pos, sin_pos, cos_neg, sin_neg);
+                continue;
+            }
+
             // When both qubits >= 2, consecutive indices share the same phase
             if q_lo >= 2 {
                 let n = self.real.len();
@@ -4367,6 +4707,15 @@ where
             let step_lo = 1 << lo;
             let step_hi = 1 << hi;
 
+            #[cfg(feature = "parallel")]
+            if self.parallel_enabled
+                && self.num_qubits >= Self::PARALLEL_THRESHOLD_QUBITS
+                && self.real.len() / (step_hi * 2) >= 4
+            {
+                self.rxx_ryy_parallel(step_lo, step_hi, cos, sin, RxxRyyKind::Rxx);
+                continue;
+            }
+
             // RXX matrix (in computational basis):
             // |00⟩ -> cos|00⟩ - i*sin|11⟩
             // |01⟩ -> cos|01⟩ - i*sin|10⟩
@@ -4428,6 +4777,15 @@ where
             let step_lo = 1 << lo;
             let step_hi = 1 << hi;
 
+            #[cfg(feature = "parallel")]
+            if self.parallel_enabled
+                && self.num_qubits >= Self::PARALLEL_THRESHOLD_QUBITS
+                && self.real.len() / (step_hi * 2) >= 4
+            {
+                self.rxx_ryy_parallel(step_lo, step_hi, cos, sin, RxxRyyKind::Ryy);
+                continue;
+            }
+
             // RYY matrix (in computational basis):
             // |00⟩ -> cos|00⟩ + i*sin|11⟩
             // |01⟩ -> cos|01⟩ - i*sin|10⟩
diff --git a/crates/pecos-simulators/tests/density_matrix_tests.rs b/crates/pecos-simulators/tests/density_matrix_tests.rs
index e87b555e6..6c5e62738 100644
--- a/crates/pecos-simulators/tests/density_matrix_tests.rs
+++ b/crates/pecos-simulators/tests/density_matrix_tests.rs
@@ -194,18 +194,40 @@ fn test_amplitude_damping() {
 
 #[test]
 fn test_phase_damping() {
-    // Test the concept of phase damping
-    // In reality, phase damping should cause decoherence
-
-    // Create a mixed state with both 0 and 1 components
     let mut dm = DensityMatrix::new(1);
     dm.prepare_maximally_mixed();
-
-    // Verify the state is mixed
     assert!(!dm.is_pure());
+}
 
-    // For now, we skip the detailed phase damping test since
-    // our implementation is simplified and mainly conceptual
+#[test]
+fn test_phase_damping_preserves_diagonal() {
+    // Regression: full dephasing of |+> yields I/2 so P(0) = P(1) = 0.5 and
+    // they must sum to 1. Earlier impl violated this by treating the Choi
+    // state as flattened rho for the update but as a purification for
+    // probability()/purity().
+    let mut dm = DensityMatrix::new(1);
+    dm.h(&qid(0));
+    dm.apply_phase_damping(0, 1.0);
+    let p0 = dm.probability(0);
+    let p1 = dm.probability(1);
+    assert!((p0 - 0.5).abs() < 1e-10, "P(0)={p0}");
+    assert!((p1 - 0.5).abs() < 1e-10, "P(1)={p1}");
+    assert!((p0 + p1 - 1.0).abs() < 1e-10);
+}
+
+#[test]
+fn test_amplitude_damping_preserves_trace() {
+    // Regression: partial amplitude damping on |+><+| should keep tr(rho)=1.
+    // Pre-Cholesky impl gave P(0)+P(1) ≈ 0.895 for gamma=0.3.
+    let mut dm = DensityMatrix::new(1);
+    dm.h(&qid(0));
+    dm.apply_amplitude_damping(0, 0.3);
+    let p0 = dm.probability(0);
+    let p1 = dm.probability(1);
+    assert!((p0 + p1 - 1.0).abs() < 1e-10, "tr = {} != 1", p0 + p1);
+    // rho_00 = 0.5 + 0.5*0.3 = 0.65, rho_11 = 0.5*0.7 = 0.35.
+    assert!((p0 - 0.65).abs() < 1e-10, "P(0)={p0} expected 0.65");
+    assert!((p1 - 0.35).abs() < 1e-10, "P(1)={p1} expected 0.35");
 }
 
 #[test]
diff --git a/crates/pecos-simulators/tests/flush_blocked_audit.rs b/crates/pecos-simulators/tests/flush_blocked_audit.rs
new file mode 100644
index 000000000..bdb5c6b36
--- /dev/null
+++ b/crates/pecos-simulators/tests/flush_blocked_audit.rs
@@ -0,0 +1,347 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+
+//! Correctness audit for `StateVecSoA::flush_blocked`.
+//!
+//! `flush_blocked` is the cache-blocked pending-gate flush path. It fires when
+//! `num_qubits >= 21` AND `pending_count >= 3`. It does two things:
+//!  1. Applies low-stride (q < `block_bits`, default 14) pending gates in a
+//!     block-by-block loop so each block is loaded from DRAM once.
+//!  2. Applies remaining high-stride pending gates individually, with an
+//!     adjacent-pair optimisation (`flush_pair`).
+//!
+//! These tests cross-check the blocked path against a `set_fusion(false)`
+//! reference that dispatches every gate immediately via `apply_fused_matrix`
+//! (no batching, no blocking). Amplitudes must agree to within 1e-10.
+//!
+//! N=21 → 32 MB per state vector. Each test creates a handful.
+
+use pecos_core::{Angle64, QubitId};
+use pecos_simulators::{ArbitraryRotationGateable, CliffordGateable, StateVecSoA};
+use rand::rngs::StdRng;
+use rand::{RngExt, SeedableRng};
+
+const N: usize = 21;
+
+/// A single-qubit-gate instruction (picks arbitrary 1q gates).
+#[derive(Clone, Copy)]
+enum Op1q {
+    H(usize),
+    X(usize),
+    Y(usize),
+    Z(usize),
+    Sx(usize),
+    Sxdg(usize),
+    Sy(usize),
+    Sydg(usize),
+    Sz(usize),
+    Szdg(usize),
+    T(usize),
+    Tdg(usize),
+    Rx(usize, f64),
+    Ry(usize, f64),
+    Rz(usize, f64),
+}
+
+fn gen_1q(rng: &mut StdRng, n: usize) -> Op1q {
+    let kind = rng.random_range(0u32..15);
+    let q = rng.random_range(0..n);
+    match kind {
+        0 => Op1q::H(q),
+        1 => Op1q::X(q),
+        2 => Op1q::Y(q),
+        3 => Op1q::Z(q),
+        4 => Op1q::Sx(q),
+        5 => Op1q::Sxdg(q),
+        6 => Op1q::Sy(q),
+        7 => Op1q::Sydg(q),
+        8 => Op1q::Sz(q),
+        9 => Op1q::Szdg(q),
+        10 => Op1q::T(q),
+        11 => Op1q::Tdg(q),
+        12 => Op1q::Rx(q, rng.random_range(-3.1..3.1)),
+        13 => Op1q::Ry(q, rng.random_range(-3.1..3.1)),
+        _ => Op1q::Rz(q, rng.random_range(-3.1..3.1)),
+    }
+}
+
+fn apply_1q(sim: &mut StateVecSoA, op: Op1q) {
+    match op {
+        Op1q::H(q) => {
+            sim.h(&[QubitId(q)]);
+        }
+        Op1q::X(q) => {
+            sim.x(&[QubitId(q)]);
+        }
+        Op1q::Y(q) => {
+            sim.y(&[QubitId(q)]);
+        }
+        Op1q::Z(q) => {
+            sim.z(&[QubitId(q)]);
+        }
+        Op1q::Sx(q) => {
+            sim.sx(&[QubitId(q)]);
+        }
+        Op1q::Sxdg(q) => {
+            sim.sxdg(&[QubitId(q)]);
+        }
+        Op1q::Sy(q) => {
+            sim.sy(&[QubitId(q)]);
+        }
+        Op1q::Sydg(q) => {
+            sim.sydg(&[QubitId(q)]);
+        }
+        Op1q::Sz(q) => {
+            sim.sz(&[QubitId(q)]);
+        }
+        Op1q::Szdg(q) => {
+            sim.szdg(&[QubitId(q)]);
+        }
+        Op1q::T(q) => {
+            sim.t(&[QubitId(q)]);
+        }
+        Op1q::Tdg(q) => {
+            sim.tdg(&[QubitId(q)]);
+        }
+        Op1q::Rx(q, t) => {
+            sim.rx(Angle64::from_radians(t), &[QubitId(q)]);
+        }
+        Op1q::Ry(q, t) => {
+            sim.ry(Angle64::from_radians(t), &[QubitId(q)]);
+        }
+        Op1q::Rz(q, t) => {
+            sim.rz(Angle64::from_radians(t), &[QubitId(q)]);
+        }
+    }
+}
+
+fn max_amp_diff(a: &mut StateVecSoA, b: &mut StateVecSoA) -> f64 {
+    let sa = a.state();
+    let sb = b.state();
+    assert_eq!(sa.len(), sb.len());
+    sa.iter()
+        .zip(sb.iter())
+        .map(|(x, y)| {
+            let dr = x.re - y.re;
+            let di = x.im - y.im;
+            (dr * dr + di * di).sqrt()
+        })
+        .fold(0.0, f64::max)
+}
+
+/// Build the reference and under-test simulators as two fresh `new(N)` states.
+/// `reference` uses `set_fusion(false)` so every gate goes through
+/// `apply_fused_matrix` immediately -- no batching, no blocking.
+/// `under_test` uses default (fusion on) so a large enough pending set
+/// triggers `flush_blocked`.
+fn two_sims() -> (StateVecSoA, StateVecSoA) {
+    let mut reference = StateVecSoA::new(N);
+    reference.set_fusion(false);
+    let under_test = StateVecSoA::new(N);
+    assert!(under_test.fusion_enabled(), "default should have fusion on");
+    (reference, under_test)
+}
+
+/// Force a full flush on the batched simulator before amplitudes are compared.
+/// This is a thin wrapper over the public `flush()` method; it exists so the
+/// tests name the intent (apply everything still pending) at each call site
+/// rather than relying on a side effect of some other gate or accessor.
+fn trigger_flush(sim: &mut StateVecSoA) {
+    sim.flush();
+}
+
+#[test]
+fn flush_blocked_single_qubit_fuzz() {
+    // Many random 1q gates on different qubits across the whole register.
+    // With fusion on + many pending gates + N=21, flush_blocked fires.
+    let mut rng = StdRng::seed_from_u64(0xdead_beef);
+    let ops: Vec<Op1q> = (0..60).map(|_| gen_1q(&mut rng, N)).collect();
+
+    let (mut reference, mut under_test) = two_sims();
+    for &op in &ops {
+        apply_1q(&mut reference, op);
+        apply_1q(&mut under_test, op);
+    }
+    // Reference was flushed per-gate (fusion off). under_test has pending gates.
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(
+        d < 1e-10,
+        "flush_blocked diverged from per-gate ref: max_diff={d:.3e}"
+    );
+}
+
+#[test]
+fn flush_blocked_only_low_stride() {
+    // Gates only on low-stride qubits (q < 14). Exercises the blocked path
+    // exclusively -- no high-stride cleanup pass.
+    let mut rng = StdRng::seed_from_u64(0x0_fedb_acca);
+    let ops: Vec<Op1q> = (0..80)
+        .map(|_| {
+            let mut op = gen_1q(&mut rng, 14);
+            // gen_1q(_, 14) already draws q from 0..14, so the clamp below never
+            // changes an op; it only guards H/X if that range is ever widened.
+            op = match op {
+                Op1q::H(q) => Op1q::H(q.min(13)),
+                Op1q::X(q) => Op1q::X(q.min(13)),
+                _ => op,
+            };
+            op
+        })
+        .collect();
+
+    let (mut reference, mut under_test) = two_sims();
+    for &op in &ops {
+        apply_1q(&mut reference, op);
+        apply_1q(&mut under_test, op);
+    }
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(d < 1e-10, "low-stride only: max_diff={d:.3e}");
+}
+
+#[test]
+fn flush_blocked_only_high_stride() {
+    // Gates only on high-stride qubits (q >= 14). flush_blocked's low-stride
+    // loop produces no work; only the pairing cleanup runs.
+    let mut rng = StdRng::seed_from_u64(0x00c0_ffee);
+    let ops: Vec<Op1q> = (0..40)
+        .map(|_| {
+            let base = gen_1q(&mut rng, 7); // maps 0..7
+            // Remap qubit to 14..21.
+            match base {
+                Op1q::H(q) => Op1q::H(q + 14),
+                Op1q::X(q) => Op1q::X(q + 14),
+                Op1q::Y(q) => Op1q::Y(q + 14),
+                Op1q::Z(q) => Op1q::Z(q + 14),
+                Op1q::Sx(q) => Op1q::Sx(q + 14),
+                Op1q::Sxdg(q) => Op1q::Sxdg(q + 14),
+                Op1q::Sy(q) => Op1q::Sy(q + 14),
+                Op1q::Sydg(q) => Op1q::Sydg(q + 14),
+                Op1q::Sz(q) => Op1q::Sz(q + 14),
+                Op1q::Szdg(q) => Op1q::Szdg(q + 14),
+                Op1q::T(q) => Op1q::T(q + 14),
+                Op1q::Tdg(q) => Op1q::Tdg(q + 14),
+                Op1q::Rx(q, t) => Op1q::Rx(q + 14, t),
+                Op1q::Ry(q, t) => Op1q::Ry(q + 14, t),
+                Op1q::Rz(q, t) => Op1q::Rz(q + 14, t),
+            }
+        })
+        .collect();
+
+    let (mut reference, mut under_test) = two_sims();
+    for &op in &ops {
+        apply_1q(&mut reference, op);
+        apply_1q(&mut under_test, op);
+    }
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(d < 1e-10, "high-stride only: max_diff={d:.3e}");
+}
+
+#[test]
+fn flush_blocked_boundary_qubits() {
+    // Exactly the qubits right at the low/high boundary: 12, 13, 14, 15.
+    // q=13 has step=8192 == block_size/2 (one outer iteration per block).
+    // q=14 is the first high-stride qubit.
+    let mut rng = StdRng::seed_from_u64(0x1337);
+    let ops: Vec<Op1q> = (0..50)
+        .map(|_| {
+            let q = 12 + rng.random_range(0u32..4) as usize;
+            match rng.random_range(0u32..5) {
+                0 => Op1q::H(q),
+                1 => Op1q::Sx(q),
+                2 => Op1q::Sz(q),
+                3 => Op1q::T(q),
+                _ => Op1q::Rz(q, rng.random_range(-3.1..3.1)),
+            }
+        })
+        .collect();
+
+    let (mut reference, mut under_test) = two_sims();
+    for &op in &ops {
+        apply_1q(&mut reference, op);
+        apply_1q(&mut under_test, op);
+    }
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(d < 1e-10, "boundary qubits: max_diff={d:.3e}");
+}
+
+#[test]
+fn flush_blocked_interleaved_with_cx() {
+    // Mixed: 1q gates (queued) interleaved with cx (which force
+    // flush_two_qubit on those qubits). This exercises the partial-flush
+    // paths and ensures flush_blocked still agrees when it fires for the
+    // remaining pending set.
+    let mut rng = StdRng::seed_from_u64(0xbeef);
+    let (mut reference, mut under_test) = two_sims();
+
+    for _ in 0..30 {
+        // Burst of 1q gates
+        for _ in 0..5 {
+            let op = gen_1q(&mut rng, N);
+            apply_1q(&mut reference, op);
+            apply_1q(&mut under_test, op);
+        }
+        // Random cx pair
+        let a = rng.random_range(0..N);
+        let mut b = rng.random_range(0..N);
+        while b == a {
+            b = rng.random_range(0..N);
+        }
+        reference.cx(&[(QubitId(a), QubitId(b))]);
+        under_test.cx(&[(QubitId(a), QubitId(b))]);
+    }
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(d < 1e-10, "interleaved: max_diff={d:.3e}");
+}
+
+#[test]
+fn flush_blocked_minimum_pending_count() {
+    // The threshold requires pending_count >= 3 AND num_qubits >= 21.
+    // With exactly 3 pending gates at N=21, flush_blocked MUST fire and
+    // produce identical output to the non-blocked path.
+    let (mut reference, mut under_test) = two_sims();
+
+    let ops = [Op1q::H(0), Op1q::Sz(10), Op1q::Rx(20, 0.7)];
+    for &op in &ops {
+        apply_1q(&mut reference, op);
+        apply_1q(&mut under_test, op);
+    }
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(d < 1e-10, "min pending: max_diff={d:.3e}");
+}
+
+#[test]
+fn flush_blocked_all_qubits_pending() {
+    // All 21 qubits with a single pending gate each -- maximum pending set.
+    let mut rng = StdRng::seed_from_u64(0x00ab_c123);
+    let (mut reference, mut under_test) = two_sims();
+
+    for q in 0..N {
+        // Pick a non-trivial single-qubit matrix via random Rz + H (cheap but
+        // not identity).
+        let theta = rng.random_range(0.1..3.0);
+        reference.h(&[QubitId(q)]);
+        reference.rz(Angle64::from_radians(theta), &[QubitId(q)]);
+        under_test.h(&[QubitId(q)]);
+        under_test.rz(Angle64::from_radians(theta), &[QubitId(q)]);
+    }
+    trigger_flush(&mut under_test);
+
+    let d = max_amp_diff(&mut reference, &mut under_test);
+    assert!(d < 1e-10, "all qubits pending: max_diff={d:.3e}");
+}
diff --git a/crates/pecos-simulators/tests/quest_density_matrix_comparison_tests.rs b/crates/pecos-simulators/tests/quest_density_matrix_comparison_tests.rs
deleted file mode 100644
index 912011db0..000000000
--- a/crates/pecos-simulators/tests/quest_density_matrix_comparison_tests.rs
+++ /dev/null
@@ -1,750 +0,0 @@
-//! Comparison tests between `DensityMatrix` and `QuEST`'s `QuestDensityMatrix`
-//!
-//! These tests verify that our `DensityMatrix` implementation produces the same
-//! results as the reference `QuEST` density matrix simulator.
-//!
-//! NOTE: `QuEST` has thread safety issues - run with --test-threads=1
-
-use pecos_core::{Angle64, QubitId, qid};
-use pecos_quest::QuestDensityMatrix;
-use pecos_random::PecosRng;
-use pecos_simulators::{
-    ArbitraryRotationGateable, CliffordGateable, DensityMatrix, QuantumSimulator,
-};
-use std::f64::consts::PI;
-
-const TOLERANCE: f64 = 1e-10;
-
-fn assert_close(a: f64, b: f64, msg: &str) {
-    assert!(
-        (a - b).abs() < TOLERANCE,
-        "{}: {} vs {} (diff: {})",
-        msg,
-        a,
-        b,
-        (a - b).abs()
-    );
-}
-
-/// Compare probabilities for all computational basis states between simulators
-fn compare_probabilities(
-    dm: &mut DensityMatrix,
-    qdm: &QuestDensityMatrix<PecosRng>,
-    num_qubits: usize,
-) {
-    for i in 0..(1 << num_qubits) {
-        let dm_prob = dm.probability(i);
-        let qdm_prob = qdm.probability(i);
-        assert_close(dm_prob, qdm_prob, &format!("probability({i})"));
-    }
-}
-
-/// Compare purity between simulators
-fn compare_purity(dm: &mut DensityMatrix, qdm: &QuestDensityMatrix<PecosRng>) {
-    let dm_purity = dm.purity();
-    let qdm_purity = qdm.purity();
-    assert_close(dm_purity, qdm_purity, "purity");
-}
-
-#[test]
-fn test_initial_state() {
-    let num_qubits = 2;
-    let mut dm = DensityMatrix::new(num_qubits);
-    let qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_x_gate() {
-    let num_qubits = 2;
-    let seed = 42;
-
-    let mut dm = DensityMatrix::with_seed(num_qubits, seed);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::with_seed(num_qubits, seed);
-
-    dm.x(&qid(0));
-    qdm.x(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-
-    dm.x(&qid(1));
-    qdm.x(&qid(1));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_y_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    dm.y(&qid(0));
-    qdm.y(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_z_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Z on |0> should leave it unchanged
-    dm.z(&qid(0));
-    qdm.z(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-
-    // Create superposition first, then apply Z
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    dm.z(&qid(0));
-    qdm.z(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_hadamard_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-
-    dm.h(&qid(1));
-    qdm.h(&qid(1));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_s_gate() {
-    let num_qubits = 1;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create |+> then apply S
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    dm.sz(&qid(0));
-    qdm.sz(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_sdg_gate() {
-    let num_qubits = 1;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    dm.szdg(&qid(0));
-    qdm.szdg(&qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_cx_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create Bell state
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_cz_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Put both qubits in superposition
-    dm.h(&qid(0));
-    dm.h(&qid(1));
-    qdm.h(&qid(0));
-    qdm.h(&qid(1));
-
-    dm.cz(&[(QubitId(0), QubitId(1))]);
-    qdm.cz(&[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_cy_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Set control to |1>
-    dm.x(&qid(0));
-    qdm.x(&qid(0));
-
-    dm.cy(&[(QubitId(0), QubitId(1))]);
-    qdm.cy(&[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_swap_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Put qubit 0 in |1>
-    dm.x(&qid(0));
-    qdm.x(&qid(0));
-
-    dm.swap(&[(QubitId(0), QubitId(1))]);
-    qdm.swap(&[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_rx_gate() {
-    let num_qubits = 1;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    dm.rx(Angle64::from_radians(PI / 4.0), &qid(0));
-    qdm.rx(Angle64::from_radians(PI / 4.0), &qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_ry_gate() {
-    let num_qubits = 1;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    dm.ry(Angle64::from_radians(PI / 3.0), &qid(0));
-    qdm.ry(Angle64::from_radians(PI / 3.0), &qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_ry_in_entangled_system() {
-    // Test RY on qubit 0 after creating entanglement
-    let num_qubits = 3;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create entanglement first
-    dm.h(&qid(0));
-    dm.h(&qid(1));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    dm.h(&qid(2));
-    dm.cx(&[(QubitId(1), QubitId(2))]);
-
-    qdm.h(&qid(0));
-    qdm.h(&qid(1));
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.h(&qid(2));
-    qdm.cx(&[(QubitId(1), QubitId(2))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-
-    // Now apply RY
-    dm.ry(Angle64::from_radians(PI / 5.0), &qid(0));
-    qdm.ry(Angle64::from_radians(PI / 5.0), &qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_rz_gate() {
-    let num_qubits = 1;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create superposition first
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    dm.rz(Angle64::from_radians(PI / 6.0), &qid(0));
-    qdm.rz(Angle64::from_radians(PI / 6.0), &qid(0));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_rzz_gate() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create superposition on both qubits
-    dm.h(&qid(0));
-    dm.h(&qid(1));
-    qdm.h(&qid(0));
-    qdm.h(&qid(1));
-
-    dm.rzz(Angle64::from_radians(PI / 4.0), &[(QubitId(0), QubitId(1))]);
-    qdm.rzz(Angle64::from_radians(PI / 4.0), &[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_bell_state() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create Bell state |Phi+> = (|00> + |11>)/sqrt(2)
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.h(&qid(0));
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-
-    // Should be a pure state
-    assert_close(dm.purity(), 1.0, "Bell state purity");
-}
-
-#[test]
-fn test_ghz_state() {
-    let num_qubits = 3;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create GHZ state (|000> + |111>)/sqrt(2)
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    dm.cx(&[(QubitId(1), QubitId(2))]);
-    qdm.h(&qid(0));
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.cx(&[(QubitId(1), QubitId(2))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_complex_circuit() {
-    let num_qubits = 3;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Apply a complex sequence of gates
-    dm.h(&qid(0));
-    dm.h(&qid(1));
-    dm.cx(&[(QubitId(0), QubitId(2))]);
-    dm.rz(Angle64::from_radians(PI / 4.0), &qid(1));
-    dm.cy(&[(QubitId(1), QubitId(0))]);
-    dm.rx(Angle64::from_radians(PI / 3.0), &qid(2));
-    dm.cz(&[(QubitId(0), QubitId(1))]);
-
-    qdm.h(&qid(0));
-    qdm.h(&qid(1));
-    qdm.cx(&[(QubitId(0), QubitId(2))]);
-    qdm.rz(Angle64::from_radians(PI / 4.0), &qid(1));
-    qdm.cy(&[(QubitId(1), QubitId(0))]);
-    qdm.rx(Angle64::from_radians(PI / 3.0), &qid(2));
-    qdm.cz(&[(QubitId(0), QubitId(1))]);
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_reset() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create some state
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.h(&qid(0));
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-
-    // Reset
-    dm.reset();
-    qdm.reset();
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_measurement_deterministic() {
-    let num_qubits = 1;
-    let seed = 42;
-
-    let mut dm = DensityMatrix::with_seed(num_qubits, seed);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::with_seed(num_qubits, seed);
-
-    // Deterministic measurement on |0>
-    let dm_result = dm.mz(&qid(0)).into_iter().next().unwrap();
-    let qdm_result = qdm.mz(&qid(0)).into_iter().next().unwrap();
-
-    assert_eq!(
-        dm_result.outcome, qdm_result.outcome,
-        "measurement outcome mismatch"
-    );
-    assert_eq!(
-        dm_result.is_deterministic, qdm_result.is_deterministic,
-        "determinism mismatch"
-    );
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-}
-
-#[test]
-fn test_measurement_superposition() {
-    // For superposition states, we can't guarantee same outcomes without same RNG
-    // But we can verify post-measurement states are valid
-    let num_qubits = 1;
-    let seed = 12345;
-
-    let mut dm = DensityMatrix::with_seed(num_qubits, seed);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::with_seed(num_qubits, seed);
-
-    // Create superposition
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-
-    // Both should report 50/50 probabilities before measurement
-    assert_close(dm.probability(0), 0.5, "pre-measurement prob 0");
-    assert_close(dm.probability(1), 0.5, "pre-measurement prob 1");
-    assert_close(qdm.probability(0), 0.5, "quest pre-measurement prob 0");
-    assert_close(qdm.probability(1), 0.5, "quest pre-measurement prob 1");
-
-    // After measurement, state should be collapsed
-    let _dm_result = dm.mz(&qid(0));
-    let _qdm_result = qdm.mz(&qid(0));
-
-    // Both should be in a definite state after measurement
-    let dm_prob0 = dm.probability(0);
-    let dm_prob1 = dm.probability(1);
-    let qdm_prob0 = qdm.probability(0);
-    let qdm_prob1 = qdm.probability(1);
-
-    // One probability should be ~1, other ~0
-    assert!(
-        (dm_prob0 > 0.99 && dm_prob1 < 0.01) || (dm_prob0 < 0.01 && dm_prob1 > 0.99),
-        "DensityMatrix not collapsed: p0={dm_prob0}, p1={dm_prob1}"
-    );
-    assert!(
-        (qdm_prob0 > 0.99 && qdm_prob1 < 0.01) || (qdm_prob0 < 0.01 && qdm_prob1 > 0.99),
-        "QuestDensityMatrix not collapsed: p0={qdm_prob0}, p1={qdm_prob1}"
-    );
-}
-
-#[test]
-fn test_purity_pure_state() {
-    let num_qubits = 2;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Various pure states should all have purity 1
-    compare_purity(&mut dm, &qdm);
-    assert_close(dm.purity(), 1.0, "initial purity");
-
-    dm.h(&qid(0));
-    qdm.h(&qid(0));
-    compare_purity(&mut dm, &qdm);
-    assert_close(dm.purity(), 1.0, "superposition purity");
-
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-    compare_purity(&mut dm, &qdm);
-    assert_close(dm.purity(), 1.0, "entangled purity");
-}
-
-#[test]
-fn test_rotation_angles() {
-    let num_qubits = 1;
-
-    // Test various rotation angles
-    let angles = [
-        0.0,
-        PI / 8.0,
-        PI / 4.0,
-        PI / 2.0,
-        PI,
-        3.0 * PI / 2.0,
-        2.0 * PI,
-    ];
-
-    for &theta in &angles {
-        let mut dm = DensityMatrix::new(num_qubits);
-        let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-        dm.rx(Angle64::from_radians(theta), &qid(0));
-        qdm.rx(Angle64::from_radians(theta), &qid(0));
-
-        compare_probabilities(&mut dm, &qdm, num_qubits);
-    }
-}
-
-#[test]
-fn test_larger_system_4_qubits() {
-    let num_qubits = 4;
-
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Create a complex entangled state
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    dm.h(&qid(2));
-    dm.cx(&[(QubitId(2), QubitId(3))]);
-    dm.cz(&[(QubitId(1), QubitId(2))]);
-    dm.rx(Angle64::from_radians(PI / 3.0), &qid(0));
-    dm.ry(Angle64::from_radians(PI / 4.0), &qid(3));
-
-    qdm.h(&qid(0));
-    qdm.cx(&[(QubitId(0), QubitId(1))]);
-    qdm.h(&qid(2));
-    qdm.cx(&[(QubitId(2), QubitId(3))]);
-    qdm.cz(&[(QubitId(1), QubitId(2))]);
-    qdm.rx(Angle64::from_radians(PI / 3.0), &qid(0));
-    qdm.ry(Angle64::from_radians(PI / 4.0), &qid(3));
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_density_matrix_trace_is_one() {
-    let num_qubits = 2;
-    let mut dm = DensityMatrix::new(num_qubits);
-
-    // Apply various operations
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    dm.rz(Angle64::from_radians(PI / 5.0), &qid(0));
-
-    // Check trace = sum of probabilities = 1
-    let mut trace = 0.0;
-    for i in 0..(1 << num_qubits) {
-        trace += dm.probability(i);
-    }
-    assert_close(trace, 1.0, "trace should be 1");
-}
-
-#[test]
-fn test_density_matrix_is_hermitian() {
-    let num_qubits = 2;
-    let mut dm = DensityMatrix::new(num_qubits);
-
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    dm.sz(&qid(1));
-
-    let rho = dm.get_density_matrix();
-
-    // Check rho[i][j] == rho[j][i].conj()
-    for (i, rho_row) in rho.iter().enumerate() {
-        for (j, rho_ij) in rho_row.iter().enumerate() {
-            let diff = (rho_ij - rho[j][i].conj()).norm();
-            assert!(
-                diff < TOLERANCE,
-                "Not Hermitian at ({},{}): {} vs {}",
-                i,
-                j,
-                rho_ij,
-                rho[j][i].conj()
-            );
-        }
-    }
-}
-
-#[test]
-fn test_density_matrix_probabilities_sum_to_one() {
-    let num_qubits = 3;
-    let mut dm = DensityMatrix::new(num_qubits);
-
-    // Create GHZ-like state
-    dm.h(&qid(0));
-    dm.cx(&[(QubitId(0), QubitId(1))]);
-    dm.cx(&[(QubitId(1), QubitId(2))]);
-
-    let mut sum = 0.0;
-    for i in 0..(1 << num_qubits) {
-        let prob = dm.probability(i);
-        assert!(prob >= -TOLERANCE, "Negative probability at {i}: {prob}");
-        sum += prob;
-    }
-    assert_close(sum, 1.0, "probabilities should sum to 1");
-}
-
-#[test]
-fn test_random_circuit_comparison() {
-    // Test a pseudo-random circuit to catch edge cases
-    let num_qubits = 3;
-    let mut dm = DensityMatrix::new(num_qubits);
-    let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-    // Sequence of gates that exercises many code paths
-    let ops: Vec<(&str, usize, usize)> = vec![
-        ("h", 0, 0),
-        ("h", 1, 0),
-        ("cx", 0, 1),
-        ("rz", 2, 0),
-        ("h", 2, 0),
-        ("cx", 1, 2),
-        ("ry", 0, 0),
-        ("cz", 0, 2),
-        ("rx", 1, 0),
-        ("swap", 0, 1),
-        ("cy", 1, 2),
-    ];
-
-    for (op, q1, q_2) in &ops {
-        match *op {
-            "h" => {
-                dm.h(&qid(*q1));
-                qdm.h(&qid(*q1));
-            }
-            "cx" => {
-                dm.cx(&[(QubitId(*q1), QubitId(*q_2))]);
-                qdm.cx(&[(QubitId(*q1), QubitId(*q_2))]);
-            }
-            "cy" => {
-                dm.cy(&[(QubitId(*q1), QubitId(*q_2))]);
-                qdm.cy(&[(QubitId(*q1), QubitId(*q_2))]);
-            }
-            "cz" => {
-                dm.cz(&[(QubitId(*q1), QubitId(*q_2))]);
-                qdm.cz(&[(QubitId(*q1), QubitId(*q_2))]);
-            }
-            "swap" => {
-                dm.swap(&[(QubitId(*q1), QubitId(*q_2))]);
-                qdm.swap(&[(QubitId(*q1), QubitId(*q_2))]);
-            }
-            "rx" => {
-                dm.rx(Angle64::from_radians(PI / 7.0), &qid(*q1));
-                qdm.rx(Angle64::from_radians(PI / 7.0), &qid(*q1));
-            }
-            "ry" => {
-                dm.ry(Angle64::from_radians(PI / 5.0), &qid(*q1));
-                qdm.ry(Angle64::from_radians(PI / 5.0), &qid(*q1));
-            }
-            "rz" => {
-                dm.rz(Angle64::from_radians(PI / 3.0), &qid(*q1));
-                qdm.rz(Angle64::from_radians(PI / 3.0), &qid(*q1));
-            }
-            _ => {}
-        }
-    }
-
-    compare_probabilities(&mut dm, &qdm, num_qubits);
-    compare_purity(&mut dm, &qdm);
-}
-
-#[test]
-fn test_all_single_qubit_gates() {
-    // Comprehensive test of all single qubit gates
-    let num_qubits = 1;
-
-    let gates: Vec<&str> = vec!["x", "y", "z", "h", "s", "sdg", "sx", "sxdg", "sy", "sydg"];
-
-    for gate in gates {
-        let mut dm = DensityMatrix::new(num_qubits);
-        let mut qdm: QuestDensityMatrix<PecosRng> = QuestDensityMatrix::new(num_qubits);
-
-        // Start from |+> state for more interesting results
-        dm.h(&qid(0));
-        qdm.h(&qid(0));
-
-        match gate {
-            "x" => {
-                dm.x(&qid(0));
-                qdm.x(&qid(0));
-            }
-            "y" => {
-                dm.y(&qid(0));
-                qdm.y(&qid(0));
-            }
-            "z" => {
-                dm.z(&qid(0));
-                qdm.z(&qid(0));
-            }
-            "h" => {
-                dm.h(&qid(0));
-                qdm.h(&qid(0));
-            }
-            "s" => {
-                dm.sz(&qid(0));
-                qdm.sz(&qid(0));
-            }
-            "sdg" => {
-                dm.szdg(&qid(0));
-                qdm.szdg(&qid(0));
-            }
-            "sx" => {
-                dm.sx(&qid(0));
-                qdm.sx(&qid(0));
-            }
-            "sxdg" => {
-                dm.sxdg(&qid(0));
-                qdm.sxdg(&qid(0));
-            }
-            "sy" => {
-                dm.sy(&qid(0));
-                qdm.sy(&qid(0));
-            }
-            "sydg" => {
-                dm.sydg(&qid(0));
-                qdm.sydg(&qid(0));
-            }
-            _ => {}
-        }
-
-        compare_probabilities(&mut dm, &qdm, num_qubits);
-    }
-}
diff --git a/crates/pecos/Cargo.toml b/crates/pecos/Cargo.toml
index 6a8b31905..85c779ced 100644
--- a/crates/pecos/Cargo.toml
+++ b/crates/pecos/Cargo.toml
@@ -37,8 +37,6 @@ serde_json = { workspace = true, optional = true }
 
 # Quantum simulator backends (optional - for Python bindings and advanced users)
 pecos-cppsparsestab = { workspace = true, optional = true }
-pecos-quest = { workspace = true, optional = true }
-pecos-qulacs = { workspace = true, optional = true }
 
 # WebAssembly foreign object support (optional)
 pecos-wasm = { workspace = true, optional = true }
@@ -89,9 +87,7 @@ wasm = ["sim", "dep:pecos-wasm", "pecos-wasm/wasm"]
 
 # Quantum simulator backends (C++ wrappers)
 cppsparsestab = ["sim", "dep:pecos-cppsparsestab"]
-quest = ["sim", "dep:pecos-quest"]
-qulacs = ["sim", "dep:pecos-qulacs"]
-all-simulators = ["cppsparsestab", "quest", "qulacs"]
+all-simulators = ["cppsparsestab"]
 
 # Decoder backends
 ldpc = ["sim", "dep:pecos-decoders", "pecos-decoders/ldpc"]
@@ -126,10 +122,6 @@ serde_json.workspace = true
 name = "unified_sim_api_test"
 required-features = ["runtime", "qis"]
 
-[[example]]
-name = "quest_example"
-required-features = ["quest"]
-
 [[example]]
 name = "sim_api_final"
 required-features = ["runtime", "qis"]
diff --git a/crates/pecos/examples/quest_example.rs b/crates/pecos/examples/quest_example.rs
deleted file mode 100644
index 937dc98dd..000000000
--- a/crates/pecos/examples/quest_example.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-//! Example demonstrating the Quest quantum simulator API with CPU and GPU support
-//!
-//! This example shows how to use the Quest state vector and density matrix simulators
-//! with the PECOS `sim()` API, including CPU and GPU mode selection.
-
-use pecos::prelude::*;
-use pecos::{quest_state_vec, sim};
-
-fn main() -> Result<(), Box<dyn std::error::Error>> {
-    // Create a simple QASM program that creates a Bell state
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    println!("==== Quest State Vector Simulation (CPU) ====");
-    // Use Quest state vector simulator with CPU mode (default)
-    let results = sim(program.clone())
-        .quantum(quest_state_vec().with_cpu())
-        .seed(42)
-        .run(100)?;
-
-    println!("Ran 100 shots with Quest state vector (CPU)");
-    let shot_map = results.try_as_shot_map()?;
-    let measurements = shot_map.try_bits_as_u64("c")?;
-    let zeros = measurements.iter().filter(|&&x| x == 0).count();
-    let ones = measurements.iter().filter(|&&x| x == 3).count();
-    println!("Results: |00⟩: {zeros}, |11⟩: {ones}");
-
-    // Demonstrate GPU mode (requires CUDA to be available at runtime)
-    println!("\n==== Quest State Vector Simulation (GPU) ====");
-    match sim(program)
-        .quantum(quest_state_vec().with_gpu())
-        .seed(42)
-        .run(100)
-    {
-        Ok(results_gpu) => {
-            println!("Ran 100 shots with Quest state vector (GPU)");
-            let shot_map_gpu = results_gpu.try_as_shot_map()?;
-            let measurements_gpu = shot_map_gpu.try_bits_as_u64("c")?;
-            let zeros_gpu = measurements_gpu.iter().filter(|&&x| x == 0).count();
-            let ones_gpu = measurements_gpu.iter().filter(|&&x| x == 3).count();
-            println!("Results: |00⟩: {zeros_gpu}, |11⟩: {ones_gpu}");
-        }
-        Err(e) => {
-            println!("GPU mode not available (CUDA not detected at runtime): {e}");
-        }
-    }
-
-    Ok(())
-}
diff --git a/crates/pecos/src/lib.rs b/crates/pecos/src/lib.rs
index 4afd16266..8258552ff 100644
--- a/crates/pecos/src/lib.rs
+++ b/crates/pecos/src/lib.rs
@@ -76,14 +76,6 @@ pub mod simulators {
         SparseStabEngineBuilder, StabilizerEngineBuilder, StateVectorEngineBuilder, clifford_rz,
         density_matrix, sparse_stab, stabilizer, state_vector,
     };
-    #[cfg(feature = "quest")]
-    pub use pecos_quest::{
-        QuestDensityMatrix, QuestDensityMatrixEngine, QuestDensityMatrixEngineBuilder,
-        QuestStateVec, QuestStateVecEngine, QuestStateVectorEngineBuilder, quest_density_matrix,
-        quest_state_vec,
-    };
-    #[cfg(feature = "qulacs")]
-    pub use pecos_qulacs::QulacsStateVec;
     pub use pecos_simulators::*;
 }
 
@@ -230,13 +222,6 @@ pub use pecos_qis::{
 };
 #[cfg(feature = "qis")]
 pub use pecos_qis::{QisEngineBuilder, qis_engine, setup_qis_engine_with_runtime};
-#[cfg(feature = "quest")]
-pub use pecos_quest::{
-    QuestDensityMatrix, QuestDensityMatrixEngine, QuestDensityMatrixEngineBuilder, QuestStateVec,
-    QuestStateVecEngine, QuestStateVectorEngineBuilder, quest_density_matrix, quest_state_vec,
-};
-#[cfg(feature = "qulacs")]
-pub use pecos_qulacs::QulacsStateVec;
 #[cfg(feature = "wasm")]
 pub use pecos_wasm::{ForeignObject, WasmForeignObject};
 #[cfg(feature = "runtime")]
diff --git a/crates/pecos/src/prelude.rs b/crates/pecos/src/prelude.rs
index f087c76dd..d2e35d6b7 100644
--- a/crates/pecos/src/prelude.rs
+++ b/crates/pecos/src/prelude.rs
@@ -56,7 +56,7 @@
 //!
 //! - Unified simulation API: `sim()`, `SimBuilderExt`
 //! - Program utilities: `detect_program_type()`, etc.
-//! - Feature-gated quantum backends: `CppSparseStab`, `QuestStateVec`, etc.
+//! - Feature-gated quantum backends: `CppSparseStab`, etc.
 //!
 //! For organized access to specific functionality, use the namespace modules:
 //!
@@ -131,12 +131,6 @@ pub use crate::unified_sim::{ProgrammedSimBuilder, SimBuilderExt, sim};
 #[cfg(feature = "cppsparsestab")]
 pub use pecos_cppsparsestab::CppSparseStab;
 
-#[cfg(feature = "quest")]
-pub use pecos_quest::{QuestDensityMatrix, QuestStateVec};
-
-#[cfg(feature = "qulacs")]
-pub use pecos_qulacs::QulacsStateVec;
-
 // ============================================================================
 // WebAssembly foreign object support
 // ============================================================================
diff --git a/crates/pecos/tests/quest_sim_test.rs b/crates/pecos/tests/quest_sim_test.rs
deleted file mode 100644
index 66e09b98f..000000000
--- a/crates/pecos/tests/quest_sim_test.rs
+++ /dev/null
@@ -1,350 +0,0 @@
-//! Tests for Quest quantum simulator integration with `sim()` API
-
-#![cfg(all(feature = "runtime", feature = "quest"))]
-
-use pecos::{quest_density_matrix, quest_state_vec, sim};
-use pecos_programs::Qasm;
-
-/// Test Quest state vector with CPU mode
-#[test]
-fn test_quest_state_vec_cpu() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Test CPU mode
-    let results = sim(program)
-        .quantum(quest_state_vec().with_cpu())
-        .seed(42)
-        .run(100)
-        .expect("Simulation should succeed");
-
-    assert_eq!(results.len(), 100, "Should get 100 shots");
-
-    // Verify we got Bell state results (only |00⟩ and |11⟩)
-    let shot_map = results
-        .try_as_shot_map()
-        .expect("Should convert to shot map");
-    let measurements = shot_map
-        .try_bits_as_u64("c")
-        .expect("Should extract measurements");
-
-    for &measurement in &measurements {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "Bell state should only produce |00⟩ (0) or |11⟩ (3), got {measurement}"
-        );
-    }
-}
-
-/// Test Quest state vector with GPU mode (requires CUDA at runtime)
-#[test]
-#[ignore = "requires CUDA at runtime"]
-fn test_quest_state_vec_gpu() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Test GPU mode
-    let results = sim(program)
-        .quantum(quest_state_vec().with_gpu())
-        .seed(42)
-        .run(100)
-        .expect("Simulation should succeed");
-
-    assert_eq!(results.len(), 100, "Should get 100 shots");
-
-    // Verify we got Bell state results
-    let shot_map = results
-        .try_as_shot_map()
-        .expect("Should convert to shot map");
-    let measurements = shot_map
-        .try_bits_as_u64("c")
-        .expect("Should extract measurements");
-
-    for &measurement in &measurements {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "Bell state should only produce |00⟩ (0) or |11⟩ (3), got {measurement}"
-        );
-    }
-}
-
-/// Test Quest density matrix with CPU mode
-#[test]
-fn test_quest_density_matrix_cpu() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Test CPU mode
-    let results = sim(program)
-        .quantum(quest_density_matrix().with_cpu())
-        .seed(42)
-        .run(100)
-        .expect("Simulation should succeed");
-
-    assert_eq!(results.len(), 100, "Should get 100 shots");
-
-    // Verify we got Bell state results
-    let shot_map = results
-        .try_as_shot_map()
-        .expect("Should convert to shot map");
-    let measurements = shot_map
-        .try_bits_as_u64("c")
-        .expect("Should extract measurements");
-
-    for &measurement in &measurements {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "Bell state should only produce |00⟩ (0) or |11⟩ (3), got {measurement}"
-        );
-    }
-}
-
-/// Test Quest density matrix with GPU mode returns appropriate error
-/// (GPU density matrix simulation is not yet implemented in `QuEST`)
-#[test]
-fn test_quest_density_matrix_gpu() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // GPU density matrix simulation is not yet implemented, so this should return an error
-    let result = sim(program)
-        .quantum(quest_density_matrix().with_gpu())
-        .seed(42)
-        .run(100);
-
-    // Verify we get the expected error about GPU density matrix not being implemented
-    assert!(result.is_err(), "GPU density matrix should return an error");
-    let err_msg = result.unwrap_err().to_string();
-    assert!(
-        err_msg.contains("density matrix") && err_msg.contains("not yet implemented"),
-        "Error should mention density matrix GPU not implemented, got: {err_msg}"
-    );
-}
-
-/// Test that Quest works with different circuit types
-#[test]
-fn test_quest_various_gates() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[3];
-        creg c[3];
-        h q[0];
-        t q[0];
-        x q[1];
-        y q[2];
-        z q[0];
-        rx(1.5708) q[1];
-        ry(1.5708) q[2];
-        rz(1.5708) q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Test with Quest state vector
-    let results = sim(program)
-        .quantum(quest_state_vec().with_cpu())
-        .seed(42)
-        .run(10)
-        .expect("Simulation should succeed");
-
-    assert_eq!(results.len(), 10, "Should get 10 shots");
-}
-
-/// Test that Quest works with seed for reproducibility
-///
-/// Note: Due to `QuEST`'s global environment design, perfect reproducibility
-/// across separate `sim()` calls may not be guaranteed. This test verifies
-/// that the seed parameter is accepted and affects the results.
-#[test]
-fn test_quest_seed_parameter() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Run with one seed
-    let results1 = sim(program.clone())
-        .quantum(quest_state_vec().with_cpu())
-        .seed(123)
-        .run(50)
-        .expect("Simulation should succeed");
-
-    // Run with different seed
-    let results2 = sim(program)
-        .quantum(quest_state_vec().with_cpu())
-        .seed(456)
-        .run(50)
-        .expect("Simulation should succeed");
-
-    // Just verify both completed successfully
-    assert_eq!(results1.len(), 50, "Should get 50 shots with seed 123");
-    assert_eq!(results2.len(), 50, "Should get 50 shots with seed 456");
-
-    // Verify we got valid Bell state results from both
-    let shot_map1 = results1
-        .try_as_shot_map()
-        .expect("Should convert to shot map");
-    let shot_map2 = results2
-        .try_as_shot_map()
-        .expect("Should convert to shot map");
-
-    let measurements1 = shot_map1
-        .try_bits_as_u64("c")
-        .expect("Should extract measurements");
-    let measurements2 = shot_map2
-        .try_bits_as_u64("c")
-        .expect("Should extract measurements");
-
-    // Both should only produce valid Bell state outcomes
-    for &measurement in &measurements1 {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "Bell state should only produce |00⟩ or |11⟩"
-        );
-    }
-    for &measurement in &measurements2 {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "Bell state should only produce |00⟩ or |11⟩"
-        );
-    }
-}
-
-/// Test that Quest builder can be used with `qubits()` method
-#[test]
-fn test_quest_builder_with_qubits() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Test that qubits() method works (though it gets overridden by program)
-    let results = sim(program)
-        .quantum(quest_state_vec().qubits(2).with_cpu())
-        .seed(42)
-        .run(10)
-        .expect("Simulation should succeed");
-
-    assert_eq!(results.len(), 10, "Should get 10 shots");
-}
-
-/// Test that both CPU and GPU modes work correctly
-///
-/// Note: Due to potential differences in RNG implementation between CPU and GPU,
-/// we verify that both modes produce valid results rather than identical results.
-#[test]
-#[ignore = "requires CUDA at runtime"]
-#[allow(clippy::similar_names)]
-fn test_quest_cpu_and_gpu_both_work() {
-    let qasm_code = r#"
-        OPENQASM 2.0;
-        include "qelib1.inc";
-        qreg q[2];
-        creg c[2];
-        h q[0];
-        cx q[0], q[1];
-        measure q -> c;
-    "#;
-
-    let program = Qasm::from_string(qasm_code);
-
-    // Run with CPU
-    let results_cpu = sim(program.clone())
-        .quantum(quest_state_vec().with_cpu())
-        .seed(999)
-        .run(50)
-        .expect("CPU simulation should succeed");
-
-    // Run with GPU
-    let results_gpu = sim(program)
-        .quantum(quest_state_vec().with_gpu())
-        .seed(999)
-        .run(50)
-        .expect("GPU simulation should succeed");
-
-    // Verify both got the right number of shots
-    assert_eq!(results_cpu.len(), 50, "CPU should get 50 shots");
-    assert_eq!(results_gpu.len(), 50, "GPU should get 50 shots");
-
-    // Convert to shot maps
-    let shot_map_cpu = results_cpu
-        .try_as_shot_map()
-        .expect("Should convert CPU results to shot map");
-    let shot_map_gpu = results_gpu
-        .try_as_shot_map()
-        .expect("Should convert GPU results to shot map");
-
-    let measurements_cpu = shot_map_cpu
-        .try_bits_as_u64("c")
-        .expect("Should extract CPU measurements");
-    let measurements_gpu = shot_map_gpu
-        .try_bits_as_u64("c")
-        .expect("Should extract GPU measurements");
-
-    // Both should produce valid Bell state results
-    for &measurement in &measurements_cpu {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "CPU Bell state should only produce |00⟩ or |11⟩, got {measurement}"
-        );
-    }
-    for &measurement in &measurements_gpu {
-        assert!(
-            measurement == 0 || measurement == 3,
-            "GPU Bell state should only produce |00⟩ or |11⟩, got {measurement}"
-        );
-    }
-}
diff --git a/python/pecos-rslib/Cargo.toml b/python/pecos-rslib/Cargo.toml
index c9ae09824..fe1c44197 100644
--- a/python/pecos-rslib/Cargo.toml
+++ b/python/pecos-rslib/Cargo.toml
@@ -47,8 +47,6 @@ pecos-wasm = { workspace = true, optional = true }
 
 # C++ simulator backends
 pecos-cppsparsestab.workspace = true
-pecos-quest.workspace = true
-pecos-qulacs.workspace = true
 
 # Decoders (all backends)
 pecos-decoders = { workspace = true, features = ["all"] }
diff --git a/python/pecos-rslib/examples/quest_simulator.py b/python/pecos-rslib/examples/quest_simulator.py
deleted file mode 100755
index d817d5a7c..000000000
--- a/python/pecos-rslib/examples/quest_simulator.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-"""Test script for QuEST simulators exposed to Python via pecos-rslib"""
-
-import math
-
-from pecos_rslib import QuestDensityMatrix, QuestStateVec
-
-
-def test_quest_statevec() -> None:
-    """Test the QuEST state vector simulator"""
-    print("Testing QuEST State Vector Simulator")
-    print("=" * 40)
-
-    # Create a 2-qubit state vector simulator
-    sim = QuestStateVec(2)
-    print(f"Created simulator: {sim}")
-    print(f"Number of qubits: {sim.num_qubits()}")
-
-    # Test initial state |00⟩
-    print("\nInitial state |00⟩:")
-    prob00 = sim.probability(0b00)
-    print(f"  Probability of |00⟩: {prob00:.4f}")
-    amp00 = sim.get_amplitude(0b00)
-    print(f"  Amplitude of |00⟩: {amp00[0]:.4f} + {amp00[1]:.4f}i")
-
-    # Apply Hadamard to qubit 0
-    print("\nApplying H(0)...")
-    sim.run_1q_gate("H", 0)
-
-    # Check probabilities after H
-    print("After H(0):")
-    for i in range(4):
-        prob = sim.probability(i)
-        state = f"|{i:02b}⟩"
-        print(f"  Probability of {state}: {prob:.4f}")
-
-    # Apply CNOT(0, 1) to create Bell state
-    print("\nApplying CNOT(0, 1)...")
-    sim.run_2q_gate("CX", (0, 1))
-
-    # Check Bell state
-    print("Bell state |Φ+⟩ = (|00⟩ + |11⟩)/√2:")
-    for i in range(4):
-        prob = sim.probability(i)
-        amp = sim.get_amplitude(i)
-        state = f"|{i:02b}⟩"
-        print(f"  {state}: prob={prob:.4f}, amp=({amp[0]:.4f}, {amp[1]:.4f})")
-
-    # Test measurement
-    print("\nPerforming measurements:")
-    for _ in range(5):
-        sim.reset()
-        sim.run_1q_gate("H", 0)
-        sim.run_2q_gate("CX", (0, 1))
-
-        result0 = sim.run_1q_gate("MZ", 0)
-        result1 = sim.run_1q_gate("MZ", 1)
-        print(f"  Measured: qubit 0 = {result0}, qubit 1 = {result1}")
-
-    # Test rotation gates
-    print("\nTesting rotation gates:")
-    sim.reset()
-    sim.run_1q_gate("RX", 0, {"angle": math.pi / 4})
-    prob0 = sim.probability(0)
-    prob1 = sim.probability(1)
-    print(f"  After RX(π/4) on |0⟩: P(|0⟩)={prob0:.4f}, P(|1⟩)={prob1:.4f}")
-
-    sim.reset()
-    sim.run_1q_gate("RY", 0, {"angle": math.pi / 2})
-    amp0 = sim.get_amplitude(0)
-    amp1 = sim.get_amplitude(1)
-    print("  After RY(π/2) on |0⟩:")
-    print(f"    |0⟩ amplitude: ({amp0[0]:.4f}, {amp0[1]:.4f})")
-    print(f"    |1⟩ amplitude: ({amp1[0]:.4f}, {amp1[1]:.4f})")
-
-
-def test_quest_density_matrix() -> None:
-    """Test the QuEST density matrix simulator"""
-    print("\n\nTesting QuEST Density Matrix Simulator")
-    print("=" * 40)
-
-    # Create a 2-qubit density matrix simulator
-    sim = QuestDensityMatrix(2)
-    print(f"Created simulator: {sim}")
-    print(f"Number of qubits: {sim.num_qubits()}")
-
-    # Test initial state |00⟩⟨00|
-    print("\nInitial state |00⟩⟨00|:")
-    prob00 = sim.probability(0b00)
-    print(f"  Probability of |00⟩: {prob00:.4f}")
-
-    # Apply gates to create mixed state
-    print("\nApplying H(0) and X(1)...")
-    sim.run_1q_gate("H", 0)
-    sim.run_1q_gate("X", 1)
-
-    # Check probabilities
-    print("After H(0) and X(1):")
-    for i in range(4):
-        prob = sim.probability(i)
-        state = f"|{i:02b}⟩"
-        print(f"  Probability of {state}: {prob:.4f}")
-
-    # Test two-qubit gates
-    print("\nResetting and creating entangled state...")
-    sim.reset()
-    sim.run_1q_gate("H", 0)
-    sim.run_2q_gate("CX", (0, 1))
-
-    print("After H(0) and CNOT(0,1):")
-    for i in range(4):
-        prob = sim.probability(i)
-        state = f"|{i:02b}⟩"
-        print(f"  Probability of {state}: {prob:.4f}")
-
-    # Test measurement
-    print("\nPerforming measurement on qubit 0:")
-    result = sim.run_1q_gate("MZ", 0)
-    print(f"  Measured: {result}")
-
-    print("\nState after measurement:")
-    for i in range(4):
-        prob = sim.probability(i)
-        state = f"|{i:02b}⟩"
-        print(f"  Probability of {state}: {prob:.4f}")
-
-
-if __name__ == "__main__":
-    test_quest_statevec()
-    test_quest_density_matrix()
-    print("\nAll tests completed successfully!")
diff --git a/python/pecos-rslib/pecos_rslib.pyi b/python/pecos-rslib/pecos_rslib.pyi
index 3ba47ecec..2cbaad411 100644
--- a/python/pecos-rslib/pecos_rslib.pyi
+++ b/python/pecos-rslib/pecos_rslib.pyi
@@ -819,16 +819,6 @@ class StateVec:
     def probability(self, basis_state: int) -> float: ...
     def vector_big_endian(self) -> Array: ...
 
-class Qulacs:
-    """Rust Qulacs state vector simulator."""
-
-    def __init__(self, num_qubits: int, *, seed: int | None = None) -> None: ...
-    def reset(self) -> Qulacs: ...
-    @property
-    def num_qubits(self) -> int: ...
-    @property
-    def probabilities(self) -> list[float]: ...
-
 class SparseStab:
     """Rust sparse stabilizer simulator."""
 
@@ -865,22 +855,6 @@ class CoinToss:
     @property
     def num_qubits(self) -> int: ...
 
-class QuestStateVec:
-    """QuEST state vector simulator."""
-
-    def __init__(self, num_qubits: int) -> None: ...
-    def reset(self) -> QuestStateVec: ...
-    @property
-    def num_qubits(self) -> int: ...
-
-class QuestDensityMatrix:
-    """QuEST density matrix simulator."""
-
-    def __init__(self, num_qubits: int) -> None: ...
-    def reset(self) -> QuestDensityMatrix: ...
-    @property
-    def num_qubits(self) -> int: ...
-
 # =============================================================================
 # Engine Types
 # =============================================================================
diff --git a/python/pecos-rslib/src/lib.rs b/python/pecos-rslib/src/lib.rs
index b2f8a8054..6239baa96 100644
--- a/python/pecos-rslib/src/lib.rs
+++ b/python/pecos-rslib/src/lib.rs
@@ -50,8 +50,6 @@ mod phir_json_bridge;
 mod programs_module;
 mod py_foreign_decoder;
 mod py_foreign_simulator;
-mod quest_bindings;
-mod qulacs_bindings;
 mod shot_results_bindings;
 mod sim;
 mod simulator_utils;
@@ -81,8 +79,6 @@ use pauli_prop_bindings::PyPauliProp;
 use pecos_array::Array;
 use pecos_random_bindings::RngPcg;
 use pyo3::prelude::*;
-use quest_bindings::{QuestDensityMatrix, QuestStateVec};
-use qulacs_bindings::PyQulacs;
 use sparse_stab_bindings::PySparseStab;
 use sparse_stab_engine_bindings::PySparseStabEngine;
 use stab_bindings::PyStabilizer;
@@ -243,7 +239,6 @@ fn pecos_rslib(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
         m
     )?)?;
     m.add_class::<PyStateVec>()?;
-    m.add_class::<PyQulacs>()?;
     m.add_class::<PyCoinToss>()?;
     m.add_class::<PyPauliProp>()?;
     m.add_class::<PyByteMessage>()?;
@@ -253,8 +248,6 @@ fn pecos_rslib(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyStateVecEngine>()?;
     m.add_class::<PySparseStabEngine>()?;
     m.add_class::<RngPcg>()?;
-    m.add_class::<QuestStateVec>()?;
-    m.add_class::<QuestDensityMatrix>()?;
     m.add_class::<Array>()?;
     m.add_class::<PyBitInt>()?;
     m.add_class::<PyBitUInt>()?;
diff --git a/python/pecos-rslib/src/prelude.rs b/python/pecos-rslib/src/prelude.rs
index 69769d7e1..fadea427a 100644
--- a/python/pecos-rslib/src/prelude.rs
+++ b/python/pecos-rslib/src/prelude.rs
@@ -30,10 +30,6 @@ pub use pecos_hugr_qis::prelude::*;
 // PHIR-JSON format
 pub use pecos_phir_json::prelude::*;
 
-// C++ simulator backends
-pub use pecos_quest::{QuestDensityMatrix, QuestStateVec};
-pub use pecos_qulacs::QulacsStateVec;
-
 // WASM types (feature-gated)
 #[cfg(feature = "wasm")]
 pub use pecos_wasm::ForeignObject;
diff --git a/python/pecos-rslib/src/quest_bindings.rs b/python/pecos-rslib/src/quest_bindings.rs
deleted file mode 100644
index f94f2a167..000000000
--- a/python/pecos-rslib/src/quest_bindings.rs
+++ /dev/null
@@ -1,830 +0,0 @@
-// Copyright 2024 The PECOS Developers
-//
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-// in compliance with the License.You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software distributed under the License
-// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-// or implied. See the License for the specific language governing permissions and limitations under
-// the License.
-
-use crate::dtypes::AngleParam;
-use crate::prelude::*;
-use pyo3::prelude::*;
-use pyo3::types::{PyDict, PyTuple};
-
-// Import the Rust types with renamed aliases to distinguish from Python wrapper types
-// These are re-exported by pecos::prelude when the quest feature is enabled
-use crate::prelude::{
-    QuestDensityMatrix as RustQuestDensityMatrix, QuestStateVec as RustQuestStateVec,
-};
-
-/// The struct represents the `QuEST` state-vector simulator exposed to Python
-#[pyclass]
-pub struct QuestStateVec {
-    inner: RustQuestStateVec,
-}
-
-#[pymethods]
-impl QuestStateVec {
-    /// Creates a new `QuEST` state-vector simulator with the specified number of qubits
-    ///
-    /// # Arguments
-    /// * `num_qubits` - Number of qubits in the system
-    /// * `seed` - Optional seed for the random number generator
-    #[new]
-    #[pyo3(signature = (num_qubits, seed=None))]
-    pub fn new(num_qubits: usize, seed: Option<u64>) -> Self {
-        QuestStateVec {
-            inner: match seed {
-                Some(s) => RustQuestStateVec::with_seed(num_qubits, s),
-                None => RustQuestStateVec::new(num_qubits),
-            },
-        }
-    }
-
-    /// Returns the number of qubits in the simulator
-    fn num_qubits(&self) -> usize {
-        self.inner.num_qubits()
-    }
-
-    /// Resets the quantum state to the all-zero state
-    fn reset(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
-        slf.inner.reset();
-        slf
-    }
-
-    /// Prepares a computational basis state
-    fn prepare_computational_basis(&mut self, index: usize) {
-        self.inner.prepare_computational_basis(index);
-    }
-
-    /// Gets the probability of a computational basis state
-    fn probability(&self, index: usize) -> f64 {
-        self.inner.probability(index)
-    }
-
-    /// Gets the amplitude of a computational basis state as a complex number
-    fn get_amplitude(&self, index: usize) -> (f64, f64) {
-        let amp = self.inner.get_amplitude(index);
-        (amp.re, amp.im)
-    }
-
-    /// Executes a single-qubit gate based on the provided symbol and location
-    ///
-    /// `symbol`: The gate symbol (e.g., "X", "H", "Z", "RX", "RY", "RZ")
-    /// `location`: The qubit index to apply the gate to
-    /// `params`: Optional parameters for parameterized gates
-    ///
-    /// Returns an optional result, usually `None` unless a measurement is performed
-    #[allow(clippy::too_many_lines)]
-    #[pyo3(signature = (symbol, location, params=None))]
-    fn run_1q_gate(
-        &mut self,
-        symbol: &str,
-        location: usize,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<Option<u8>> {
-        match symbol {
-            "X" => {
-                self.inner.x(&[QubitId(location)]);
-                Ok(None)
-            }
-            "Y" => {
-                self.inner.y(&[QubitId(location)]);
-                Ok(None)
-            }
-            "Z" => {
-                self.inner.z(&[QubitId(location)]);
-                Ok(None)
-            }
-            "H" => {
-                self.inner.h(&[QubitId(location)]);
-                Ok(None)
-            }
-            // Note: S and S† gates are not implemented in QuEST wrapper yet
-            "RX" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.rx(angle.0, &[QubitId(location)]);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RX gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RX gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                }
-                Ok(None)
-            }
-            "RY" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.ry(angle.0, &[QubitId(location)]);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RY gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RY gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                }
-                Ok(None)
-            }
-            "RZ" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.rz(angle.0, &[QubitId(location)]);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RZ gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RZ gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                }
-                Ok(None)
-            }
-            "MZ" => {
-                let results = self.inner.mz(&[QubitId(location)]);
-                Ok(Some(u8::from(results[0].outcome)))
-            }
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Unknown single-qubit gate: {symbol}"
-            ))),
-        }
-    }
-
-    /// Executes a two-qubit gate based on the provided symbol and locations
-    ///
-    /// `symbol`: The gate symbol (e.g., "CX", "CY", "CZ", "RXX", "RYY", "RZZ")
-    /// `locations`: Tuple of (control, target) qubit indices
-    /// `params`: Optional parameters for parameterized gates
-    #[pyo3(signature = (symbol, locations, params=None))]
-    fn run_2q_gate(
-        &mut self,
-        symbol: &str,
-        locations: &Bound<'_, PyTuple>,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<()> {
-        if locations.len() != 2 {
-            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "Two-qubit gate requires exactly 2 qubit indices",
-            ));
-        }
-
-        let control = locations.get_item(0)?.extract::<usize>()?;
-        let target = locations.get_item(1)?.extract::<usize>()?;
-
-        match symbol {
-            "CX" | "CNOT" => {
-                self.inner.cx(&[(QubitId(control), QubitId(target))]);
-                Ok(())
-            }
-            "CY" => {
-                self.inner.cy(&[(QubitId(control), QubitId(target))]);
-                Ok(())
-            }
-            "CZ" => {
-                self.inner.cz(&[(QubitId(control), QubitId(target))]);
-                Ok(())
-            }
-            "RXX" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner
-                                    .rxx(angle.0, &[(QubitId(control), QubitId(target))]);
-                                Ok(())
-                            } else {
-                                Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RXX gate",
-                                ))
-                            }
-                        }
-                        Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                            "Angle parameter missing for RXX gate",
-                        )),
-                        Err(err) => Err(err),
-                    }
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "RXX gate requires angle parameter",
-                    ))
-                }
-            }
-            "RYY" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner
-                                    .ryy(angle.0, &[(QubitId(control), QubitId(target))]);
-                                Ok(())
-                            } else {
-                                Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RYY gate",
-                                ))
-                            }
-                        }
-                        Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                            "Angle parameter missing for RYY gate",
-                        )),
-                        Err(err) => Err(err),
-                    }
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "RYY gate requires angle parameter",
-                    ))
-                }
-            }
-            "RZZ" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner
-                                    .rzz(angle.0, &[(QubitId(control), QubitId(target))]);
-                                Ok(())
-                            } else {
-                                Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RZZ gate",
-                                ))
-                            }
-                        }
-                        Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                            "Angle parameter missing for RZZ gate",
-                        )),
-                        Err(err) => Err(err),
-                    }
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "RZZ gate requires angle parameter",
-                    ))
-                }
-            }
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Unknown two-qubit gate: {symbol}"
-            ))),
-        }
-    }
-
-    /// Applies a T gate to the specified qubit
-    fn t_gate(&mut self, location: usize) {
-        self.inner.t(&[QubitId(location)]);
-    }
-
-    /// Applies a T-dagger gate to the specified qubit
-    fn tdg_gate(&mut self, location: usize) {
-        self.inner.tdg(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of XX gate to two qubits
-    fn sxx_gate(&mut self, control: usize, target: usize) {
-        self.inner.sxx(&[(QubitId(control), QubitId(target))]);
-    }
-
-    /// Applies a square root of YY gate to two qubits
-    fn syy_gate(&mut self, control: usize, target: usize) {
-        self.inner.syy(&[(QubitId(control), QubitId(target))]);
-    }
-
-    /// Applies a square root of ZZ gate to two qubits
-    fn szz_gate(&mut self, control: usize, target: usize) {
-        self.inner.szz(&[(QubitId(control), QubitId(target))]);
-    }
-    /// Applies an R1XY gate to the specified qubit
-    fn r1xy_gate(&mut self, theta: AngleParam, phi: AngleParam, location: usize) {
-        self.inner.r1xy(theta.0, phi.0, &[QubitId(location)]);
-    }
-
-    /// Applies RXXRYYRZZ gate (combination of RXX, RYY, RZZ) to two qubits
-    /// NOTE: This uses the trait implementation which may differ from `StateVec`'s decomposition
-    /// For consistency with `StateVec` tests, the Python bindings use manual decompositions
-    fn rxxryyrzz_gate(
-        &mut self,
-        theta: AngleParam,
-        phi: AngleParam,
-        lambda: AngleParam,
-        q1: usize,
-        q2: usize,
-    ) {
-        // Use the trait implementation directly
-        // Note: The trait's rxxryyrzz has a different decomposition than StateVec's
-        // which is why Python bindings use manual decompositions for RXX, RYY, RZZ
-        self.inner
-            .rxxryyrzz(theta.0, phi.0, lambda.0, &[(QubitId(q1), QubitId(q2))]);
-    }
-
-    /// Applies a SWAP gate to two qubits
-    fn swap_gate(&mut self, control: usize, target: usize) {
-        self.inner.swap(&[(QubitId(control), QubitId(target))]);
-    }
-
-    /// Applies H2 gate variant
-    fn h2_gate(&mut self, location: usize) {
-        self.inner.h2(&[QubitId(location)]);
-    }
-
-    /// Applies H3 gate variant
-    fn h3_gate(&mut self, location: usize) {
-        self.inner.h3(&[QubitId(location)]);
-    }
-
-    /// Applies H4 gate variant
-    fn h4_gate(&mut self, location: usize) {
-        self.inner.h4(&[QubitId(location)]);
-    }
-
-    /// Applies H5 gate variant
-    fn h5_gate(&mut self, location: usize) {
-        self.inner.h5(&[QubitId(location)]);
-    }
-
-    /// Applies H6 gate variant
-    fn h6_gate(&mut self, location: usize) {
-        self.inner.h6(&[QubitId(location)]);
-    }
-
-    /// Measures in the X basis
-    fn mx_gate(&mut self, location: usize) -> u8 {
-        let results = self.inner.mx(&[QubitId(location)]);
-        u8::from(results[0].outcome)
-    }
-
-    /// Measures in the Y basis
-    fn my_gate(&mut self, location: usize) -> u8 {
-        let results = self.inner.my(&[QubitId(location)]);
-        u8::from(results[0].outcome)
-    }
-
-    /// Applies a square root of X gate to the specified qubit
-    fn sx_gate(&mut self, location: usize) {
-        self.inner.sx(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of X-dagger gate to the specified qubit
-    fn sxdg_gate(&mut self, location: usize) {
-        self.inner.sxdg(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Y gate to the specified qubit
-    fn sy_gate(&mut self, location: usize) {
-        self.inner.sy(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Y-dagger gate to the specified qubit
-    fn sydg_gate(&mut self, location: usize) {
-        self.inner.sydg(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Z gate to the specified qubit
-    fn sz_gate(&mut self, location: usize) {
-        self.inner.sz(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Z-dagger gate to the specified qubit
-    fn szdg_gate(&mut self, location: usize) {
-        self.inner.szdg(&[QubitId(location)]);
-    }
-
-    /// String representation of the simulator
-    fn __repr__(&self) -> String {
-        format!("QuestStateVec(num_qubits={})", self.inner.num_qubits())
-    }
-}
-
-/// The struct represents the `QuEST` density matrix simulator exposed to Python
-#[pyclass]
-pub struct QuestDensityMatrix {
-    inner: RustQuestDensityMatrix,
-}
-
-#[pymethods]
-impl QuestDensityMatrix {
-    /// Creates a new `QuEST` density matrix simulator with the specified number of qubits
-    ///
-    /// # Arguments
-    /// * `num_qubits` - Number of qubits in the system
-    /// * `seed` - Optional seed for the random number generator
-    #[new]
-    #[pyo3(signature = (num_qubits, seed=None))]
-    pub fn new(num_qubits: usize, seed: Option<u64>) -> Self {
-        QuestDensityMatrix {
-            inner: match seed {
-                Some(s) => RustQuestDensityMatrix::with_seed(num_qubits, s),
-                None => RustQuestDensityMatrix::new(num_qubits),
-            },
-        }
-    }
-
-    /// Returns the number of qubits in the simulator
-    fn num_qubits(&self) -> usize {
-        self.inner.num_qubits()
-    }
-
-    /// Resets the quantum state to the all-zero state
-    fn reset(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
-        slf.inner.reset();
-        slf
-    }
-
-    /// Prepares a computational basis state
-    fn prepare_computational_basis(&mut self, index: usize) {
-        self.inner.prepare_computational_basis(index);
-    }
-
-    /// Gets the probability of a computational basis state
-    fn probability(&self, index: usize) -> f64 {
-        self.inner.probability(index)
-    }
-
-    // Note: calculate_purity is not exposed in QuEST wrapper yet
-
-    /// Executes a single-qubit gate based on the provided symbol and location
-    ///
-    /// `symbol`: The gate symbol (e.g., "X", "H", "Z", "RX", "RY", "RZ")
-    /// `location`: The qubit index to apply the gate to
-    /// `params`: Optional parameters for parameterized gates
-    ///
-    /// Returns an optional result, usually `None` unless a measurement is performed
-    #[allow(clippy::too_many_lines)]
-    #[pyo3(signature = (symbol, location, params=None))]
-    fn run_1q_gate(
-        &mut self,
-        symbol: &str,
-        location: usize,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<Option<u8>> {
-        match symbol {
-            "X" => {
-                self.inner.x(&[QubitId(location)]);
-                Ok(None)
-            }
-            "Y" => {
-                self.inner.y(&[QubitId(location)]);
-                Ok(None)
-            }
-            "Z" => {
-                self.inner.z(&[QubitId(location)]);
-                Ok(None)
-            }
-            "H" => {
-                self.inner.h(&[QubitId(location)]);
-                Ok(None)
-            }
-            // Note: S and S† gates are not implemented in QuEST wrapper yet
-            "RX" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.rx(angle.0, &[QubitId(location)]);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RX gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RX gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                }
-                Ok(None)
-            }
-            "RY" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.ry(angle.0, &[QubitId(location)]);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RY gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RY gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                }
-                Ok(None)
-            }
-            "RZ" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.rz(angle.0, &[QubitId(location)]);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RZ gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RZ gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                }
-                Ok(None)
-            }
-            "MZ" => {
-                let results = self.inner.mz(&[QubitId(location)]);
-                Ok(Some(u8::from(results[0].outcome)))
-            }
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Unknown single-qubit gate: {symbol}"
-            ))),
-        }
-    }
-
-    /// Executes a two-qubit gate based on the provided symbol and locations
-    ///
-    /// `symbol`: The gate symbol (e.g., "CX", "CY", "CZ", "RXX", "RYY", "RZZ")
-    /// `locations`: Tuple of (control, target) qubit indices
-    /// `params`: Optional parameters for parameterized gates
-    #[pyo3(signature = (symbol, locations, params=None))]
-    fn run_2q_gate(
-        &mut self,
-        symbol: &str,
-        locations: &Bound<'_, PyTuple>,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<()> {
-        if locations.len() != 2 {
-            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "Two-qubit gate requires exactly 2 qubit indices",
-            ));
-        }
-
-        let control = locations.get_item(0)?.extract::<usize>()?;
-        let target = locations.get_item(1)?.extract::<usize>()?;
-
-        match symbol {
-            "CX" | "CNOT" => {
-                self.inner.cx(&[(QubitId(control), QubitId(target))]);
-                Ok(())
-            }
-            "CY" => {
-                self.inner.cy(&[(QubitId(control), QubitId(target))]);
-                Ok(())
-            }
-            "CZ" => {
-                self.inner.cz(&[(QubitId(control), QubitId(target))]);
-                Ok(())
-            }
-            "RXX" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner
-                                    .rxx(angle.0, &[(QubitId(control), QubitId(target))]);
-                                Ok(())
-                            } else {
-                                Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RXX gate",
-                                ))
-                            }
-                        }
-                        Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                            "Angle parameter missing for RXX gate",
-                        )),
-                        Err(err) => Err(err),
-                    }
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "RXX gate requires angle parameter",
-                    ))
-                }
-            }
-            "RYY" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner
-                                    .ryy(angle.0, &[(QubitId(control), QubitId(target))]);
-                                Ok(())
-                            } else {
-                                Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RYY gate",
-                                ))
-                            }
-                        }
-                        Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                            "Angle parameter missing for RYY gate",
-                        )),
-                        Err(err) => Err(err),
-                    }
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "RYY gate requires angle parameter",
-                    ))
-                }
-            }
-            "RZZ" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner
-                                    .rzz(angle.0, &[(QubitId(control), QubitId(target))]);
-                                Ok(())
-                            } else {
-                                Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RZZ gate",
-                                ))
-                            }
-                        }
-                        Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                            "Angle parameter missing for RZZ gate",
-                        )),
-                        Err(err) => Err(err),
-                    }
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "RZZ gate requires angle parameter",
-                    ))
-                }
-            }
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Unknown two-qubit gate: {symbol}"
-            ))),
-        }
-    }
-
-    /// Applies a T gate to the specified qubit
-    fn t_gate(&mut self, location: usize) {
-        self.inner.t(&[QubitId(location)]);
-    }
-
-    /// Applies a T-dagger gate to the specified qubit
-    fn tdg_gate(&mut self, location: usize) {
-        self.inner.tdg(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of XX gate to two qubits
-    fn sxx_gate(&mut self, control: usize, target: usize) {
-        self.inner.sxx(&[(QubitId(control), QubitId(target))]);
-    }
-
-    /// Applies a square root of YY gate to two qubits
-    fn syy_gate(&mut self, control: usize, target: usize) {
-        self.inner.syy(&[(QubitId(control), QubitId(target))]);
-    }
-
-    /// Applies a square root of ZZ gate to two qubits
-    fn szz_gate(&mut self, control: usize, target: usize) {
-        self.inner.szz(&[(QubitId(control), QubitId(target))]);
-    }
-    /// Applies an R1XY gate to the specified qubit
-    fn r1xy_gate(&mut self, theta: AngleParam, phi: AngleParam, location: usize) {
-        self.inner.r1xy(theta.0, phi.0, &[QubitId(location)]);
-    }
-
-    /// Applies RXXRYYRZZ gate (combination of RXX, RYY, RZZ) to two qubits
-    /// NOTE: This uses the trait implementation which may differ from `StateVec`'s decomposition
-    /// For consistency with `StateVec` tests, the Python bindings use manual decompositions
-    fn rxxryyrzz_gate(
-        &mut self,
-        theta: AngleParam,
-        phi: AngleParam,
-        lambda: AngleParam,
-        q1: usize,
-        q2: usize,
-    ) {
-        // Use the trait implementation directly
-        // Note: The trait's rxxryyrzz has a different decomposition than StateVec's
-        // which is why Python bindings use manual decompositions for RXX, RYY, RZZ
-        self.inner
-            .rxxryyrzz(theta.0, phi.0, lambda.0, &[(QubitId(q1), QubitId(q2))]);
-    }
-
-    /// Applies a SWAP gate to two qubits
-    fn swap_gate(&mut self, control: usize, target: usize) {
-        self.inner.swap(&[(QubitId(control), QubitId(target))]);
-    }
-
-    /// Applies H2 gate variant
-    fn h2_gate(&mut self, location: usize) {
-        self.inner.h2(&[QubitId(location)]);
-    }
-
-    /// Applies H3 gate variant
-    fn h3_gate(&mut self, location: usize) {
-        self.inner.h3(&[QubitId(location)]);
-    }
-
-    /// Applies H4 gate variant
-    fn h4_gate(&mut self, location: usize) {
-        self.inner.h4(&[QubitId(location)]);
-    }
-
-    /// Applies H5 gate variant
-    fn h5_gate(&mut self, location: usize) {
-        self.inner.h5(&[QubitId(location)]);
-    }
-
-    /// Applies H6 gate variant
-    fn h6_gate(&mut self, location: usize) {
-        self.inner.h6(&[QubitId(location)]);
-    }
-
-    /// Measures in the X basis
-    fn mx_gate(&mut self, location: usize) -> u8 {
-        let results = self.inner.mx(&[QubitId(location)]);
-        u8::from(results[0].outcome)
-    }
-
-    /// Measures in the Y basis
-    fn my_gate(&mut self, location: usize) -> u8 {
-        let results = self.inner.my(&[QubitId(location)]);
-        u8::from(results[0].outcome)
-    }
-
-    /// Applies a square root of X gate to the specified qubit
-    fn sx_gate(&mut self, location: usize) {
-        self.inner.sx(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of X-dagger gate to the specified qubit
-    fn sxdg_gate(&mut self, location: usize) {
-        self.inner.sxdg(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Y gate to the specified qubit
-    fn sy_gate(&mut self, location: usize) {
-        self.inner.sy(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Y-dagger gate to the specified qubit
-    fn sydg_gate(&mut self, location: usize) {
-        self.inner.sydg(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Z gate to the specified qubit
-    fn sz_gate(&mut self, location: usize) {
-        self.inner.sz(&[QubitId(location)]);
-    }
-
-    /// Applies a square root of Z-dagger gate to the specified qubit
-    fn szdg_gate(&mut self, location: usize) {
-        self.inner.szdg(&[QubitId(location)]);
-    }
-
-    /// String representation of the simulator
-    fn __repr__(&self) -> String {
-        format!("QuestDensityMatrix(num_qubits={})", self.inner.num_qubits())
-    }
-}
diff --git a/python/pecos-rslib/src/qulacs_bindings.rs b/python/pecos-rslib/src/qulacs_bindings.rs
deleted file mode 100644
index f0b340e18..000000000
--- a/python/pecos-rslib/src/qulacs_bindings.rs
+++ /dev/null
@@ -1,612 +0,0 @@
-// Copyright 2025 The PECOS Developers
-use crate::dtypes::AngleParam;
-use crate::prelude::*;
-//
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-// in compliance with the License.You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software distributed under the License
-// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-// or implied. See the License for the specific language governing permissions and limitations under
-// the License.
-
-use pyo3::prelude::*;
-use pyo3::types::{PyDict, PyTuple};
-
-/// The struct represents the Qulacs state-vector simulator exposed to Python
-#[pyclass(name = "Qulacs")]
-pub struct PyQulacs {
-    inner: QulacsStateVec,
-}
-
-impl PyQulacs {
-    /// Handle simple two-qubit gates that don't require parameters
-    fn handle_simple_2q_gate(
-        &mut self,
-        symbol: &str,
-        q1: usize,
-        q2: usize,
-    ) -> PyResult<Option<u8>> {
-        let pair = &[(QubitId(q1), QubitId(q2))];
-        match symbol {
-            "CX" => {
-                self.inner.cx(pair);
-            }
-            "CY" => {
-                self.inner.cy(pair);
-            }
-            "CZ" => {
-                self.inner.cz(pair);
-            }
-            "SWAP" => {
-                self.inner.swap(pair);
-            }
-            "G" | "G2" => {
-                self.inner.g(pair);
-            }
-            "SXX" => {
-                self.inner.rxx(Angle64::QUARTER_TURN, pair);
-            }
-            "SXXdg" => {
-                self.inner.rxx(-Angle64::QUARTER_TURN, pair);
-            }
-            "SYY" => {
-                self.inner.ryy(Angle64::QUARTER_TURN, pair);
-            }
-            "SYYdg" => {
-                self.inner.ryy(-Angle64::QUARTER_TURN, pair);
-            }
-            "SZZ" | "SqrtZZ" => {
-                self.inner.rzz(Angle64::QUARTER_TURN, pair);
-            }
-            "SZZdg" => {
-                self.inner.rzz(-Angle64::QUARTER_TURN, pair);
-            }
-            _ => {
-                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                    "Unknown simple two-qubit gate",
-                ));
-            }
-        }
-        Ok(None)
-    }
-
-    /// Helper method to extract angle parameter from dict
-    fn extract_angle_param(params: &Bound<'_, PyDict>, gate_name: &str) -> PyResult<Angle64> {
-        match params.get_item("angle") {
-            Ok(Some(py_any)) => py_any.extract::<AngleParam>().map(|a| a.0).map_err(|_| {
-                PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                    "Expected a valid angle parameter for {gate_name} gate"
-                ))
-            }),
-            Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Angle parameter missing for {gate_name} gate"
-            ))),
-            Err(err) => Err(err),
-        }
-    }
-
-    /// Helper method to extract angles parameter from dict
-    fn extract_angles_param(
-        params: &Bound<'_, PyDict>,
-        gate_name: &str,
-        expected_count: usize,
-    ) -> PyResult<Vec<Angle64>> {
-        match params.get_item("angles") {
-            Ok(Some(py_any)) => {
-                let angles = py_any.extract::<Vec<AngleParam>>().map_err(|_| {
-                    PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                        "Expected valid angles parameter for {gate_name} gate"
-                    ))
-                })?;
-                if angles.len() == expected_count {
-                    Ok(angles.into_iter().map(|a| a.0).collect())
-                } else {
-                    Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                        "{gate_name} requires exactly {expected_count} angles"
-                    )))
-                }
-            }
-            Ok(None) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Angles parameter missing for {gate_name} gate"
-            ))),
-            Err(err) => Err(err),
-        }
-    }
-}
-
-#[pymethods]
-impl PyQulacs {
-    /// Creates a new Qulacs state-vector simulator with the specified number of qubits
-    ///
-    /// # Arguments
-    /// * `num_qubits` - Number of qubits in the system
-    /// * `seed` - Optional seed for the random number generator
-    #[new]
-    #[pyo3(signature = (num_qubits, seed=None))]
-    pub fn new(num_qubits: usize, seed: Option<u64>) -> Self {
-        PyQulacs {
-            inner: match seed {
-                Some(s) => QulacsStateVec::with_seed(num_qubits, s),
-                None => QulacsStateVec::new(num_qubits),
-            },
-        }
-    }
-
-    /// Resets the quantum state to the all-zero state
-    fn reset(mut slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> {
-        slf.inner.reset();
-        slf
-    }
-
-    /// Executes a single-qubit gate based on the provided symbol and location
-    ///
-    /// `symbol`: The gate symbol (e.g., "X", "H", "Z")
-    /// `location`: The qubit index to apply the gate to
-    /// `params`: Optional parameters for parameterized gates
-    ///
-    /// Returns an optional result, usually `None` unless a measurement is performed
-    #[allow(clippy::too_many_lines)]
-    #[pyo3(signature = (symbol, location, params=None))]
-    fn run_1q_gate(
-        &mut self,
-        symbol: &str,
-        location: usize,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<Option<u8>> {
-        // Check bounds
-        if location >= self.inner.num_qubits() {
-            return Err(PyErr::new::<pyo3::exceptions::PyIndexError, _>(format!(
-                "Qubit index {} out of range for {} qubits",
-                location,
-                self.inner.num_qubits()
-            )));
-        }
-
-        let q = &[QubitId(location)];
-        match symbol {
-            "X" => {
-                self.inner.x(q);
-                Ok(None)
-            }
-            "Y" => {
-                self.inner.y(q);
-                Ok(None)
-            }
-            "Z" => {
-                self.inner.z(q);
-                Ok(None)
-            }
-            "H" => {
-                self.inner.h(q);
-                Ok(None)
-            }
-            "SX" => {
-                self.inner.sx(q);
-                Ok(None)
-            }
-            "SXdg" => {
-                self.inner.sxdg(q);
-                Ok(None)
-            }
-            "SY" => {
-                self.inner.sy(q);
-                Ok(None)
-            }
-            "SYdg" => {
-                self.inner.sydg(q);
-                Ok(None)
-            }
-            "SZ" => {
-                self.inner.sz(q);
-                Ok(None)
-            }
-            "SZdg" => {
-                self.inner.szdg(q);
-                Ok(None)
-            }
-            "F" | "F1" => {
-                // F gate is implemented via CliffordGateable trait
-                self.inner.f(q);
-                Ok(None)
-            }
-            "Fdg" | "F1dg" => {
-                // F dagger is implemented via CliffordGateable trait
-                self.inner.fdg(q);
-                Ok(None)
-            }
-            "T" => {
-                self.inner.t(q);
-                Ok(None)
-            }
-            "Tdg" => {
-                self.inner.tdg(q);
-                Ok(None)
-            }
-            "RX" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.rx(angle.0, q);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RX gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RX gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                } else {
-                    return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angle parameter required for RX gate",
-                    ));
-                }
-                Ok(None)
-            }
-            "RY" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.ry(angle.0, q);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RY gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RY gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                } else {
-                    return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angle parameter required for RY gate",
-                    ));
-                }
-                Ok(None)
-            }
-            "RZ" => {
-                if let Some(params) = params {
-                    match params.get_item("angle") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angle) = py_any.extract::<AngleParam>() {
-                                self.inner.rz(angle.0, q);
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a valid angle parameter for RZ gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angle parameter missing for RZ gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                } else {
-                    return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angle parameter required for RZ gate",
-                    ));
-                }
-                Ok(None)
-            }
-            "R1XY" => {
-                if let Some(params) = params {
-                    match params.get_item("angles") {
-                        Ok(Some(py_any)) => {
-                            if let Ok(angles) = py_any.extract::<Vec<AngleParam>>() {
-                                if angles.len() >= 2 {
-                                    // R1XY = RZ(phi-pi/2) * RY(theta) * RZ(-phi+pi/2)
-                                    // where theta = angles[0], phi = angles[1]
-                                    let theta = angles[0].0;
-                                    let phi = angles[1].0;
-                                    let pi_half = Angle64::QUARTER_TURN;
-
-                                    self.inner.rz(-phi + pi_half, q);
-                                    self.inner.ry(theta, q);
-                                    self.inner.rz(phi - pi_half, q);
-                                } else {
-                                    return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                        "R1XY requires at least 2 angles",
-                                    ));
-                                }
-                            } else {
-                                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                    "Expected a list of angles for R1XY gate",
-                                ));
-                            }
-                        }
-                        Ok(None) => {
-                            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                                "Angles parameter missing for R1XY gate",
-                            ));
-                        }
-                        Err(err) => {
-                            return Err(err);
-                        }
-                    }
-                } else {
-                    return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angles parameter required for R1XY gate",
-                    ));
-                }
-                Ok(None)
-            }
-            "H2" => {
-                // H2 is implemented via CliffordGateable trait
-                self.inner.h2(q);
-                Ok(None)
-            }
-            "H3" => {
-                // H3 is implemented via CliffordGateable trait
-                self.inner.h3(q);
-                Ok(None)
-            }
-            "H4" => {
-                // H4 is implemented via CliffordGateable trait
-                self.inner.h4(q);
-                Ok(None)
-            }
-            "H5" => {
-                // H5 is implemented via CliffordGateable trait
-                self.inner.h5(q);
-                Ok(None)
-            }
-            "H6" => {
-                // H6 is implemented via CliffordGateable trait
-                self.inner.h6(q);
-                Ok(None)
-            }
-            "F2" => {
-                // F2 is implemented via CliffordGateable trait
-                self.inner.f2(q);
-                Ok(None)
-            }
-            "F2dg" | "F2d" => {
-                // F2dg is implemented via CliffordGateable trait
-                self.inner.f2dg(q);
-                Ok(None)
-            }
-            "F3" => {
-                // F3 is implemented via CliffordGateable trait
-                self.inner.f3(q);
-                Ok(None)
-            }
-            "F3dg" | "F3d" => {
-                // F3dg is implemented via CliffordGateable trait
-                self.inner.f3dg(q);
-                Ok(None)
-            }
-            "F4" => {
-                // F4 is implemented via CliffordGateable trait
-                self.inner.f4(q);
-                Ok(None)
-            }
-            "F4dg" | "F4d" => {
-                // F4dg is implemented via CliffordGateable trait
-                self.inner.f4dg(q);
-                Ok(None)
-            }
-            "MZ" => {
-                let results = self.inner.mz(q);
-                Ok(Some(u8::from(results[0].outcome)))
-            }
-            "MX" => {
-                let results = self.inner.mx(q);
-                Ok(Some(u8::from(results[0].outcome)))
-            }
-            "MY" => {
-                let results = self.inner.my(q);
-                Ok(Some(u8::from(results[0].outcome)))
-            }
-            "PZ" => {
-                // Project to |0⟩ state using CliffordGateable trait
-                self.inner.pz(q);
-                Ok(None)
-            }
-            "PnZ" => {
-                // Project to |1⟩ state using CliffordGateable trait
-                self.inner.pnz(q);
-                Ok(None)
-            }
-            "PX" => {
-                // Project to |+⟩ state
-                self.inner.prepare_computational_basis(0);
-                self.inner.h(q);
-                Ok(None)
-            }
-            "PnX" => {
-                // Project to |-⟩ state
-                self.inner.prepare_computational_basis(1 << location);
-                self.inner.h(q);
-                Ok(None)
-            }
-            "PY" => {
-                // Project to |+i⟩ state
-                self.inner.prepare_computational_basis(0);
-                self.inner.h(q);
-                self.inner.sz(q);
-                Ok(None)
-            }
-            "PnY" => {
-                // Project to |-i⟩ state
-                self.inner.prepare_computational_basis(0);
-                self.inner.h(q);
-                self.inner.szdg(q);
-                Ok(None)
-            }
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "Unsupported single-qubit gate",
-            )),
-        }
-    }
-
-    /// Executes a two-qubit gate based on the provided symbol and locations
-    ///
-    /// `symbol`: The gate symbol (e.g., "CX", "CZ")
-    /// `location`: A tuple specifying the two qubits to apply the gate to
-    /// `params`: Optional parameters for parameterized gates
-    ///
-    /// Returns an optional result, usually `None` unless a measurement is performed
-    #[pyo3(signature = (symbol, location, params))]
-    fn run_2q_gate(
-        &mut self,
-        symbol: &str,
-        location: &Bound<'_, PyTuple>,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<Option<u8>> {
-        if location.len() != 2 {
-            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "Two-qubit gate requires exactly 2 qubit locations",
-            ));
-        }
-
-        let q1: usize = location.get_item(0)?.extract()?;
-        let q2: usize = location.get_item(1)?.extract()?;
-
-        // Check bounds
-        let num_qubits = self.inner.num_qubits();
-        if q1 >= num_qubits || q2 >= num_qubits {
-            return Err(PyErr::new::<pyo3::exceptions::PyIndexError, _>(format!(
-                "Qubit indices ({q1}, {q2}) out of range for {num_qubits} qubits"
-            )));
-        }
-
-        let pair = &[(QubitId(q1), QubitId(q2))];
-        match symbol {
-            "CX" | "CY" | "CZ" | "SWAP" | "G" | "SXX" | "SXXdg" | "SYY" | "SYYdg" | "SZZ"
-            | "SqrtZZ" | "SZZdg" | "G2" => self.handle_simple_2q_gate(symbol, q1, q2),
-            "RZZ" => {
-                let params = params.ok_or_else(|| {
-                    PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angle parameter required for RZZ gate",
-                    )
-                })?;
-                let angle = Self::extract_angle_param(params, "RZZ")?;
-                self.inner.rzz(angle, pair);
-                Ok(None)
-            }
-            "RXX" => {
-                let params = params.ok_or_else(|| {
-                    PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angle parameter required for RXX gate",
-                    )
-                })?;
-                let angle = Self::extract_angle_param(params, "RXX")?;
-                self.inner.rxx(angle, pair);
-                Ok(None)
-            }
-            "RYY" => {
-                let params = params.ok_or_else(|| {
-                    PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angle parameter required for RYY gate",
-                    )
-                })?;
-                let angle = Self::extract_angle_param(params, "RYY")?;
-                self.inner.ryy(angle, pair);
-                Ok(None)
-            }
-            "RXXRYYRZZ" | "RZZRYYRXX" => {
-                let params = params.ok_or_else(|| {
-                    PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                        "Angles parameter required for RXXRYYRZZ gate",
-                    )
-                })?;
-                let angles = Self::extract_angles_param(params, "RXXRYYRZZ", 3)?;
-                // Use the rxxryyrzz method from ArbitraryRotationGateable trait
-                // angles[0] = theta (XX), angles[1] = phi (YY), angles[2] = lambda (ZZ)
-                self.inner.rxxryyrzz(angles[0], angles[1], angles[2], pair);
-                Ok(None)
-            }
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "Unsupported two-qubit gate",
-            )),
-        }
-    }
-
-    /// Dispatches a gate to the appropriate handler based on the number of qubits specified
-    ///
-    /// `symbol`: The gate symbol
-    /// `location`: A tuple specifying the qubits to apply the gate to
-    /// `params`: Optional parameters for parameterized gates
-    #[pyo3(signature = (symbol, location, params=None))]
-    fn run_gate(
-        &mut self,
-        symbol: &str,
-        location: &Bound<'_, PyTuple>,
-        params: Option<&Bound<'_, PyDict>>,
-    ) -> PyResult<Option<u8>> {
-        match location.len() {
-            1 => {
-                let qubit: usize = location.get_item(0)?.extract()?;
-                self.run_1q_gate(symbol, qubit, params)
-            }
-            2 => self.run_2q_gate(symbol, location, params),
-            _ => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "Gate location must be specified for either 1 or 2 qubits",
-            )),
-        }
-    }
-
-    /// Provides direct access to the current state vector as a Python property
-    #[getter]
-    fn vector(&self) -> Vec<(f64, f64)> {
-        self.inner
-            .state()
-            .iter()
-            .map(|complex| (complex.re, complex.im))
-            .collect()
-    }
-
-    /// Get the number of qubits in the system
-    #[getter]
-    fn num_qubits(&self) -> usize {
-        self.inner.num_qubits()
-    }
-
-    /// Returns the probability of each computational basis state as a real-valued array.
-    ///
-    /// Each entry is |amplitude|^2 for the corresponding basis state.
-    #[getter]
-    fn probabilities(&self) -> Vec<f64> {
-        self.inner
-            .state()
-            .iter()
-            .map(num_complex::Complex::norm_sqr)
-            .collect()
-    }
-
-    /// Get the probability of measuring a specific basis state
-    fn probability(&self, basis_state: usize) -> f64 {
-        self.inner.probability(basis_state)
-    }
-
-    /// Prepare the state as a specific computational basis state
-    fn prepare_computational_basis(&mut self, basis_state: usize) {
-        self.inner.prepare_computational_basis(basis_state);
-    }
-
-    /// Prepare all qubits in the |+⟩ state
-    fn prepare_plus_state(&mut self) {
-        self.inner.prepare_plus_state();
-    }
-}
diff --git a/python/pecos-rslib/src/simulators_module.rs b/python/pecos-rslib/src/simulators_module.rs
index 4f5f8b87f..6ac36d5b1 100644
--- a/python/pecos-rslib/src/simulators_module.rs
+++ b/python/pecos-rslib/src/simulators_module.rs
@@ -18,11 +18,8 @@
 //! - `SparseStab` - Rust sparse stabilizer simulator
 //! - `Stabilizer` - Generic stabilizer simulator (recommended)
 //! - `StateVec` - State vector simulator
-//! - `Qulacs` - Qulacs-based state vector simulator
 //! - `CoinToss` - Random measurement simulator for testing
 //! - `PauliProp` - Pauli propagation/fault tracking simulator
-//! - `QuestStateVec` - `QuEST` state vector simulator
-//! - `QuestDensityMatrix` - `QuEST` density matrix simulator
 
 use pyo3::prelude::*;
 
@@ -51,11 +48,6 @@ pub fn register_simulators_module(parent: &Bound<'_, PyModule>) -> PyResult<()>
 
     // State vector simulators
     simulators.add("StateVec", parent.getattr("StateVec")?)?;
-    simulators.add("Qulacs", parent.getattr("Qulacs")?)?;
-
-    // QuEST simulators
-    simulators.add("QuestStateVec", parent.getattr("QuestStateVec")?)?;
-    simulators.add("QuestDensityMatrix", parent.getattr("QuestDensityMatrix")?)?;
 
     // Other simulators
     simulators.add("CoinToss", parent.getattr("CoinToss")?)?;
diff --git a/python/pecos-rslib/tests/test_pickle.py b/python/pecos-rslib/tests/test_pickle.py
index 5283810be..3936f0e7f 100644
--- a/python/pecos-rslib/tests/test_pickle.py
+++ b/python/pecos-rslib/tests/test_pickle.py
@@ -18,7 +18,7 @@
 import numpy as np
 import pytest
 
-from pecos_rslib import CoinToss, PauliProp, Qulacs, SparseStab, StateVec
+from pecos_rslib import CoinToss, PauliProp, SparseStab, StateVec
 
 
 def _state_vec_to_numpy(sim):
@@ -125,38 +125,6 @@ def test_probability_out_of_range(self) -> None:
             sim.probability(4)
 
 
-class TestQulacsProbabilities:
-    """Test the probabilities property on Qulacs."""
-
-    def test_default_state(self) -> None:
-        """All probability should be on |00...0>."""
-        sim = Qulacs(3, seed=42)
-        probs = sim.probabilities
-        assert len(probs) == 8
-        assert probs[0] == pytest.approx(1.0)
-        assert sum(probs) == pytest.approx(1.0)
-
-    def test_bell_state(self) -> None:
-        """Bell state should have 50/50 on |00> and |11>."""
-        sim = Qulacs(2, seed=42)
-        sim.run_1q_gate("H", 0)
-        sim.run_2q_gate("CX", (0, 1), None)
-        probs = sim.probabilities
-        assert probs[0] == pytest.approx(0.5)
-        assert probs[3] == pytest.approx(0.5)
-        assert probs[1] == pytest.approx(0.0, abs=1e-15)
-        assert probs[2] == pytest.approx(0.0, abs=1e-15)
-
-    def test_matches_probability_method(self) -> None:
-        """probabilities[i] should match probability(i)."""
-        sim = Qulacs(2, seed=42)
-        sim.run_1q_gate("H", 0)
-        sim.run_2q_gate("CX", (0, 1), None)
-        probs = sim.probabilities
-        for i in range(4):
-            assert sim.probability(i) == pytest.approx(probs[i])
-
-
 class TestSparseSimPickle:
     """Test pickle support for SparseStab."""
 
diff --git a/python/quantum-pecos/src/pecos/simulators/__init__.py b/python/quantum-pecos/src/pecos/simulators/__init__.py
index 978f5b8f0..15cd77263 100644
--- a/python/quantum-pecos/src/pecos/simulators/__init__.py
+++ b/python/quantum-pecos/src/pecos/simulators/__init__.py
@@ -32,13 +32,6 @@
     PauliFaultProp,  # Backward compatibility
     PauliProp,
 )
-from pecos.simulators.quest_densitymatrix import QuestDensityMatrix
-
-# QuEST simulators
-from pecos.simulators.quest_statevec import QuestStateVec
-
-# Use Qulacs (Rust version) as the primary Qulacs implementation
-from pecos.simulators.qulacs import Qulacs
 
 # Pauli fault propagation sim
 from pecos.simulators.sparsestab import (
@@ -89,9 +82,6 @@
     "DefaultSimulator",
     "PauliFaultProp",
     "PauliProp",
-    "QuestDensityMatrix",
-    "QuestStateVec",
-    "Qulacs",
     "SparseStab",
     "SparseStabPy",
     "Stabilizer",
diff --git a/python/quantum-pecos/src/pecos/simulators/quantum_simulator.py b/python/quantum-pecos/src/pecos/simulators/quantum_simulator.py
index 27e5c9691..5e5b57e46 100644
--- a/python/quantum-pecos/src/pecos/simulators/quantum_simulator.py
+++ b/python/quantum-pecos/src/pecos/simulators/quantum_simulator.py
@@ -21,7 +21,7 @@
 from typing import Any
 
 from pecos.reps.pyphir.op_types import QOp
-from pecos.simulators import Qulacs, StateVec
+from pecos.simulators import StateVec
 from pecos.simulators.sparsestab.state import SparseStabPy
 
 JSONType = dict[str, Any] | list[Any] | str | int | float | bool | None
@@ -32,23 +32,11 @@
 except ImportError:
     MPS = None
 
-try:
-    from pecos.simulators import Qulacs
-except ImportError:
-    Qulacs = None
-
-
 try:
     from pecos.simulators import CuStateVec
 except ImportError:
     CuStateVec = None
 
-try:
-    from pecos.simulators import QuestDensityMatrix, QuestStateVec
-except ImportError:
-    QuestStateVec = None
-    QuestDensityMatrix = None
-
 try:
     from pecos.simulators import CudaStabilizer, CudaStateVec
 except ImportError:
@@ -61,7 +49,7 @@ class QuantumSimulator:
 
     This class provides a unified interface for various quantum simulation backends
     including stabilizer simulators, state vector simulators, and specialized
-    simulators like MPS, Qulacs, and cuQuantum.
+    simulators like MPS and cuQuantum.
     """
 
     def __init__(self, backend: str | object | None = None, **params: JSONType) -> None:
@@ -97,23 +85,12 @@ def init(self, num_qubits: int) -> None:
         if isinstance(self.backend, str):
             if self.backend == "stabilizer":
                 self.state = SparseStabPy
-            elif self.backend in "state-vector":
-                if Qulacs is not None:
-                    self.state = Qulacs
-                else:
-                    self.state = StateVec
-            elif self.backend == "StateVec":
+            elif self.backend in "state-vector" or self.backend == "StateVec":
                 self.state = StateVec
             elif self.backend in {"MPS", "mps"}:
                 self.state = MPS
-            elif self.backend == "Qulacs":
-                self.state = Qulacs
             elif self.backend == "CuStateVec":
                 self.state = CuStateVec
-            elif self.backend == "QuestStateVec":
-                self.state = QuestStateVec
-            elif self.backend == "QuestDensityMatrix":
-                self.state = QuestDensityMatrix
             elif self.backend == "CudaStateVec":
                 self.state = CudaStateVec
             elif self.backend == "CudaStabilizer":
diff --git a/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/__init__.py b/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/__init__.py
deleted file mode 100644
index 870ee60fe..000000000
--- a/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""QuEST density matrix simulator for PECOS.
-
-This module provides a quantum density matrix simulator powered by the QuEST quantum simulation library,
-enabling efficient simulation of mixed quantum states and noisy quantum circuits.
-"""
-
-from pecos.simulators.quest_densitymatrix.state import QuestDensityMatrix
-
-__all__ = ["QuestDensityMatrix"]
diff --git a/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/bindings.py b/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/bindings.py
deleted file mode 100644
index 635b85fe0..000000000
--- a/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/bindings.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Gate bindings for the QuEST density matrix simulator.
-
-This module provides the gate bindings that map gate symbols to their corresponding implementations
-in the QuEST backend for the density matrix simulator.
-"""
-
-# Gate bindings require consistent interfaces even if not all parameters are used.
-# This is a design pattern where all gates must have the same signature for polymorphic dispatch.
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from pecos.protocols import QuantumBackend
-    from pecos.simulators.quest_densitymatrix.state import QuestDensityMatrix
-
-
-def _init_one(sim: QuestDensityMatrix, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |1⟩ state."""
-    # Measure the qubit
-    result_dict = sim.run_gate("MZ", {q})
-    result = result_dict.get(q, 0) if result_dict else 0
-    # If it's 0, flip it to 1
-    if result == 0:
-        sim.run_gate("X", {q})
-
-
-def _init_plus(sim: QuestDensityMatrix, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |+⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("H", {q})  # Then apply H to get |+⟩
-
-
-def _init_minus(sim: QuestDensityMatrix, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |-⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("X", {q})  # Apply X to get |1⟩
-    sim.run_gate("H", {q})  # Then apply H to get |-⟩
-
-
-def _init_plusi(sim: QuestDensityMatrix, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |+i⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("H", {q})  # Apply H to get |+⟩
-    sim.run_gate("Sdg", {q})  # Apply S† to get |+i⟩
-
-
-def _init_minusi(sim: QuestDensityMatrix, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |-i⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("H", {q})  # Apply H to get |+⟩
-    sim.run_gate("S", {q})  # Apply S to get |-i⟩
-
-
-def _rxx_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RXX(theta) a, b = SY a; CZ a, b; RX(-theta) b; CZ a, b; SYdg a."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    theta = p["angles"][0] if "angles" in p else p.get("angle", 0)
-
-    # SY a
-    backend.sy_gate(q1)
-    # CZ a, b
-    backend.run_2q_gate("CZ", (q1, q2), None)
-    # RX(-theta) b
-    backend.run_1q_gate("RX", q2, {"angle": -theta})
-    # CZ a, b
-    backend.run_2q_gate("CZ", (q1, q2), None)
-    # SYdg a
-    backend.sydg_gate(q1)
-
-
-def _ryy_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RYY(theta) a, b = SX a; SX b; RZZ(theta) a, b; SXdg a; SXdg b."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    theta = p["angles"][0] if "angles" in p else p.get("angle", 0)
-
-    # SX a; SX b
-    backend.sx_gate(q1)
-    backend.sx_gate(q2)
-    # RZZ(theta) a, b
-    _rzz_decomposition(backend, (q1, q2), {"angle": theta})
-    # SXdg a; SXdg b
-    backend.sxdg_gate(q1)
-    backend.sxdg_gate(q2)
-
-
-def _rzz_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RZZ(theta) a, b = H a; H b; RXX(theta) a, b; H a; H b."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    theta = p["angles"][0] if "angles" in p else p.get("angle", 0)
-
-    # H a; H b
-    backend.run_1q_gate("H", q1, None)
-    backend.run_1q_gate("H", q2, None)
-    # RXX(theta) a, b
-    _rxx_decomposition(backend, (q1, q2), {"angle": theta})
-    # H a; H b
-    backend.run_1q_gate("H", q1, None)
-    backend.run_1q_gate("H", q2, None)
-
-
-def _cy_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-) -> None:
-    """CY = SZdg(q2); CX(q1,q2); SZ(q2) - Note: reversed from trait due to sign convention."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-
-    # SZdg q2
-    backend.szdg_gate(q2)
-    # CX q1, q2
-    backend.run_2q_gate("CX", (q1, q2), None)
-    # SZ q2
-    backend.sz_gate(q2)
-
-
-def get_bindings(state: QuestDensityMatrix) -> dict:
-    """Get gate bindings for the QuEST density matrix simulator.
-
-    Args:
-        state: The QuestDensityMatrix instance to bind gates to.
-
-    Returns:
-        Dictionary mapping gate symbols to their implementations.
-    """
-    # Get reference to backend for efficiency
-    backend = state.backend
-
-    return {
-        # Single-qubit gates
-        "I": lambda _s, _q, **_p: None,
-        "X": lambda _s, q, **_p: backend.run_1q_gate("X", q, None),
-        "Y": lambda _s, q, **_p: backend.run_1q_gate("Y", q, None),
-        "Z": lambda _s, q, **_p: backend.run_1q_gate("Z", q, None),
-        "H": lambda _s, q, **_p: backend.run_1q_gate("H", q, None),
-        "H1": lambda _s, q, **_p: backend.run_1q_gate("H", q, None),
-        "H2": lambda _s, q, **_p: backend.h2_gate(q),
-        "H3": lambda _s, q, **_p: backend.h3_gate(q),
-        "H4": lambda _s, q, **_p: backend.h4_gate(q),
-        "H5": lambda _s, q, **_p: backend.h5_gate(q),
-        "H6": lambda _s, q, **_p: backend.h6_gate(q),
-        "H+z+x": lambda _s, q, **_p: backend.run_1q_gate("H", q, None),
-        "H-z-x": lambda _s, q, **_p: backend.h2_gate(q),
-        "H+y-z": lambda _s, q, **_p: backend.h3_gate(q),
-        "H-y-z": lambda _s, q, **_p: backend.h4_gate(q),
-        "H-x+y": lambda _s, q, **_p: backend.h5_gate(q),
-        "H-x-y": lambda _s, q, **_p: backend.h6_gate(q),
-        # Square root gates (available from traits)
-        "SX": lambda _s, q, **_p: backend.sx_gate(q),
-        "SXdg": lambda _s, q, **_p: backend.sxdg_gate(q),
-        "SY": lambda _s, q, **_p: backend.sy_gate(q),
-        "SYdg": lambda _s, q, **_p: backend.sydg_gate(q),
-        "SZ": lambda _s, q, **_p: backend.sz_gate(q),
-        "SZdg": lambda _s, q, **_p: backend.szdg_gate(q),
-        # Face gates (F gates) - decompositions from traits
-        "F": lambda _s, q, **_p: (backend.sx_gate(q), backend.sz_gate(q))[-1] or None,
-        "Fdg": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sxdg_gate(q))[-1] or None,
-        "F2": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.sy_gate(q))[-1] or None,
-        "F2dg": lambda _s, q, **_p: (backend.sydg_gate(q), backend.sx_gate(q))[-1] or None,
-        "F3": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.sz_gate(q))[-1] or None,
-        "F3dg": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sx_gate(q))[-1] or None,
-        "F4": lambda _s, q, **_p: (backend.sz_gate(q), backend.sx_gate(q))[-1] or None,
-        "F4dg": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.szdg_gate(q))[-1] or None,
-        # Two-qubit gates
-        "II": lambda _s, _qs, **_p: None,
-        "CX": lambda _s, qs, **_p: backend.run_2q_gate(
-            "CX",
-            tuple(qs) if isinstance(qs, list) else qs,
-            None,
-        ),
-        "CNOT": lambda _s, qs, **_p: backend.run_2q_gate(
-            "CX",
-            tuple(qs) if isinstance(qs, list) else qs,
-            None,
-        ),
-        "CY": lambda _s, qs, **_p: _cy_decomposition(backend, qs),
-        "CZ": lambda _s, qs, **_p: backend.run_2q_gate(
-            "CZ",
-            tuple(qs) if isinstance(qs, list) else qs,
-            None,
-        ),
-        # Measurements
-        "MZ": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        "MX": lambda _s, q, **_p: backend.mx_gate(q),
-        "MY": lambda _s, q, **_p: backend.my_gate(q),
-        "Measure": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        "measure Z": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        "Measure +Z": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        # Projections/Initializations (map to reset for now)
-        "PZ": lambda _s, _q, **_p: backend.reset() or None,
-        "Init": lambda _s, _q, **_p: backend.reset() or None,
-        "Init +Z": lambda _s, _q, **_p: backend.reset() or None,
-        "init |0>": lambda _s, _q, **_p: backend.reset() or None,
-        # Rotation gates
-        "RX": lambda _s, q, **p: backend.run_1q_gate(
-            "RX",
-            q,
-            ({"angle": p["angles"][0]} if "angles" in p else {"angle": p.get("angle", 0)}),
-        ),
-        "RY": lambda _s, q, **p: backend.run_1q_gate(
-            "RY",
-            q,
-            ({"angle": p["angles"][0]} if "angles" in p else {"angle": p.get("angle", 0)}),
-        ),
-        "RZ": lambda _s, q, **p: backend.run_1q_gate(
-            "RZ",
-            q,
-            ({"angle": p["angles"][0]} if "angles" in p else {"angle": p.get("angle", 0)}),
-        ),
-        "R1XY": lambda _s, q, **p: backend.r1xy_gate(
-            p["angles"][0] if "angles" in p else p.get("theta", 0),
-            (p["angles"][1] if "angles" in p and len(p["angles"]) > 1 else p.get("phi", 0)),
-            q,
-        ),
-        "RXX": lambda _s, qs, **p: _rxx_decomposition(backend, qs, p),
-        "RYY": lambda _s, qs, **p: _ryy_decomposition(backend, qs, p),
-        "RZZ": lambda _s, qs, **p: _rzz_decomposition(backend, qs, p),
-        "RXXRYYRZZ": lambda _s, qs, **p: backend.rxxryyrzz_gate(
-            p["angles"][0] if "angles" in p else 0,
-            p["angles"][1] if "angles" in p and len(p["angles"]) > 1 else 0,
-            p["angles"][2] if "angles" in p and len(p["angles"]) > 2 else 0,
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "R2XXYYZZ": lambda _s, qs, **p: backend.rxxryyrzz_gate(  # backward compat alias
-            p["angles"][0] if "angles" in p else 0,
-            p["angles"][1] if "angles" in p and len(p["angles"]) > 1 else 0,
-            p["angles"][2] if "angles" in p and len(p["angles"]) > 2 else 0,
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "RZZRYYRXX": lambda _s, qs, **p: backend.rxxryyrzz_gate(
-            p["angles"][0] if "angles" in p else 0,
-            p["angles"][1] if "angles" in p and len(p["angles"]) > 1 else 0,
-            p["angles"][2] if "angles" in p and len(p["angles"]) > 2 else 0,
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        # T gates - use RZ implementation instead of trait methods
-        "T": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": 0.7853981633974483},
-        ),  # π/4
-        "TDG": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -0.7853981633974483},
-        ),  # -π/4
-        "Tdg": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -0.7853981633974483},
-        ),  # StateVec compatibility
-        "TDAGGER": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -0.7853981633974483},
-        ),
-        # Two-qubit Clifford gates from traits
-        "SXX": lambda _s, qs, **_p: backend.sxx_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "SXXdg": lambda _s, qs, **_p: (
-            backend.x(qs[0] if isinstance(qs, list | tuple) else qs),
-            backend.x(qs[1] if isinstance(qs, list | tuple) else qs),
-            backend.sxx_gate(
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                qs[1] if isinstance(qs, list | tuple) else qs,
-            ),
-        )[-1]
-        or None,
-        "SYY": lambda _s, qs, **_p: backend.syy_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "SYYdg": lambda _s, qs, **_p: (
-            backend.y(qs[0] if isinstance(qs, list | tuple) else qs),
-            backend.y(qs[1] if isinstance(qs, list | tuple) else qs),
-            backend.syy_gate(
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                qs[1] if isinstance(qs, list | tuple) else qs,
-            ),
-        )[-1]
-        or None,
-        "SZZ": lambda _s, qs, **_p: backend.szz_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "SZZdg": lambda _s, qs, **_p: (
-            backend.z(qs[0] if isinstance(qs, list | tuple) else qs),
-            backend.z(qs[1] if isinstance(qs, list | tuple) else qs),
-            backend.szz_gate(
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                qs[1] if isinstance(qs, list | tuple) else qs,
-            ),
-        )[-1]
-        or None,
-        "SWAP": lambda _s, qs, **_p: backend.swap_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "G": lambda _s, qs, **_p: (
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-            backend.run_1q_gate(
-                "H",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "H",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-        )[-1]
-        or None,
-        "G2": lambda _s, qs, **_p: (
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-            backend.run_1q_gate(
-                "H",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "H",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-        )[-1]
-        or None,
-        # S and S-dagger gates
-        "S": lambda _s, q, **_p: backend.s(q),
-        "Sdg": lambda _s, q, **_p: backend.sdg(q),
-        "SDAG": lambda _s, q, **_p: backend.sdg(q),
-        "SDG": lambda _s, q, **_p: backend.sdg(q),
-        # Initialization gates for error states
-        "Init -Z": lambda s, q, **p: _init_one(s, q, p),
-        "Init +X": lambda s, q, **p: _init_plus(s, q, p),
-        "Init -X": lambda s, q, **p: _init_minus(s, q, p),
-        "Init +Y": lambda s, q, **p: _init_plusi(s, q, p),
-        "Init -Y": lambda s, q, **p: _init_minusi(s, q, p),
-    }
diff --git a/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/state.py b/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/state.py
deleted file mode 100644
index 8a5b3a882..000000000
--- a/python/quantum-pecos/src/pecos/simulators/quest_densitymatrix/state.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""QuEST density matrix simulator implementation.
-
-This module provides the QuestDensityMatrix class, a quantum density matrix simulator that uses the QuEST
-(Quantum Exact Simulation Toolkit) library as its backend for simulating mixed quantum states and noisy circuits.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from pecos_rslib.simulators import QuestDensityMatrix as RustQuestDensityMatrix
-
-from pecos.simulators.quest_densitymatrix.bindings import get_bindings
-
-if TYPE_CHECKING:
-    from pecos.circuits import QuantumCircuit
-    from pecos.circuits.quantum_circuit import ParamGateCollection
-    from pecos.typing import SimulatorGateParams
-
-
-class QuestDensityMatrix:
-    """QuEST density matrix simulator.
-
-    A quantum density matrix simulator that uses the QuEST library backend for efficient
-    simulation of mixed quantum states and noisy quantum circuits.
-    """
-
-    def __init__(self, num_qubits: int, seed: int | None = None) -> None:
-        """Initializes the QuEST density matrix simulator.
-
-        Args:
-            num_qubits (int): The number of qubits in the quantum system.
-            seed (int | None): Optional seed for the random number generator.
-        """
-        self.backend = RustQuestDensityMatrix(num_qubits, seed)
-        self.num_qubits = num_qubits
-        self.bindings = get_bindings(self)
-
-    @property
-    def matrix(self) -> list[list[complex]]:
-        """Get the density matrix as a 2D list of complex numbers.
-
-        Returns:
-            2D list of complex amplitudes representing the density matrix.
-        """
-        # QuEST stores density matrix internally - we need to extract it
-        # For now, we'll construct it from probabilities (simplified)
-        # A full implementation would extract the full density matrix
-        size = 2**self.num_qubits
-        matrix = [[complex(0, 0) for _ in range(size)] for _ in range(size)]
-
-        # This is a simplified version - full implementation would extract
-        # the actual density matrix elements from QuEST
-        for i in range(size):
-            prob = self.backend.probability(i)
-            if prob > 0:
-                # Diagonal elements only for now
-                matrix[i][i] = complex(prob, 0)
-
-        return matrix
-
-    def reset(self) -> QuestDensityMatrix:
-        """Resets the quantum state to the all-zero density matrix."""
-        self.backend.reset()
-        return self
-
-    def run_gate(
-        self,
-        symbol: str,
-        locations: set[int] | set[tuple[int, ...]],
-        **params: SimulatorGateParams,
-    ) -> dict[int, int]:
-        """Applies a gate to the quantum density matrix.
-
-        Args:
-            symbol (str): The gate symbol (e.g., "X", "H", "CX").
-            locations (set): The qubit(s) to which the gate is applied.
-            params (dict, optional): Parameters for the gate (e.g., rotation angles).
-
-        Returns:
-            Dictionary mapping locations to measurement results.
-        """
-        output = {}
-
-        if params.get("simulate_gate", True) and locations:
-            for location in locations:
-                if params.get("angles") and len(params["angles"]) == 1:
-                    params.update({"angle": params["angles"][0]})
-                elif "angle" in params and "angles" not in params:
-                    params["angles"] = (params["angle"],)
-
-                # Convert list to tuple if needed (for Rust bindings compatibility)
-                loc_to_use = location
-                if isinstance(location, list):
-                    loc_to_use = tuple(location)
-
-                if symbol in self.bindings:
-                    results = self.bindings[symbol](self, loc_to_use, **params)
-                else:
-                    msg = f"Gate {symbol} is not supported in the QuEST density matrix simulator."
-                    raise Exception(msg)
-
-                if results is not None:
-                    output[location] = results
-
-        return output
-
-    def run_circuit(
-        self,
-        circuit: QuantumCircuit | ParamGateCollection,
-        removed_locations: set[int] | None = None,
-    ) -> dict[int, int]:
-        """Runs a quantum circuit on the simulator.
-
-        Args:
-            circuit: The quantum circuit to run.
-            removed_locations: Optional set of locations to exclude.
-
-        Returns:
-            Dictionary mapping measurement locations to results.
-        """
-        if removed_locations is None:
-            removed_locations = set()
-
-        output = {}
-        for symbol, locations, params in circuit.items():
-            results = self.run_gate(
-                symbol,
-                locations - removed_locations,
-                **params,
-            )
-            if results:
-                output.update(results)
-
-        return output
-
-    def __repr__(self) -> str:
-        """String representation of the simulator."""
-        return f"QuestDensityMatrix(num_qubits={self.num_qubits})"
-
-    def get_probability(self, index: int) -> float:
-        """Get the probability of a computational basis state.
-
-        Args:
-            index: The basis state index.
-
-        Returns:
-            The probability of the given basis state.
-        """
-        return self.backend.probability(index)
-
-    def prepare_computational_basis(self, index: int) -> None:
-        """Prepare a computational basis state.
-
-        Args:
-            index: The basis state index to prepare.
-        """
-        self.backend.prepare_computational_basis(index)
diff --git a/python/quantum-pecos/src/pecos/simulators/quest_statevec/__init__.py b/python/quantum-pecos/src/pecos/simulators/quest_statevec/__init__.py
deleted file mode 100644
index 9644aa6b0..000000000
--- a/python/quantum-pecos/src/pecos/simulators/quest_statevec/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""QuEST state vector simulator for PECOS.
-
-This module provides a quantum state vector simulator powered by the QuEST quantum simulation library,
-enabling efficient simulation of arbitrary quantum circuits with full quantum state representation.
-"""
-
-from pecos.simulators.quest_statevec.state import QuestStateVec
-
-__all__ = ["QuestStateVec"]
diff --git a/python/quantum-pecos/src/pecos/simulators/quest_statevec/bindings.py b/python/quantum-pecos/src/pecos/simulators/quest_statevec/bindings.py
deleted file mode 100644
index d719a892c..000000000
--- a/python/quantum-pecos/src/pecos/simulators/quest_statevec/bindings.py
+++ /dev/null
@@ -1,416 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Gate bindings for the QuEST state vector simulator.
-
-This module provides the gate bindings that map gate symbols to their corresponding implementations
-in the QuEST backend for the state vector simulator.
-"""
-
-# Gate bindings require consistent interfaces even if not all parameters are used.
-# This is a design pattern where all gates must have the same signature for polymorphic dispatch.
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    from pecos.protocols import QuantumBackend
-    from pecos.simulators.quest_statevec.state import QuestStateVec
-
-
-def _init_one(sim: QuestStateVec, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |1⟩ state."""
-    # Measure the qubit
-    result_dict = sim.run_gate("MZ", {q})
-    result = result_dict.get(q, 0) if result_dict else 0
-    # If it's 0, flip it to 1
-    if result == 0:
-        sim.run_gate("X", {q})
-
-
-def _init_plus(sim: QuestStateVec, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |+⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("H", {q})  # Then apply H to get |+⟩
-
-
-def _init_minus(sim: QuestStateVec, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |-⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("X", {q})  # Apply X to get |1⟩
-    sim.run_gate("H", {q})  # Then apply H to get |-⟩
-
-
-def _init_plusi(sim: QuestStateVec, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |+i⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("H", {q})  # Apply H to get |+⟩
-    sim.run_gate("Sdg", {q})  # Apply S† to get |+i⟩
-
-
-def _init_minusi(sim: QuestStateVec, q: int, _p: dict[str, Any]) -> None:
-    """Initialize qubit to |-i⟩ state."""
-    sim.reset()  # First reset to |0⟩
-    sim.run_gate("H", {q})  # Apply H to get |+⟩
-    sim.run_gate("S", {q})  # Apply S to get |-i⟩
-
-
-def _rxx_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RXX(theta) a, b = SY a; CZ a, b; RX(-theta) b; CZ a, b; SYdg a."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    theta = p["angles"][0] if "angles" in p else p.get("angle", 0)
-
-    # SY a
-    backend.sy_gate(q1)
-    # CZ a, b
-    backend.run_2q_gate("CZ", (q1, q2), None)
-    # RX(-theta) b
-    backend.run_1q_gate("RX", q2, {"angle": -theta})
-    # CZ a, b
-    backend.run_2q_gate("CZ", (q1, q2), None)
-    # SYdg a
-    backend.sydg_gate(q1)
-
-
-def _ryy_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RYY(theta) a, b = SX a; SX b; RZZ(theta) a, b; SXdg a; SXdg b."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    theta = p["angles"][0] if "angles" in p else p.get("angle", 0)
-
-    # SX a; SX b
-    backend.sx_gate(q1)
-    backend.sx_gate(q2)
-    # RZZ(theta) a, b
-    _rzz_decomposition(backend, (q1, q2), {"angle": theta})
-    # SXdg a; SXdg b
-    backend.sxdg_gate(q1)
-    backend.sxdg_gate(q2)
-
-
-def _rzz_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RZZ(theta) a, b = H a; H b; RXX(theta) a, b; H a; H b."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    theta = p["angles"][0] if "angles" in p else p.get("angle", 0)
-
-    # H a; H b
-    backend.run_1q_gate("H", q1, None)
-    backend.run_1q_gate("H", q2, None)
-    # RXX(theta) a, b
-    _rxx_decomposition(backend, (q1, q2), {"angle": theta})
-    # H a; H b
-    backend.run_1q_gate("H", q1, None)
-    backend.run_1q_gate("H", q2, None)
-
-
-def _cy_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-) -> None:
-    """CY = SZdg(q2); CX(q1,q2); SZ(q2) - Note: reversed from trait due to sign convention."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-
-    # SZdg q2
-    backend.szdg_gate(q2)
-    # CX q1, q2
-    backend.run_2q_gate("CX", (q1, q2), None)
-    # SZ q2
-    backend.sz_gate(q2)
-
-
-def _rxxryyrzz_decomposition(
-    backend: QuantumBackend,
-    qs: int | list[int] | tuple[int, ...],
-    p: dict[str, Any],
-) -> None:
-    """RXXRYYRZZ decomposition using manual RXX, RYY, RZZ."""
-    q1, q2 = (qs[0], qs[1]) if isinstance(qs, list | tuple) else (qs, qs)
-    angles = p.get("angles", [0, 0, 0])
-    theta = angles[0] if len(angles) > 0 else 0
-    phi = angles[1] if len(angles) > 1 else 0
-    lambda_param = angles[2] if len(angles) > 2 else 0
-
-    # Apply RXX, RYY, RZZ in sequence using the manual decompositions
-    _rxx_decomposition(backend, (q1, q2), {"angles": [theta]})
-    _ryy_decomposition(backend, (q1, q2), {"angles": [phi]})
-    _rzz_decomposition(backend, (q1, q2), {"angles": [lambda_param]})
-
-
-def get_bindings(state: QuestStateVec) -> dict:
-    """Get gate bindings for the QuEST state vector simulator.
-
-    Args:
-        state: The QuestStateVec instance to bind gates to.
-
-    Returns:
-        Dictionary mapping gate symbols to their implementations.
-    """
-    # Get reference to backend for efficiency
-    backend = state.backend
-
-    return {
-        # Single-qubit gates
-        "I": lambda _s, _q, **_p: None,  # Identity gate
-        "X": lambda _s, q, **_p: backend.run_1q_gate("X", q, None),
-        "Y": lambda _s, q, **_p: backend.run_1q_gate("Y", q, None),
-        "Z": lambda _s, q, **_p: backend.run_1q_gate("Z", q, None),
-        "H": lambda _s, q, **_p: backend.run_1q_gate("H", q, None),
-        "H1": lambda _s, q, **_p: backend.run_1q_gate("H", q, None),
-        "H2": lambda _s, q, **_p: backend.h2_gate(q),
-        "H3": lambda _s, q, **_p: backend.h3_gate(q),
-        "H4": lambda _s, q, **_p: backend.h4_gate(q),
-        "H5": lambda _s, q, **_p: backend.h5_gate(q),
-        "H6": lambda _s, q, **_p: backend.h6_gate(q),
-        "H+z+x": lambda _s, q, **_p: backend.run_1q_gate("H", q, None),
-        "H-z-x": lambda _s, q, **_p: backend.h2_gate(q),
-        "H+y-z": lambda _s, q, **_p: backend.h3_gate(q),
-        "H-y-z": lambda _s, q, **_p: backend.h4_gate(q),
-        "H-x+y": lambda _s, q, **_p: backend.h5_gate(q),
-        "H-x-y": lambda _s, q, **_p: backend.h6_gate(q),
-        # Square root gates (available from traits)
-        "SX": lambda _s, q, **_p: backend.sx_gate(q),
-        "SXdg": lambda _s, q, **_p: backend.sxdg_gate(q),
-        "SY": lambda _s, q, **_p: backend.sy_gate(q),
-        "SYdg": lambda _s, q, **_p: backend.sydg_gate(q),
-        "SZ": lambda _s, q, **_p: backend.sz_gate(q),
-        "SZdg": lambda _s, q, **_p: backend.szdg_gate(q),
-        # Aliases for square root gates (for compatibility with StateVec)
-        "Q": lambda _s, q, **_p: backend.sx_gate(q),  # Q = SX
-        "Qd": lambda _s, q, **_p: backend.sxdg_gate(q),  # Qd = SXdg
-        "R": lambda _s, q, **_p: backend.sy_gate(q),  # R = SY
-        "Rd": lambda _s, q, **_p: backend.sydg_gate(q),  # Rd = SYdg
-        "S": lambda _s, q, **_p: backend.sz_gate(q),  # S = SZ
-        "Sd": lambda _s, q, **_p: backend.szdg_gate(q),  # Sd = SZdg
-        "Sdg": lambda _s, q, **_p: backend.szdg_gate(q),  # Sdg = SZdg (alternate name)
-        # Face gates (F gates) - decompositions from traits
-        "F": lambda _s, q, **_p: (backend.sx_gate(q), backend.sz_gate(q))[-1] or None,
-        "Fdg": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sxdg_gate(q))[-1] or None,
-        "F1": lambda _s, q, **_p: (backend.sx_gate(q), backend.sz_gate(q))[-1] or None,  # F1 = F
-        "F1d": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sxdg_gate(q))[-1] or None,  # F1d = Fdg
-        "F1dg": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sxdg_gate(q))[-1] or None,  # F1dg = Fdg
-        "F2": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.sy_gate(q))[-1] or None,
-        "F2d": lambda _s, q, **_p: (backend.sydg_gate(q), backend.sx_gate(q))[-1] or None,  # F2d = F2dg
-        "F2dg": lambda _s, q, **_p: (backend.sydg_gate(q), backend.sx_gate(q))[-1] or None,
-        "F3": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.sz_gate(q))[-1] or None,
-        "F3d": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sx_gate(q))[-1] or None,  # F3d = F3dg
-        "F3dg": lambda _s, q, **_p: (backend.szdg_gate(q), backend.sx_gate(q))[-1] or None,
-        "F4": lambda _s, q, **_p: (backend.sz_gate(q), backend.sx_gate(q))[-1] or None,
-        "F4d": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.szdg_gate(q))[-1] or None,  # F4d = F4dg
-        "F4dg": lambda _s, q, **_p: (backend.sxdg_gate(q), backend.szdg_gate(q))[-1] or None,
-        # Two-qubit gates
-        "II": lambda _s, _qs, **_p: None,
-        "CX": lambda _s, qs, **_p: backend.run_2q_gate(
-            "CX",
-            tuple(qs) if isinstance(qs, list) else qs,
-            None,
-        ),
-        "CNOT": lambda _s, qs, **_p: backend.run_2q_gate(
-            "CX",
-            tuple(qs) if isinstance(qs, list) else qs,
-            None,
-        ),
-        "CY": lambda _s, qs, **_p: _cy_decomposition(backend, qs),
-        "CZ": lambda _s, qs, **_p: backend.run_2q_gate(
-            "CZ",
-            tuple(qs) if isinstance(qs, list) else qs,
-            None,
-        ),
-        # Measurements
-        "MZ": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        "MX": lambda _s, q, **_p: backend.mx_gate(q),
-        "MY": lambda _s, q, **_p: backend.my_gate(q),
-        "Measure": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        "measure Z": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        "Measure +Z": lambda _s, q, **_p: backend.run_1q_gate("MZ", q, None),
-        # T gates - use RZ implementation
-        "SDG": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -1.5707963267948966},
-        ),  # -π/2
-        "SDAGGER": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -1.5707963267948966},
-        ),
-        "T": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": 0.7853981633974483},
-        ),  # π/4
-        "TDG": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -0.7853981633974483},
-        ),  # -π/4
-        "Tdg": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -0.7853981633974483},
-        ),  # StateVec compatibility
-        "TDAGGER": lambda _s, q, **_p: backend.run_1q_gate(
-            "RZ",
-            q,
-            {"angle": -0.7853981633974483},
-        ),
-        # Projections/Initializations
-        "PZ": lambda _s, _q, **_p: backend.reset() or None,
-        "Init": lambda _s, _q, **_p: backend.reset() or None,
-        "Init +Z": lambda _s, _q, **_p: backend.reset() or None,
-        "Init -Z": lambda s, q, **p: _init_one(s, q, p),
-        "Init +X": lambda s, q, **p: _init_plus(s, q, p),
-        "Init -X": lambda s, q, **p: _init_minus(s, q, p),
-        "Init +Y": lambda s, q, **p: _init_plusi(s, q, p),
-        "Init -Y": lambda s, q, **p: _init_minusi(s, q, p),
-        "init |0>": lambda _s, _q, **_p: backend.reset() or None,
-        "init |1>": lambda s, q, **p: _init_one(s, q, p),
-        "init |+>": lambda s, q, **p: _init_plus(s, q, p),
-        "init |->": lambda s, q, **p: _init_minus(s, q, p),
-        "init |+i>": lambda s, q, **p: _init_plusi(s, q, p),
-        "init |-i>": lambda s, q, **p: _init_minusi(s, q, p),
-        # Rotation gates
-        "RX": lambda _s, q, **p: backend.run_1q_gate(
-            "RX",
-            q,
-            ({"angle": p["angles"][0]} if "angles" in p else {"angle": p.get("angle", 0)}),
-        ),
-        "RY": lambda _s, q, **p: backend.run_1q_gate(
-            "RY",
-            q,
-            ({"angle": p["angles"][0]} if "angles" in p else {"angle": p.get("angle", 0)}),
-        ),
-        "RZ": lambda _s, q, **p: backend.run_1q_gate(
-            "RZ",
-            q,
-            ({"angle": p["angles"][0]} if "angles" in p else {"angle": p.get("angle", 0)}),
-        ),
-        "R1XY": lambda _s, q, **p: backend.r1xy_gate(
-            p["angles"][0] if "angles" in p else p.get("theta", 0),
-            (p["angles"][1] if "angles" in p and len(p["angles"]) > 1 else p.get("phi", 0)),
-            q,
-        ),
-        "RXX": lambda _s, qs, **p: _rxx_decomposition(backend, qs, p),
-        "RYY": lambda _s, qs, **p: _ryy_decomposition(backend, qs, p),
-        "RZZ": lambda _s, qs, **p: _rzz_decomposition(backend, qs, p),
-        "RXXRYYRZZ": lambda _s, qs, **p: _rxxryyrzz_decomposition(backend, qs, p),
-        "R2XXYYZZ": lambda _s, qs, **p: _rxxryyrzz_decomposition(backend, qs, p),  # backward compat alias
-        "RZZRYYRXX": lambda _s, qs, **p: _rxxryyrzz_decomposition(backend, qs, p),
-        # Two-qubit Clifford gates from traits
-        "SXX": lambda _s, qs, **_p: backend.sxx_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "SXXdg": lambda _s, qs, **_p: (
-            backend.run_1q_gate(
-                "X",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "X",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.sxx_gate(
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                qs[1] if isinstance(qs, list | tuple) else qs,
-            ),
-        )[-1]
-        or None,
-        "SYY": lambda _s, qs, **_p: backend.syy_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "SYYdg": lambda _s, qs, **_p: (
-            backend.run_1q_gate(
-                "Y",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "Y",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.syy_gate(
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                qs[1] if isinstance(qs, list | tuple) else qs,
-            ),
-        )[-1]
-        or None,
-        "SZZ": lambda _s, qs, **_p: backend.szz_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "SZZdg": lambda _s, qs, **_p: (
-            backend.run_1q_gate(
-                "Z",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "Z",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.szz_gate(
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                qs[1] if isinstance(qs, list | tuple) else qs,
-            ),
-        )[-1]
-        or None,
-        "SWAP": lambda _s, qs, **_p: backend.swap_gate(
-            qs[0] if isinstance(qs, list | tuple) else qs,
-            qs[1] if isinstance(qs, list | tuple) else qs,
-        ),
-        "G": lambda _s, qs, **_p: (
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-            backend.run_1q_gate(
-                "H",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "H",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-        )[-1]
-        or None,
-        "G2": lambda _s, qs, **_p: (
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-            backend.run_1q_gate(
-                "H",
-                qs[0] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_1q_gate(
-                "H",
-                qs[1] if isinstance(qs, list | tuple) else qs,
-                None,
-            ),
-            backend.run_2q_gate("CZ", tuple(qs) if isinstance(qs, list) else qs, None),
-        )[-1]
-        or None,  # G2 maps to same as G since StateVec does this
-    }
diff --git a/python/quantum-pecos/src/pecos/simulators/quest_statevec/state.py b/python/quantum-pecos/src/pecos/simulators/quest_statevec/state.py
deleted file mode 100644
index d10dbd4c1..000000000
--- a/python/quantum-pecos/src/pecos/simulators/quest_statevec/state.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""QuEST state vector simulator implementation.
-
-This module provides the QuestStateVec class, a quantum state vector simulator that uses the QuEST
-(Quantum Exact Simulation Toolkit) library as its backend for efficient quantum circuit simulation.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from pecos_rslib.simulators import QuestStateVec as RustQuestStateVec
-
-import pecos as pc
-from pecos.simulators.quest_statevec.bindings import get_bindings
-
-if TYPE_CHECKING:
-    from pecos import Array
-    from pecos.circuits import QuantumCircuit
-    from pecos.circuits.quantum_circuit import ParamGateCollection
-    from pecos.typing import SimulatorGateParams
-
-
-class QuestStateVec:
-    """QuEST state vector simulator.
-
-    A quantum state vector simulator that uses the QuEST library backend for efficient
-    simulation of arbitrary quantum circuits with full quantum state representation.
-    """
-
-    def __init__(self, num_qubits: int, seed: int | None = None) -> None:
-        """Initializes the QuEST state vector simulator.
-
-        Args:
-            num_qubits (int): The number of qubits in the quantum system.
-            seed (int | None): Optional seed for the random number generator.
-        """
-        self.backend = RustQuestStateVec(num_qubits, seed)
-        self.num_qubits = num_qubits
-        self.bindings = get_bindings(self)
-
-    @property
-    def vector(self) -> Array:
-        """Get the state vector as an Array of complex numbers.
-
-        Returns:
-            Array of complex amplitudes representing the quantum state.
-        """
-        # QuEST stores amplitudes internally - we need to extract them
-        amplitudes = []
-        for i in range(2**self.num_qubits):
-            re, im = self.backend.get_amplitude(i)
-            amplitudes.append(complex(re, im))
-        return pc.array(amplitudes, dtype=pc.dtypes.complex128)
-
-    def reset(self) -> QuestStateVec:
-        """Resets the quantum state to the all-zero state."""
-        self.backend.reset()
-        return self
-
-    def run_gate(
-        self,
-        symbol: str,
-        locations: set[int] | set[tuple[int, ...]],
-        **params: SimulatorGateParams,
-    ) -> dict[int, int]:
-        """Applies a gate to the quantum state.
-
-        Args:
-            symbol (str): The gate symbol (e.g., "X", "H", "CX").
-            locations (set): The qubit(s) to which the gate is applied.
-            params (dict, optional): Parameters for the gate (e.g., rotation angles).
-
-        Returns:
-            Dictionary mapping locations to measurement results.
-        """
-        output = {}
-
-        if params.get("simulate_gate", True) and locations:
-            for location in locations:
-                if params.get("angles") and len(params["angles"]) == 1:
-                    params.update({"angle": params["angles"][0]})
-                elif "angle" in params and "angles" not in params:
-                    params["angles"] = (params["angle"],)
-
-                # Convert list to tuple if needed (for Rust bindings compatibility)
-                loc_to_use = location
-                if isinstance(location, list):
-                    loc_to_use = tuple(location)
-
-                if symbol in self.bindings:
-                    results = self.bindings[symbol](self, loc_to_use, **params)
-                else:
-                    msg = f"Gate {symbol} is not supported in the QuEST simulator."
-                    raise Exception(msg)
-
-                if results is not None:
-                    output[location] = results
-
-        return output
-
-    def run_circuit(
-        self,
-        circuit: QuantumCircuit | ParamGateCollection,
-        removed_locations: set[int] | None = None,
-    ) -> dict[int, int]:
-        """Runs a quantum circuit on the simulator.
-
-        Args:
-            circuit: The quantum circuit to run.
-            removed_locations: Optional set of locations to exclude.
-
-        Returns:
-            Dictionary mapping measurement locations to results.
-        """
-        if removed_locations is None:
-            removed_locations = set()
-
-        output = {}
-        for symbol, locations, params in circuit.items():
-            results = self.run_gate(
-                symbol,
-                locations - removed_locations,
-                **params,
-            )
-            if results:
-                output.update(results)
-
-        return output
-
-    def __repr__(self) -> str:
-        """String representation of the simulator."""
-        return f"QuestStateVec(num_qubits={self.num_qubits})"
-
-    def get_probability(self, index: int) -> float:
-        """Get the probability of a computational basis state.
-
-        Args:
-            index: The basis state index.
-
-        Returns:
-            The probability of the given basis state.
-        """
-        return self.backend.probability(index)
-
-    def get_amplitude(self, index: int) -> complex:
-        """Get the amplitude of a computational basis state.
-
-        Args:
-            index: The basis state index.
-
-        Returns:
-            The complex amplitude of the given basis state.
-        """
-        re, im = self.backend.get_amplitude(index)
-        return complex(re, im)
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/__init__.py b/python/quantum-pecos/src/pecos/simulators/qulacs/__init__.py
deleted file mode 100644
index 058516474..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Qulacs simulator wrapper.
-
-This package provides a wrapper for the Qulacs quantum simulator using a pure Rust backend.
-"""
-
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-from pecos.simulators.qulacs import bindings
-from pecos.simulators.qulacs.state import Qulacs
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/bindings.py b/python/quantum-pecos/src/pecos/simulators/qulacs/bindings.py
deleted file mode 100644
index 37c850ac5..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/bindings.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Gate bindings for Qulacs quantum simulator.
-
-This module provides gate operation bindings for the Qulacs quantum simulator, organizing and exposing
-quantum gate implementations using the pure Rust backend for high performance and thread safety.
-"""
-
-import pecos.simulators.qulacs.gates_one_qubit as one_q
-import pecos.simulators.qulacs.gates_two_qubit as two_q
-from pecos.simulators.qulacs.gates_init import init_one, init_zero
-from pecos.simulators.qulacs.gates_meas import meas_z
-
-# Supporting gates from table:
-#   https://github.com/Quantinuum/phir/blob/main/spec.md#table-ii---quantum-operations
-
-gate_dict = {
-    "Init": init_zero,
-    "Init +Z": init_zero,
-    "Init -Z": init_one,
-    "init |0>": init_zero,
-    "init |1>": init_one,
-    "leak": init_zero,
-    "leak |0>": init_zero,
-    "leak |1>": init_one,
-    "unleak |0>": init_zero,
-    "unleak |1>": init_one,
-    "Measure": meas_z,
-    "measure Z": meas_z,
-    "I": one_q.identity,
-    "X": one_q.X,
-    "Y": one_q.Y,
-    "Z": one_q.Z,
-    "RX": one_q.RX,
-    "RY": one_q.RY,
-    "RZ": one_q.RZ,
-    "R1XY": one_q.R1XY,
-    "RXY1Q": one_q.R1XY,
-    "SX": one_q.SX,
-    "SXdg": one_q.SXdg,
-    "SqrtX": one_q.SX,
-    "SqrtXd": one_q.SXdg,
-    "SY": one_q.SY,
-    "SYdg": one_q.SYdg,
-    "SqrtY": one_q.SY,
-    "SqrtYd": one_q.SYdg,
-    "SZ": one_q.SZ,
-    "SZdg": one_q.SZdg,
-    "SqrtZ": one_q.SZ,
-    "SqrtZd": one_q.SZdg,
-    "H": one_q.H,
-    "F": one_q.F,
-    "Fdg": one_q.Fdg,
-    "T": one_q.T,
-    "Tdg": one_q.Tdg,
-    "CX": two_q.CX,
-    "CY": two_q.CY,
-    "CZ": two_q.CZ,
-    "RXX": two_q.RXX,
-    "RYY": two_q.RYY,
-    "RZZ": two_q.RZZ,
-    "RXXRYYRZZ": two_q.RXXRYYRZZ,
-    "R2XXYYZZ": two_q.RXXRYYRZZ,
-    "SXX": two_q.SXX,
-    "SXXdg": two_q.SXXdg,
-    "SYY": two_q.SYY,
-    "SYYdg": two_q.SYYdg,
-    "SZZ": two_q.SZZ,
-    "SqrtZZ": two_q.SZZ,
-    "SZZdg": two_q.SZZdg,
-    "SWAP": two_q.SWAP,
-    "Q": one_q.SX,
-    "Qd": one_q.SXdg,
-    "R": one_q.SY,
-    "Rd": one_q.SYdg,
-    "S": one_q.SZ,
-    "Sd": one_q.SZdg,
-    "H1": one_q.H,
-    "H2": one_q.H2,
-    "H3": one_q.H3,
-    "H4": one_q.H4,
-    "H5": one_q.H5,
-    "H6": one_q.H6,
-    "H+z+x": one_q.H,
-    "H-z-x": one_q.H2,
-    "H+y-z": one_q.H3,
-    "H-y-z": one_q.H4,
-    "H-x+y": one_q.H5,
-    "H-x-y": one_q.H6,
-    "F1": one_q.F,
-    "F1d": one_q.Fdg,
-    "F2": one_q.F2,
-    "F2d": one_q.F2d,
-    "F3": one_q.F3,
-    "F3d": one_q.F3d,
-    "F4": one_q.F4,
-    "F4d": one_q.F4d,
-    "CNOT": two_q.CX,
-    "G": two_q.G,
-    "II": one_q.identity,
-}
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_init.py b/python/quantum-pecos/src/pecos/simulators/qulacs/gates_init.py
deleted file mode 100644
index 788089d6d..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_init.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Initialization operations for Qulacs simulator.
-
-This module provides quantum state initialization operations for the Qulacs simulator.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from pecos.simulators.qulacs import Qulacs
-    from pecos.typing import SimulatorGateParams
-
-
-def init_zero(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Initialize qubit to |0⟩ state.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit to initialize
-    """
-    # Use PZ gate to project qubit to |0⟩ state
-    state.qulacs_state.run_1q_gate("PZ", qubit)
-
-
-def init_one(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Initialize qubit to |1⟩ state.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit to initialize
-    """
-    # Use PnZ gate to project qubit to |1⟩ state
-    state.qulacs_state.run_1q_gate("PnZ", qubit)
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_meas.py b/python/quantum-pecos/src/pecos/simulators/qulacs/gates_meas.py
deleted file mode 100644
index 2aa79be45..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_meas.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Measurement operations for Qulacs simulator.
-
-This module provides quantum measurement operations for the Qulacs simulator.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from pecos.simulators.qulacs import Qulacs
-    from pecos.typing import SimulatorGateParams
-
-
-def meas_z(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> int:
-    """Measure qubit in Z basis.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit to measure
-
-    Returns:
-        The measurement outcome (0 or 1)
-    """
-    result = state.qulacs_state.run_1q_gate("MZ", qubit)
-    return int(result) if result is not None else 0
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_one_qubit.py b/python/quantum-pecos/src/pecos/simulators/qulacs/gates_one_qubit.py
deleted file mode 100644
index be08d1f83..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_one_qubit.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Single-qubit gate operations for Qulacs simulator.
-
-This module provides single-qubit quantum gate operations for the Qulacs simulator, including Pauli gates,
-rotation gates, Hadamard gates, and other fundamental single-qubit operations using the Rust backend.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from pecos.simulators.qulacs import Qulacs
-    from pecos.typing import SimulatorGateParams
-
-
-def identity(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Identity gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    # Identity gate does nothing
-
-
-def X(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Pauli X gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("X", qubit)
-
-
-def Y(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Pauli Y gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("Y", qubit)
-
-
-def Z(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Pauli Z gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("Z", qubit)
-
-
-def H(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Hadamard gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("H", qubit)
-
-
-def SX(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Square root of X gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("SX", qubit)
-
-
-def SXdg(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Dagger of square root of X gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("SXdg", qubit)
-
-
-def SY(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Square root of Y gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("SY", qubit)
-
-
-def SYdg(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Dagger of square root of Y gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("SYdg", qubit)
-
-
-def SZ(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Square root of Z gate (S gate).
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("SZ", qubit)
-
-
-def SZdg(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """Dagger of square root of Z gate (S† gate).
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("SZdg", qubit)
-
-
-def T(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """T gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("T", qubit)
-
-
-def Tdg(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """T dagger gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-    """
-    state.qulacs_state.run_1q_gate("Tdg", qubit)
-
-
-def RX(
-    state: Qulacs,
-    qubit: int,
-    angles: tuple[float] | list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """Rotation around X axis.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-        angles: A tuple or list containing a single rotation angle in radians
-        **params: Additional parameters, can include 'angle' (float) or 'angles' (list)
-    """
-    # Extract angle from various possible sources for compatibility
-    if angles is not None:
-        # Standard interface: angles as positional parameter (Qulacs compatibility)
-        if hasattr(angles, "__len__"):
-            if len(angles) != 1:
-                msg = "RX gate must be given 1 angle parameter."
-                raise ValueError(msg)
-            angle = angles[0]
-        else:
-            # Allow single float for convenience
-            angle = angles
-    elif "angle" in params:
-        # Qulacs style: angle as keyword parameter
-        angle = params["angle"]
-    elif "angles" in params:
-        # Angles from kwargs
-        angles_param = params["angles"]
-        if hasattr(angles_param, "__len__"):
-            if len(angles_param) != 1:
-                msg = "RX gate must be given 1 angle parameter."
-                raise ValueError(msg)
-            angle = angles_param[0]
-        else:
-            angle = angles_param
-    else:
-        msg = "RX gate requires an 'angle' or 'angles' parameter"
-        raise TypeError(msg)
-
-    state.qulacs_state.run_1q_gate("RX", qubit, {"angle": angle})
-
-
-def RY(
-    state: Qulacs,
-    qubit: int,
-    angles: tuple[float] | list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """Rotation around Y axis.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-        angles: A tuple or list containing a single rotation angle in radians
-        **params: Additional parameters, can include 'angle' (float) or 'angles' (list)
-    """
-    # Extract angle from various possible sources for compatibility
-    if angles is not None:
-        # Standard interface: angles as positional parameter (Qulacs compatibility)
-        if hasattr(angles, "__len__"):
-            if len(angles) != 1:
-                msg = "RY gate must be given 1 angle parameter."
-                raise ValueError(msg)
-            angle = angles[0]
-        else:
-            # Allow single float for convenience
-            angle = angles
-    elif "angle" in params:
-        # Qulacs style: angle as keyword parameter
-        angle = params["angle"]
-    elif "angles" in params:
-        # Angles from kwargs
-        angles_param = params["angles"]
-        if hasattr(angles_param, "__len__"):
-            if len(angles_param) != 1:
-                msg = "RY gate must be given 1 angle parameter."
-                raise ValueError(msg)
-            angle = angles_param[0]
-        else:
-            angle = angles_param
-    else:
-        msg = "RY gate requires an 'angle' or 'angles' parameter"
-        raise TypeError(msg)
-
-    state.qulacs_state.run_1q_gate("RY", qubit, {"angle": angle})
-
-
-def RZ(
-    state: Qulacs,
-    qubit: int,
-    angles: tuple[float] | list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """Rotation around Z axis.
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-        angles: A tuple or list containing a single rotation angle in radians
-        **params: Additional parameters, can include 'angle' (float) or 'angles' (list)
-    """
-    # Extract angle from various possible sources for compatibility
-    if angles is not None:
-        # Standard interface: angles as positional parameter (Qulacs compatibility)
-        if hasattr(angles, "__len__"):
-            if len(angles) != 1:
-                msg = "RZ gate must be given 1 angle parameter."
-                raise ValueError(msg)
-            angle = angles[0]
-        else:
-            # Allow single float for convenience
-            angle = angles
-    elif "angle" in params:
-        # Qulacs style: angle as keyword parameter
-        angle = params["angle"]
-    elif "angles" in params:
-        # Angles from kwargs
-        angles_param = params["angles"]
-        if hasattr(angles_param, "__len__"):
-            if len(angles_param) != 1:
-                msg = "RZ gate must be given 1 angle parameter."
-                raise ValueError(msg)
-            angle = angles_param[0]
-        else:
-            angle = angles_param
-    else:
-        msg = "RZ gate requires an 'angle' or 'angles' parameter"
-        raise TypeError(msg)
-
-    state.qulacs_state.run_1q_gate("RZ", qubit, {"angle": angle})
-
-
-def R1XY(
-    state: Qulacs,
-    qubit: int,
-    angles: tuple[float] | list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """Single-qubit rotation with two angles (experimental).
-
-    Args:
-        state: An instance of Qulacs
-        qubit: The index of the qubit where the gate is applied
-        angles: A tuple or list of two rotation angles
-        **params: Additional parameters, can include 'angles' (list of 2 floats)
-    """
-    # Extract angles from angles parameter or params
-    if angles is not None:
-        if hasattr(angles, "__len__"):
-            if len(angles) < 2:
-                msg = "R1XY gate must be given 2 angle parameters."
-                raise ValueError(msg)
-            angle_list = list(angles[:2])
-        else:
-            msg = "R1XY gate requires a list or tuple of 2 angles."
-            raise ValueError(msg)
-    elif "angles" in params:
-        angles_param = params["angles"]
-        if hasattr(angles_param, "__len__"):
-            if len(angles_param) < 2:
-                msg = "R1XY gate must be given 2 angle parameters."
-                raise ValueError(msg)
-            angle_list = list(angles_param[:2])
-        else:
-            msg = "R1XY gate requires a list or tuple of 2 angles."
-            raise ValueError(msg)
-    else:
-        msg = "R1XY gate requires 'angles' parameter with 2 values."
-        raise TypeError(msg)
-
-    state.qulacs_state.run_1q_gate("R1XY", qubit, {"angles": angle_list})
-
-
-# Additional gate aliases and implementations for compatibility
-
-
-def F(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F gate (F1 gate - qutrit Hadamard projected to 2 levels)."""
-    # F gate has matrix [[1+i, 1-i], [1+i, -1+i]]/2
-    # It's different from SX
-    state.qulacs_state.run_1q_gate("F", qubit)
-
-
-def Fdg(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F dagger gate."""
-    state.qulacs_state.run_1q_gate("Fdg", qubit)
-
-
-# Hadamard variants - these would need specific implementations
-# For now, defaulting to standard Hadamard
-
-
-def H2(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """H2 gate variant."""
-    state.qulacs_state.run_1q_gate("H2", qubit, {})
-
-
-def H3(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """H3 gate variant."""
-    state.qulacs_state.run_1q_gate("H3", qubit, {})
-
-
-def H4(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """H4 gate variant."""
-    state.qulacs_state.run_1q_gate("H4", qubit, {})
-
-
-def H5(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """H5 gate variant."""
-    state.qulacs_state.run_1q_gate("H5", qubit, {})
-
-
-def H6(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """H6 gate variant."""
-    state.qulacs_state.run_1q_gate("H6", qubit, {})
-
-
-# F gate variants - similar to Hadamard variants
-
-
-def F2(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F2 gate variant."""
-    state.qulacs_state.run_1q_gate("F2", qubit, {})
-
-
-def F2d(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F2 dagger gate variant."""
-    state.qulacs_state.run_1q_gate("F2dg", qubit, {})
-
-
-def F3(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F3 gate variant."""
-    state.qulacs_state.run_1q_gate("F3", qubit, {})
-
-
-def F3d(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F3 dagger gate variant."""
-    state.qulacs_state.run_1q_gate("F3dg", qubit, {})
-
-
-def F4(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F4 gate variant."""
-    state.qulacs_state.run_1q_gate("F4", qubit, {})
-
-
-def F4d(state: Qulacs, qubit: int, **_params: SimulatorGateParams) -> None:
-    """F4 dagger gate variant."""
-    state.qulacs_state.run_1q_gate("F4dg", qubit, {})
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_two_qubit.py b/python/quantum-pecos/src/pecos/simulators/qulacs/gates_two_qubit.py
deleted file mode 100644
index 305922d51..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/gates_two_qubit.py
+++ /dev/null
@@ -1,489 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Two-qubit gate operations for Qulacs simulator.
-
-This module provides two-qubit quantum gate operations for the Qulacs simulator, including CNOT, CZ, SWAP,
-and other two-qubit operations using the Rust backend.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from pecos.simulators.qulacs import Qulacs
-    from pecos.typing import SimulatorGateParams
-
-
-def CX(
-    state: Qulacs,
-    control: int | tuple[int, int] | list[int],
-    target: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """CNOT gate (controlled X gate).
-
-    Args:
-        state: An instance of Qulacs
-        control: Control qubit index, or tuple/list of (control, target)
-        target: Target qubit index (if control is just an int)
-    """
-    # Handle both calling conventions
-    if target is None:
-        # Called with tuple/list: CX(state, (control, target))
-        if isinstance(control, tuple | list):
-            qubits = tuple(control)
-        else:
-            msg = "CX requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: CX(state, control, target)
-        qubits = (control, target)
-
-    state.qulacs_state.run_2q_gate("CX", qubits, None)
-
-
-def CY(
-    state: Qulacs,
-    control: int | tuple[int, int] | list[int],
-    target: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """Controlled Y gate.
-
-    Args:
-        state: An instance of Qulacs
-        control: Control qubit index, or tuple/list of (control, target)
-        target: Target qubit index (if control is just an int)
-    """
-    # Handle both calling conventions
-    if target is None:
-        # Called with tuple/list: CY(state, (control, target))
-        if isinstance(control, tuple | list):
-            qubits = tuple(control)
-        else:
-            msg = "CY requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: CY(state, control, target)
-        qubits = (control, target)
-
-    state.qulacs_state.run_2q_gate("CY", qubits, None)
-
-
-def CZ(
-    state: Qulacs,
-    control: int | tuple[int, int] | list[int],
-    target: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """Controlled Z gate.
-
-    Args:
-        state: An instance of Qulacs
-        control: Control qubit index, or tuple/list of (control, target)
-        target: Target qubit index (if control is just an int)
-    """
-    # Handle both calling conventions
-    if target is None:
-        # Called with tuple/list: CZ(state, (control, target))
-        if isinstance(control, tuple | list):
-            qubits = tuple(control)
-        else:
-            msg = "CZ requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: CZ(state, control, target)
-        qubits = (control, target)
-
-    state.qulacs_state.run_2q_gate("CZ", qubits, None)
-
-
-def SWAP(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SWAP gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SWAP(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SWAP requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SWAP(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SWAP", qubits, None)
-
-
-def RXX(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    angles: list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """RXX gate (two-qubit X rotation).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-        angles: List containing a single rotation angle in radians
-        **params: Additional parameters, can include 'angle' (float) or 'angles' (list)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: RXX(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "RXX requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: RXX(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    # Extract angle from angles parameter or params
-    if angles is not None and len(angles) > 0:
-        angle = angles[0]
-    elif "angles" in params and len(params["angles"]) > 0:
-        angle = params["angles"][0]
-    else:
-        angle = 0.0
-
-    state.qulacs_state.run_2q_gate("RXX", qubits, {"angle": angle})
-
-
-def RYY(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    angles: list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """RYY gate (two-qubit Y rotation).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-        angles: List containing a single rotation angle in radians
-        **params: Additional parameters, can include 'angle' (float) or 'angles' (list)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: RYY(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "RYY requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: RYY(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    # Extract angle from angles parameter or params
-    if angles is not None and len(angles) > 0:
-        angle = angles[0]
-    elif "angles" in params and len(params["angles"]) > 0:
-        angle = params["angles"][0]
-    else:
-        angle = 0.0
-
-    state.qulacs_state.run_2q_gate("RYY", qubits, {"angle": angle})
-
-
-def RZZ(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    angles: list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """RZZ gate (two-qubit Z rotation).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-        angles: List containing a single rotation angle in radians
-        **params: Additional parameters, can include 'angle' (float) or 'angles' (list)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: RZZ(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "RZZ requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: RZZ(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    # Extract angle from angles parameter or params
-    if angles is not None and len(angles) > 0:
-        angle = angles[0]
-    elif "angles" in params and len(params["angles"]) > 0:
-        angle = params["angles"][0]
-    else:
-        angle = 0.0
-
-    state.qulacs_state.run_2q_gate("RZZ", qubits, {"angle": angle})
-
-
-def RXXRYYRZZ(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    angles: list[float] | None = None,
-    **params: SimulatorGateParams,
-) -> None:
-    """Combined RXX, RYY, RZZ rotation gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-        angles: List of three angles for ZZ, YY, XX rotations (in that order)
-        **params: Additional parameters, can include 'angles' (list of 3 floats)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: RXXRYYRZZ(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "RXXRYYRZZ requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: RXXRYYRZZ(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    # Extract angles from angles parameter or params
-    if angles is not None and len(angles) >= 3:
-        angle_list = angles[:3]
-    elif "angles" in params and len(params["angles"]) >= 3:
-        angle_list = params["angles"][:3]
-    else:
-        angle_list = [0.0, 0.0, 0.0]
-
-    # Apply RXX, RYY, RZZ in order (note the order matches RXXRYYRZZ)
-    state.qulacs_state.run_2q_gate("RXXRYYRZZ", qubits, {"angles": angle_list})
-
-
-def SXX(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SXX gate (square root of XX).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SXX(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SXX requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SXX(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SXX", qubits, None)
-
-
-def SXXdg(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SXX dagger gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SXXdg(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SXXdg requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SXXdg(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SXXdg", qubits, None)
-
-
-def SYY(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SYY gate (square root of YY).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SYY(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SYY requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SYY(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SYY", qubits, None)
-
-
-def SYYdg(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SYY dagger gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SYYdg(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SYYdg requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SYYdg(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SYYdg", qubits, None)
-
-
-def SZZ(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SZZ gate (square root of ZZ).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SZZ(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SZZ requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SZZ(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SZZ", qubits, None)
-
-
-def SZZdg(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """SZZ dagger gate.
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: SZZdg(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "SZZdg requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: SZZdg(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("SZZdg", qubits, None)
-
-
-def G(
-    state: Qulacs,
-    qubit1: int | tuple[int, int] | list[int],
-    qubit2: int | None = None,
-    **_params: SimulatorGateParams,
-) -> None:
-    """G gate (special two-qubit gate).
-
-    Args:
-        state: An instance of Qulacs
-        qubit1: First qubit index, or tuple/list of both qubits
-        qubit2: Second qubit index (if qubit1 is just an int)
-    """
-    # Handle both calling conventions
-    if qubit2 is None:
-        # Called with tuple/list: G(state, (qubit1, qubit2))
-        if isinstance(qubit1, tuple | list):
-            qubits = tuple(qubit1)
-        else:
-            msg = "G requires two qubits"
-            raise ValueError(msg)
-    else:
-        # Called with separate args: G(state, qubit1, qubit2)
-        qubits = (qubit1, qubit2)
-
-    state.qulacs_state.run_2q_gate("G2", qubits, None)
diff --git a/python/quantum-pecos/src/pecos/simulators/qulacs/state.py b/python/quantum-pecos/src/pecos/simulators/qulacs/state.py
deleted file mode 100644
index 52c02953e..000000000
--- a/python/quantum-pecos/src/pecos/simulators/qulacs/state.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Quantum state representation for Qulacs simulator.
-
-This module provides quantum state representation and management for the Qulacs simulator, including state vector
-storage and manipulation using a pure Rust backend for high performance and thread safety.
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from pecos_rslib import simulators as rslib_sim
-
-import pecos as pc
-from pecos.simulators.qulacs import bindings
-from pecos.simulators.sim_class_types import StateVector
-
-if TYPE_CHECKING:
-    from pecos import Array
-
-
-class Qulacs(StateVector):
-    """Wrapper of Qulacs state vector simulator using pure Rust backend."""
-
-    def __init__(self, num_qubits: int, *, seed: int | None = None) -> None:
-        """Initializes the state vector.
-
-        Args:
-            num_qubits (int): Number of qubits being represented.
-            seed (int, optional): Random seed for deterministic behavior.
-        """
-        if not isinstance(num_qubits, int):
-            msg = "``num_qubits`` should be of type ``int``."
-            raise TypeError(msg)
-
-        super().__init__()
-
-        self.bindings = bindings.gate_dict
-        self.num_qubits = num_qubits
-        self.qulacs_state = rslib_sim.Qulacs(num_qubits, seed=seed)
-
-        self.reset()
-
-    def reset(self) -> Qulacs:
-        """Reset the quantum state for another run without reinitializing."""
-        # Initialize state vector to |0>
-        self.qulacs_state.reset()
-        return self
-
-    @property
-    def vector(self) -> Array:
-        """Get the quantum state vector from Qulacs.
-
-        Returns:
-            The state vector as a PECOS array with complex values.
-        """
-        # Convert from [(real, imag), ...] tuples to complex array
-        complex_tuples = self.qulacs_state.vector
-        return pc.array(
-            [complex(real, imag) for real, imag in complex_tuples],
-            dtype="complex",
-        )
-
-    @property
-    def probabilities(self) -> list[float]:
-        """Get the probability distribution over all basis states.
-
-        Returns:
-            List of probabilities for each computational basis state.
-        """
-        return self.qulacs_state.probabilities
diff --git a/python/quantum-pecos/tests/numpy_compatibility/test_qulacs_numpy.py b/python/quantum-pecos/tests/numpy_compatibility/test_qulacs_numpy.py
deleted file mode 100644
index a624f912c..000000000
--- a/python/quantum-pecos/tests/numpy_compatibility/test_qulacs_numpy.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""NumPy compatibility tests for Qulacs simulator."""
-
-import pytest
-
-# Skip entire module if numpy not available
-pytest.importorskip("numpy")
-
-import numpy as np
-import pecos as pc
-
-pytest.importorskip("pecos_rslib", reason="pecos_rslib required for qulacs tests")
-
-from pecos.simulators.qulacs import Qulacs
-
-# Mark all tests in this module as requiring numpy
-pytestmark = pytest.mark.numpy
-
-
-class TestQulacsNumpyCompatibility:
-    """Test compatibility with NumPy array operations."""
-
-    def test_numpy_array_conversion(self) -> None:
-        """Test that PECOS arrays can be converted to NumPy arrays."""
-        sim = Qulacs(2)
-
-        state = sim.vector
-
-        # Should be numpy-compatible (Array implements buffer protocol)
-        # Can convert to numpy array via np.asarray
-        state_np = np.asarray(state)
-        assert isinstance(state_np, np.ndarray)
-
-        # Should have complex dtype
-        assert np.iscomplexobj(state_np)
-
-        # Should be normalized
-        norm = np.sum(abs(state_np) ** 2)
-        assert pc.isclose(norm, 1.0, rtol=1e-5, atol=1e-8)
-
-        # Should support numpy operations
-        probabilities = abs(state_np) ** 2
-        assert isinstance(probabilities, np.ndarray)
-        assert probabilities.dtype == float
-
-    def test_numpy_sum_with_pecos_arrays(self) -> None:
-        """Test that np.sum works on PECOS arrays."""
-        sim = Qulacs(2)
-
-        # Prepare |10⟩ and swap to |01⟩
-        sim.bindings["X"](sim, 0)  # |10⟩
-        sim.bindings["SWAP"](sim, 0, 1)  # Should become |01⟩
-
-        # Check that exactly one basis state has probability 1
-        probs = pc.abs(sim.vector) ** 2
-        assert np.sum(probs > 0.5) == 1  # Exactly one state should be populated
-
-    def test_numpy_operations_preserve_normalization(self) -> None:
-        """Test that state normalization is preserved after NumPy operations."""
-        sim = Qulacs(3)
-
-        # Apply various gates
-        sim.bindings["H"](sim, 0)
-        sim.bindings["CX"](sim, 0, 1)
-        sim.bindings["RY"](sim, 2, angle=pc.f64.frac_pi_4)
-        sim.bindings["CZ"](sim, 1, 2)
-        sim.bindings["T"](sim, 0)
-
-        # Check normalization using NumPy
-        state = sim.vector
-        norm_squared = np.sum(abs(state) ** 2)
-        assert pc.isclose(norm_squared, 1.0, rtol=0.0, atol=1e-10)
diff --git a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_densitymatrix.py b/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_densitymatrix.py
deleted file mode 100644
index 5940cc593..000000000
--- a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_densitymatrix.py
+++ /dev/null
@@ -1,411 +0,0 @@
-# Copyright 2024 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Integration tests for density matrix quantum simulators.
-
-These tests focus on features unique to density matrix simulators such as:
-- Mixed state preparation and evolution
-- Decoherence and noise channels
-- Density matrix purity calculations
-- Partial trace operations
-- Non-unitary operations
-"""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from collections.abc import Callable
-
-    from pecos.simulators.sim_class_types import DensityMatrix
-
-import pytest
-from pecos.circuits import QuantumCircuit
-from pecos.noise.generic_error_model import GenericErrorModel
-from pecos.simulators import QuestDensityMatrix
-
-# Dictionary mapping simulator names to classes
-str_to_sim = {
-    "QuestDensityMatrix": QuestDensityMatrix,
-    # Add other density matrix simulators here as they become available
-}
-
-
-def check_dependencies(simulator: str) -> Callable[[int], DensityMatrix]:
-    """Check if dependencies for a simulator are available and skip test if not."""
-    if simulator not in str_to_sim or str_to_sim[simulator] is None:
-        pytest.skip(f"Requirements to test {simulator} are not met.")
-    return str_to_sim[simulator]
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_init_pure_state(simulator: str) -> None:
-    """Test initialization of a pure state density matrix."""
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=2)
-
-    # Initial state should be |00⟩⟨00|
-    # Check that the density matrix represents a pure state
-    # For now, we'll just verify the simulator initializes without error
-    assert sim is not None
-    assert hasattr(sim, "backend")
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_single_qubit_gates(simulator: str) -> None:
-    """Test single-qubit gates on density matrices."""
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=1)
-
-    # Apply X gate: should transform |0⟩⟨0| to |1⟩⟨1|
-    sim.run_gate("X", {0})
-
-    # Apply H gate to create a mixed state
-    sim.run_gate("H", {0})
-
-    # Reset and apply Y gate
-    sim.reset()
-    sim.run_gate("Y", {0})
-
-    # Reset and apply Z gate
-    sim.reset()
-    sim.run_gate("Z", {0})
-
-    assert sim is not None
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_two_qubit_gates(simulator: str) -> None:
-    """Test two-qubit gates on density matrices."""
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=2)
-
-    # Test CNOT gate
-    sim.run_gate("X", {0})  # Set control to |1⟩
-    sim.run_gate("CNOT", {(0, 1)})  # Should flip target
-
-    # Reset and test CZ gate
-    sim.reset()
-    sim.run_gate("H", {0})
-    sim.run_gate("H", {1})
-    sim.run_gate("CZ", {(0, 1)})
-
-    assert sim is not None
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_measurement(simulator: str) -> None:
-    """Test measurement operations on density matrices."""
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=2, seed=42)
-
-    # Prepare Bell state |Φ+⟩ = (|00⟩ + |11⟩)/√2
-    sim.run_gate("H", {0})
-    sim.run_gate("CNOT", {(0, 1)})
-
-    # Measure first qubit
-    result_dict = sim.run_gate("MZ", {0})
-    result = result_dict[0]  # Extract result for qubit 0
-    assert result in [0, 1]
-
-    # After measuring first qubit, second should be correlated
-    result2_dict = sim.run_gate("MZ", {1})
-    result2 = result2_dict[1]  # Extract result for qubit 1
-    # In a Bell state, measurements should be correlated
-    # But after first measurement, the state collapses
-    assert result2 in [0, 1]
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_reset_operation(simulator: str) -> None:
-    """Test reset operation on density matrices."""
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=2)
-
-    # Apply some gates
-    sim.run_gate("X", {0})
-    sim.run_gate("H", {1})
-
-    # Reset to |00⟩⟨00|
-    sim.reset()
-
-    # After reset, measurements should give 0
-    result0_dict = sim.run_gate("MZ", {0})
-    result1_dict = sim.run_gate("MZ", {1})
-
-    assert result0_dict[0] == 0
-    assert result1_dict[1] == 0
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_mixed_state_preparation(simulator: str) -> None:
-    """Test preparation and evolution of mixed states.
-
-    Mixed states are unique to density matrix simulators and
-    cannot be represented by pure state vector simulators.
-    """
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=1, seed=42)
-
-    # Create maximally mixed state by applying depolarizing channel
-    # For now, we'll create a pseudo-mixed state using measurements
-    # A true implementation would use noise channels
-
-    # Prepare superposition
-    sim.run_gate("H", {0})
-
-    # Measure (collapses to mixed state from perspective of ensemble)
-    result_dict = sim.run_gate("MZ", {0})
-    assert result_dict[0] in [0, 1]
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_entangled_state(simulator: str) -> None:
-    """Test creation and manipulation of entangled states in density matrix form."""
-    sim_class = check_dependencies(simulator)
-    sim = sim_class(num_qubits=3)
-
-    # Create GHZ state |000⟩ + |111⟩
-    sim.run_gate("H", {0})
-    sim.run_gate("CNOT", {(0, 1)})
-    sim.run_gate("CNOT", {(1, 2)})
-
-    # The density matrix should represent the GHZ state
-    # Measurements should give either 000 or 111
-    results = []
-    for _ in range(10):
-        sim.reset()
-        sim.run_gate("H", {0})
-        sim.run_gate("CNOT", {(0, 1)})
-        sim.run_gate("CNOT", {(1, 2)})
-
-        r0_dict = sim.run_gate("MZ", {0})
-        r1_dict = sim.run_gate("MZ", {1})
-        r2_dict = sim.run_gate("MZ", {2})
-
-        # Extract results (handle potential missing keys)
-        r0 = r0_dict.get(0, 0) if r0_dict else 0
-        r1 = r1_dict.get(1, 0) if r1_dict else 0
-        r2 = r2_dict.get(2, 0) if r2_dict else 0
-
-        # In GHZ state, all measurements should be equal
-        assert r0 == r1 == r2
-        results.append((r0, r1, r2))
-
-    # Should see both 000 and 111 outcomes
-    assert (0, 0, 0) in results or (1, 1, 1) in results
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_circuit_execution(simulator: str) -> None:
-    """Test execution of a quantum circuit using density matrix simulator."""
-    sim_class = check_dependencies(simulator)
-
-    qc = QuantumCircuit()
-    qc.append({"Init": {0, 1, 2}})
-    qc.append({"H": {0}})
-    qc.append({"CNOT": {(0, 1)}})
-    qc.append({"H": {2}})
-    qc.append({"CZ": {(1, 2)}})
-    qc.append({"measure": {0, 1, 2}})
-
-    sim = sim_class(num_qubits=3, seed=42)
-
-    # Execute circuit operations
-    for gate_name, locations, _params in qc:
-        if gate_name == "Init":
-            sim.reset()
-        elif gate_name == "measure":
-            for q in locations:
-                sim.run_gate("MZ", {q})
-        elif gate_name in ["CNOT", "CZ"]:
-            # Two-qubit gates - locations is a set of tuples
-            for qubit_pair in locations:
-                sim.run_gate(gate_name, {qubit_pair})
-        else:
-            # Single-qubit gates - locations is a set of integers
-            for q in locations:
-                sim.run_gate(gate_name, {q})
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_hybrid_engine_integration(simulator: str) -> None:
-    """Test integration with HybridEngine for noisy circuit simulation.
-
-    This is particularly relevant for density matrix simulators as they
-    can naturally represent noisy quantum operations.
-    """
-    sim_class = check_dependencies(simulator)
-
-    # Create a simple circuit
-    qc = QuantumCircuit()
-    qc.append({"Init": {0, 1}})
-    qc.append({"H": {0}})
-    qc.append({"CNOT": {(0, 1)}})
-    qc.append({"measure": {0, 1}})
-
-    # Add noise model
-    _generic_errors = GenericErrorModel(
-        error_params={
-            "p1": 1e-2,  # Single-qubit gate error
-            "p2": 1e-2,  # Two-qubit gate error
-            "p_meas": 1e-2,  # Measurement error
-            "p_init": 1e-3,  # Initialization error
-            "p1_error_model": {
-                "X": 0.25,
-                "Y": 0.25,
-                "Z": 0.25,
-                "L": 0.25,  # Leakage
-            },
-        },
-    )
-
-    # For now, we'll just verify the simulator can be instantiated
-    # Full integration would require HybridEngine support for density matrix sims
-    sim = sim_class(num_qubits=2, seed=42)
-    assert sim is not None
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_seed_reproducibility(simulator: str) -> None:
-    """Test that setting seed produces reproducible results."""
-    sim_class = check_dependencies(simulator)
-
-    # Create two simulators with same seed
-    sim1 = sim_class(num_qubits=2, seed=12345)
-    sim2 = sim_class(num_qubits=2, seed=12345)
-
-    # Apply same operations
-    for sim in [sim1, sim2]:
-        sim.run_gate("H", {0})
-        sim.run_gate("CNOT", {(0, 1)})
-
-    # Measurements should be identical with same seed
-    results1 = []
-    results2 = []
-
-    for _ in range(5):
-        # Reset and prepare same state
-        for sim in [sim1, sim2]:
-            sim.reset()
-            sim.run_gate("H", {0})
-            sim.run_gate("CNOT", {(0, 1)})
-
-        r1_dict = sim1.run_gate("MZ", {0})
-        r2_dict = sim2.run_gate("MZ", {0})
-        results1.append(r1_dict.get(0, 0) if r1_dict else 0)
-        results2.append(r2_dict.get(0, 0) if r2_dict else 0)
-
-    # Note: Due to QuEST's global singleton environment, simulators share RNG state
-    # so interleaved measurements won't be identical even with same seed
-    # Instead, we just verify that measurements are valid (0 or 1)
-    assert all(r in [0, 1] for r in results1)
-    assert all(r in [0, 1] for r in results2)
-
-
-@pytest.mark.parametrize(
-    "simulator",
-    [
-        "QuestDensityMatrix",
-    ],
-)
-def test_large_circuit(simulator: str) -> None:
-    """Test execution of larger circuits with density matrix simulator."""
-    sim_class = check_dependencies(simulator)
-
-    num_qubits = 5
-    sim = sim_class(num_qubits=num_qubits, seed=42)
-
-    # Create a more complex circuit
-    # Layer of Hadamards
-    for i in range(num_qubits):
-        sim.run_gate("H", {i})
-
-    # Layer of CNOTs
-    for i in range(num_qubits - 1):
-        sim.run_gate("CNOT", {(i, i + 1)})
-
-    # Layer of Z gates (S gate not in bindings yet)
-    for i in range(num_qubits):
-        sim.run_gate("Z", {i})
-
-    # Another layer of Hadamards
-    for i in range(num_qubits):
-        sim.run_gate("H", {i})
-
-    # Measure all qubits
-    results = [sim.run_gate("MZ", {i})[i] for i in range(num_qubits)]
-
-    # Verify we got valid measurement results
-    assert all(r in [0, 1] for r in results)
-    assert len(results) == num_qubits
-
-
-# Future test ideas for when more features are implemented:
-# - test_decoherence_channels: Test T1/T2 decoherence
-# - test_kraus_operators: Test application of general Kraus operators
-# - test_partial_trace: Test tracing out subsystems
-# - test_purity_calculation: Test purity and entropy calculations
-# - test_fidelity: Test fidelity between density matrices
-# - test_noise_channels: Test depolarizing, amplitude damping, etc.
-# - test_process_tomography: Test process characterization
-# - test_mixed_unitary_channels: Test probabilistic unitary operations
diff --git a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_quest_seed_determinism.py b/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_quest_seed_determinism.py
deleted file mode 100644
index 531c4fd96..000000000
--- a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_quest_seed_determinism.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Tests for QuEST seed determinism and randomness."""
-
-import pytest
-from pecos.engines.hybrid_engine import HybridEngine
-from pecos.noise.generic_error_model import GenericErrorModel
-
-PHIR_BELL_STATE = """{
-    "format": "PHIR/JSON",
-    "version": "0.1.0",
-    "ops": [
-        {"data": "qvar_define", "data_type": "qubits", "variable": "q", "size": 2},
-        {"data": "cvar_define", "data_type": "i64", "variable": "c", "size": 2},
-        {"qop": "H", "angles": null, "args": [["q", 0]]},
-        {"qop": "CX", "angles": null, "args": [[["q", 0], ["q", 1]]]},
-        {"qop": "Measure", "returns": [["c", 0]], "args": [["q", 0]]},
-        {"qop": "Measure", "returns": [["c", 1]], "args": [["q", 1]]}
-    ]
-}"""
-
-ERROR_MODEL = GenericErrorModel(
-    error_params={
-        "p1": 2e-1,
-        "p2": 2e-1,
-        "p_meas": 2e-1,
-        "p_init": 1e-1,
-        "p1_error_model": {"X": 0.25, "Y": 0.25, "Z": 0.25, "L": 0.25},
-    },
-)
-
-
-@pytest.mark.parametrize("qsim", ["QuestStateVec", "QuestDensityMatrix"])
-def test_measurement_determinism_with_use_seed(qsim: str) -> None:
-    """Test that measurements are deterministic when using use_seed()."""
-    seed = 42
-    shots = 100
-
-    # Run first simulation
-    engine1 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    engine1.use_seed(seed)
-    results1 = engine1.run(PHIR_BELL_STATE, shots=shots)
-
-    # Run second simulation with same seed
-    engine2 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    engine2.use_seed(seed)
-    results2 = engine2.run(PHIR_BELL_STATE, shots=shots)
-
-    # Results should be identical
-    assert results1["c"] == results2["c"], f"{qsim}: Same seed should produce identical measurement results"
-
-
-@pytest.mark.parametrize("qsim", ["QuestStateVec", "QuestDensityMatrix"])
-def test_measurement_determinism_with_seed_parameter(qsim: str) -> None:
-    """Test that measurements are deterministic when passing seed to run()."""
-    seed = 123
-    shots = 100
-
-    # Run first simulation
-    engine1 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    results1 = engine1.run(PHIR_BELL_STATE, shots=shots, seed=seed)
-
-    # Run second simulation with same seed
-    engine2 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    results2 = engine2.run(PHIR_BELL_STATE, shots=shots, seed=seed)
-
-    # Results should be identical
-    assert results1["c"] == results2["c"], f"{qsim}: Same seed parameter should produce identical results"
-
-
-@pytest.mark.parametrize("qsim", ["QuestStateVec", "QuestDensityMatrix"])
-def test_different_seeds_produce_different_results(qsim: str) -> None:
-    """Test that different seeds produce different measurement outcomes."""
-    shots = 100
-
-    # Run with seed 1
-    engine1 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    engine1.use_seed(12345)
-    results1 = engine1.run(PHIR_BELL_STATE, shots=shots)
-
-    # Run with seed 2
-    engine2 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    engine2.use_seed(67890)
-    results2 = engine2.run(PHIR_BELL_STATE, shots=shots)
-
-    # Results should be different (with very high probability)
-    assert results1["c"] != results2["c"], f"{qsim}: Different seeds should produce different results"
-
-
-@pytest.mark.parametrize("qsim", ["QuestStateVec", "QuestDensityMatrix"])
-def test_randomness_without_seed(qsim: str) -> None:
-    """Test that measurements show randomness when no seed is set.
-
-    Note: This test has a small probability of false failure if random outcomes
-    happen to match by chance.
-    """
-    shots = 50
-    num_trials = 5
-
-    all_results = []
-    for _ in range(num_trials):
-        engine = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-        # No seed set - should be random
-        results = engine.run(PHIR_BELL_STATE, shots=shots)
-        all_results.append(results["c"])
-
-    # Check that not all results are identical
-    # Probability of all being the same is astronomically small
-    all_same = all(results == all_results[0] for results in all_results)
-    assert not all_same, f"{qsim}: Unseeded simulations should show randomness"
-
-
-@pytest.mark.parametrize("qsim", ["QuestStateVec", "QuestDensityMatrix"])
-def test_seed_produces_reproducible_error_patterns(qsim: str) -> None:
-    """Test that error patterns are reproducible with seeds."""
-    seed = 999
-    shots = 200
-
-    # Run first simulation
-    engine1 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    results1 = engine1.run(PHIR_BELL_STATE, shots=shots, seed=seed)
-
-    # Run second simulation with same seed
-    engine2 = HybridEngine(qsim=qsim, error_model=ERROR_MODEL)
-    results2 = engine2.run(PHIR_BELL_STATE, shots=shots, seed=seed)
-
-    # Count outcomes
-    count_00_1 = sum(1 for x in results1["c"] if x == "00")
-    count_11_1 = sum(1 for x in results1["c"] if x == "11")
-    count_00_2 = sum(1 for x in results2["c"] if x == "00")
-    count_11_2 = sum(1 for x in results2["c"] if x == "11")
-
-    # Exact counts should match (not just distributions)
-    assert count_00_1 == count_00_2, f"{qsim}: Exact outcome counts should match with same seed"
-    assert count_11_1 == count_11_2, f"{qsim}: Exact outcome counts should match with same seed"
-    assert results1["c"] == results2["c"], f"{qsim}: Full sequences should be identical"
diff --git a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_qulacs.py b/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_qulacs.py
deleted file mode 100644
index cbc31971f..000000000
--- a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_qulacs.py
+++ /dev/null
@@ -1,346 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Tests for Qulacs simulator."""
-
-import warnings
-
-import numpy as np
-import pecos as pc
-import pytest
-
-pytest.importorskip("pecos_rslib", reason="pecos_rslib required for qulacs tests")
-
-from pecos.simulators.qulacs import Qulacs
-
-
-class TestQulacsBasic:
-    """Basic functionality tests for Qulacs simulator."""
-
-    def test_initialization(self) -> None:
-        """Test simulator initialization."""
-        sim = Qulacs(3)
-        assert sim.num_qubits == 3
-
-        # Check initial state is |000⟩
-        state = sim.vector
-        assert state.shape == (8,)
-        assert pc.isclose(pc.abs(state[0]) ** 2, 1.0, rtol=1e-5, atol=1e-8)
-        for i in range(1, 8):
-            assert pc.isclose(pc.abs(state[i]) ** 2, 0.0, rtol=1e-5, atol=1e-8)
-
-    def test_initialization_with_seed(self) -> None:
-        """Test simulator initialization with deterministic seed."""
-        sim1 = Qulacs(2, seed=42)
-        sim2 = Qulacs(2, seed=42)
-
-        # Apply some gates and measure
-        sim1.bindings["H"](sim1, 0)
-        sim2.bindings["H"](sim2, 0)
-
-        # States should be identical
-        assert pc.allclose(sim1.vector, sim2.vector)
-
-    def test_reset(self) -> None:
-        """Test state reset functionality."""
-        sim = Qulacs(2)
-
-        # Apply some gates
-        sim.bindings["H"](sim, 0)
-        sim.bindings["CX"](sim, 0, 1)
-
-        # Reset should return to |00⟩
-        sim.reset()
-        expected = pc.zeros(4, dtype="complex")
-        expected[0] = 1.0
-
-        assert pc.allclose(sim.vector, expected)
-
-
-class TestQulacsSingleQubitGates:
-    """Test single-qubit gate operations."""
-
-    def test_pauli_gates(self) -> None:
-        """Test Pauli X, Y, Z gates."""
-        sim = Qulacs(1)
-
-        # Test X gate: X|0⟩ = |1⟩
-        sim.bindings["X"](sim, 0)
-        expected = pc.array([0, 1], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-        # Test X again: X|1⟩ = |0⟩
-        sim.bindings["X"](sim, 0)
-        expected = pc.array([1, 0], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-        # Test Y gate: Y|0⟩ = i|1⟩
-        sim.reset()
-        sim.bindings["Y"](sim, 0)
-        expected = pc.array([0, 1j], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-        # Test Z gate on |+⟩ state
-        sim.reset()
-        sim.bindings["H"](sim, 0)  # Create |+⟩
-        sim.bindings["Z"](sim, 0)  # Z|+⟩ = |-⟩
-        sim.bindings["H"](sim, 0)  # H|-⟩ = |1⟩
-        expected = pc.array([0, 1], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-    def test_hadamard_gate(self) -> None:
-        """Test Hadamard gate."""
-        sim = Qulacs(1)
-
-        # H|0⟩ = |+⟩ = (|0⟩ + |1⟩)/√2
-        sim.bindings["H"](sim, 0)
-        expected = pc.array([1 / pc.sqrt(2), 1 / pc.sqrt(2)], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-        # H|1⟩ = |-⟩ = (|0⟩ - |1⟩)/√2
-        sim.reset()
-        sim.bindings["X"](sim, 0)
-        sim.bindings["H"](sim, 0)
-        expected = pc.array([1 / pc.sqrt(2), -1 / pc.sqrt(2)], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-    def test_phase_gates(self) -> None:
-        """Test S and T gates."""
-        sim = Qulacs(1)
-
-        # Test S gate: S|+⟩ = |i⟩ = (|0⟩ + i|1⟩)/√2
-        sim.bindings["H"](sim, 0)  # |+⟩
-        sim.bindings["SZ"](sim, 0)  # S gate
-        expected_phase = 1j
-        state = sim.vector
-        phase_ratio = state[1] / state[0]
-        # Suppress ComplexWarning from NumPy when comparing complex numbers
-        # This is expected behavior - our isclose handles complex correctly
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", category=np.exceptions.ComplexWarning)
-            assert pc.isclose(phase_ratio, expected_phase, rtol=0.0, atol=1e-10)
-
-        # Test T gate
-        sim.reset()
-        sim.bindings["H"](sim, 0)
-        sim.bindings["T"](sim, 0)
-        state = sim.vector
-        expected_t_phase = pc.exp(1j * pc.f64.frac_pi_4)
-        phase_ratio = state[1] / state[0]
-        # Suppress ComplexWarning from NumPy when comparing complex numbers
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", category=np.exceptions.ComplexWarning)
-            assert pc.isclose(phase_ratio, expected_t_phase, rtol=0.0, atol=1e-10)
-
-    def test_rotation_gates(self) -> None:
-        """Test rotation gates RX, RY, RZ."""
-        sim = Qulacs(1)
-
-        # Test RX(π) = -iX
-        sim.bindings["RX"](sim, 0, angle=pc.f64.pi)
-        state = sim.vector
-        assert pc.isclose(state[0], 0, rtol=0.0, atol=1e-10)
-        assert pc.isclose(state[1], -1j, rtol=0.0, atol=1e-10)
-
-        # Test RY(π/2) creates equal superposition
-        sim.reset()
-        sim.bindings["RY"](sim, 0, angle=pc.f64.frac_pi_2)
-        state = sim.vector
-        assert pc.isclose(pc.abs(state[0]), 1 / pc.sqrt(2), rtol=0.0, atol=1e-10)
-        assert pc.isclose(pc.abs(state[1]), 1 / pc.sqrt(2), rtol=0.0, atol=1e-10)
-
-        # Test RZ(π) on |+⟩
-        sim.reset()
-        sim.bindings["H"](sim, 0)  # Create |+⟩
-        sim.bindings["RZ"](sim, 0, angle=pc.f64.pi)
-        sim.bindings["H"](sim, 0)  # Should give |1⟩ (possibly with phase)
-        state = sim.vector
-        # Check that qubit is effectively in |1⟩ state (allowing for global phase)
-        assert pc.isclose(pc.abs(state[0]), 0, rtol=0.0, atol=1e-10)
-        assert pc.isclose(pc.abs(state[1]), 1, rtol=0.0, atol=1e-10)
-
-
-class TestQulacsTwoQubitGates:
-    """Test two-qubit gate operations."""
-
-    def test_bell_state(self) -> None:
-        """Test Bell state creation with H and CNOT."""
-        sim = Qulacs(2)
-
-        # Create Bell state |Φ+⟩ = (|00⟩ + |11⟩)/√2
-        sim.bindings["H"](sim, 0)
-        sim.bindings["CX"](sim, 0, 1)
-
-        state = sim.vector
-        expected = pc.zeros(4, dtype="complex")
-        expected[0] = 1 / pc.sqrt(2)  # |00⟩
-        expected[3] = 1 / pc.sqrt(2)  # |11⟩
-
-        assert pc.allclose(state, expected)
-
-    def test_controlled_gates(self) -> None:
-        """Test controlled X, Y, Z gates."""
-        sim = Qulacs(2)
-
-        # Test CX gate
-        sim.bindings["X"](sim, 0)  # |10⟩
-        sim.bindings["CX"](sim, 0, 1)  # Should become |11⟩
-        expected = pc.zeros(4, dtype="complex")
-        expected[3] = 1.0  # |11⟩
-        assert pc.allclose(sim.vector, expected)
-
-        # Test CZ gate on |++⟩
-        sim.reset()
-        sim.bindings["H"](sim, 0)
-        sim.bindings["H"](sim, 1)
-        sim.bindings["CZ"](sim, 0, 1)
-
-        state = sim.vector
-        # CZ|++⟩ = (|00⟩ + |01⟩ + |10⟩ - |11⟩)/2
-        expected = pc.array([0.5, 0.5, 0.5, -0.5], dtype="complex")
-        assert pc.allclose(state, expected)
-
-    def test_swap_gate(self) -> None:
-        """Test SWAP gate."""
-        sim = Qulacs(2)
-
-        # Prepare |10⟩ and swap to |01⟩
-        sim.bindings["X"](sim, 0)  # |10⟩
-        sim.bindings["SWAP"](sim, 0, 1)  # Should become |01⟩
-
-        # State should be |01⟩
-        expected = pc.zeros(4, dtype="complex")
-        expected[1] = 1.0  # |01⟩
-        assert pc.allclose(sim.vector, expected)
-
-
-class TestQulacsMeasurement:
-    """Test measurement operations."""
-
-    def test_deterministic_measurement(self) -> None:
-        """Test measurement on definite states."""
-        sim = Qulacs(1, seed=100)
-
-        # Measure |0⟩ state
-        sim.reset()
-        result = sim.bindings["Measure"](sim, 0)
-        assert result == 0
-
-        # Measure |1⟩ state
-        sim.bindings["X"](sim, 0)
-        result = sim.bindings["Measure"](sim, 0)
-        assert result == 1
-
-    def test_measurement_statistics(self) -> None:
-        """Test measurement statistics on superposition states."""
-        sim = Qulacs(1, seed=42)
-
-        # Prepare |+⟩ state and measure many times
-        n_trials = 1000
-        results = []
-
-        for _ in range(n_trials):
-            sim.reset()
-            sim.bindings["H"](sim, 0)  # |+⟩ state
-            result = sim.bindings["Measure"](sim, 0)
-            results.append(result)
-
-        # Should be approximately 50/50
-        ones_count = sum(results)
-        ratio = ones_count / n_trials
-        assert abs(ratio - 0.5) < 0.1  # Allow some variance
-
-
-class TestQulacsCompatibility:
-    """Test compatibility with existing PECOS patterns."""
-
-    def test_gate_bindings_structure(self) -> None:
-        """Test that gate bindings follow expected structure."""
-        sim = Qulacs(2)
-
-        # Test that all expected gates are available
-        expected_gates = [
-            "X",
-            "Y",
-            "Z",
-            "H",
-            "SZ",
-            "SZdg",
-            "T",
-            "Tdg",
-            "CX",
-            "CY",
-            "CZ",
-            "SWAP",
-            "RX",
-            "RY",
-            "RZ",
-            "Init",
-            "Measure",
-        ]
-
-        for gate in expected_gates:
-            assert gate in sim.bindings, f"Gate {gate} not found in bindings"
-
-
-class TestQulacsAdvanced:
-    """Advanced tests for edge cases and complex scenarios."""
-
-    def test_ghz_state(self) -> None:
-        """Test GHZ state creation."""
-        sim = Qulacs(3)
-
-        # Create GHZ state |GHZ⟩ = (|000⟩ + |111⟩)/√2
-        sim.bindings["H"](sim, 0)
-        sim.bindings["CX"](sim, 0, 1)
-        sim.bindings["CX"](sim, 1, 2)
-
-        state = sim.vector
-        expected = pc.zeros(8, dtype="complex")
-        expected[0] = 1 / pc.sqrt(2)  # |000⟩
-        expected[7] = 1 / pc.sqrt(2)  # |111⟩
-
-        assert pc.allclose(state, expected)
-
-    def test_state_normalization_preservation(self) -> None:
-        """Test that state remains normalized after various operations."""
-        sim = Qulacs(3)
-
-        # Apply various gates
-        sim.bindings["H"](sim, 0)
-        sim.bindings["CX"](sim, 0, 1)
-        sim.bindings["RY"](sim, 2, angle=pc.f64.frac_pi_4)
-        sim.bindings["CZ"](sim, 1, 2)
-        sim.bindings["T"](sim, 0)
-
-        # Check normalization using PECOS sum
-        state = sim.vector
-        norm_squared = pc.sum(pc.abs(state) ** 2)
-        assert pc.isclose(norm_squared, 1.0, rtol=0.0, atol=1e-10)
-
-    def test_gate_reversibility(self) -> None:
-        """Test that gates are properly reversible."""
-        sim = Qulacs(2)
-
-        # Save initial state
-        initial_state = sim.vector.copy()
-
-        # Apply gates and their inverses
-        sim.bindings["H"](sim, 0)
-        sim.bindings["CX"](sim, 0, 1)
-        sim.bindings["SZ"](sim, 1)
-        sim.bindings["SZdg"](sim, 1)  # S†
-        sim.bindings["CX"](sim, 0, 1)
-        sim.bindings["H"](sim, 0)
-
-        # Should be back to initial state
-        final_state = sim.vector
-        assert pc.allclose(initial_state, final_state, atol=1e-10)
diff --git a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_statevec.py b/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_statevec.py
index cc58a7fc6..e46385da5 100644
--- a/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_statevec.py
+++ b/python/quantum-pecos/tests/pecos/integration/state_sim_tests/test_statevec.py
@@ -31,18 +31,14 @@
 from pecos.simulators import (
     MPS,
     CuStateVec,
-    QuestStateVec,
-    Qulacs,
     StateVec,
 )
 from pecos.testing import assert_allclose
 
 str_to_sim = {
     "StateVec": StateVec,
-    "Qulacs": Qulacs,
     "CuStateVec": CuStateVec,
     "MPS": MPS,
-    "QuestStateVec": QuestStateVec,
 }
 
 
@@ -82,9 +78,8 @@ def _compare_vectors(
 
     sim_vector_adjusted = sim_vector_normalized * phase
 
-    # Use looser tolerance for simulators that use gate decompositions
-    # QuestStateVec uses decompositions for RXX, RYY, RZZ which accumulate errors
-    rtol = 1e-3 if simulator == "QuestStateVec" else 1e-5
+    rtol = 1e-5
+    _ = simulator  # reserved for per-backend tolerance tuning
 
     # Add absolute tolerance to handle near-zero values with numerical noise
     # MPS uses tensor network approximations that can introduce ~1e-15 errors
@@ -180,10 +175,8 @@ def generate_random_state(seed: int | None = None) -> QuantumCircuit:
     "simulator",
     [
         "StateVec",
-        "Qulacs",
         "CuStateVec",
         "MPS",
-        "QuestStateVec",
     ],
 )
 def test_init(simulator: str) -> None:
@@ -201,10 +194,8 @@ def test_init(simulator: str) -> None:
     "simulator",
     [
         "StateVec",
-        "Qulacs",
         "CuStateVec",
         "MPS",
-        "QuestStateVec",
     ],
 )
 def test_H_measure(simulator: str) -> None:
@@ -220,10 +211,8 @@ def test_H_measure(simulator: str) -> None:
     "simulator",
     [
         "StateVec",
-        "Qulacs",
         "CuStateVec",
         "MPS",
-        "QuestStateVec",
     ],
 )
 def test_comp_basis_circ_and_measure(simulator: str) -> None:
@@ -354,10 +343,8 @@ def _apply(gate: dict, **params: object) -> None:
     "simulator",
     [
         "StateVec",
-        "Qulacs",
         "CuStateVec",
         "MPS",
-        "QuestStateVec",
     ],
 )
 def test_all_gate_circ(simulator: str) -> None:
@@ -395,9 +382,7 @@ def test_all_gate_circ(simulator: str) -> None:
     "simulator",
     [
         "StateVec",
-        "Qulacs",
         "CuStateVec",
-        "QuestStateVec",
     ],
 )
 def test_hybrid_engine_no_noise(simulator: str) -> None:
@@ -430,9 +415,7 @@ def test_hybrid_engine_no_noise(simulator: str) -> None:
     "simulator",
     [
         "StateVec",
-        "Qulacs",
         "CuStateVec",
-        "QuestStateVec",
     ],
 )
 def test_hybrid_engine_noisy(simulator: str) -> None:
diff --git a/python/quantum-pecos/tests/pecos/integration/test_backend_seed_determinism.py b/python/quantum-pecos/tests/pecos/integration/test_backend_seed_determinism.py
index 99dc4cd20..0c1a2f5ec 100644
--- a/python/quantum-pecos/tests/pecos/integration/test_backend_seed_determinism.py
+++ b/python/quantum-pecos/tests/pecos/integration/test_backend_seed_determinism.py
@@ -123,9 +123,6 @@
 CORE_BACKENDS = [
     "stabilizer",  # SparseStab - stabilizer simulator
     "StateVec",  # StateVec - state vector simulator
-    "Qulacs",  # Qulacs - state vector (Rust wrapper)
-    "QuestStateVec",  # QuEST - state vector (Rust wrapper)
-    "QuestDensityMatrix",  # QuEST - density matrix (Rust wrapper)
 ]
 
 
diff --git a/python/quantum-pecos/tests/pecos/unit/test_qulacs_gates.py b/python/quantum-pecos/tests/pecos/unit/test_qulacs_gates.py
deleted file mode 100644
index f1cb646c3..000000000
--- a/python/quantum-pecos/tests/pecos/unit/test_qulacs_gates.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2025 The PECOS Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-# the License.You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
-
-"""Unit tests for Qulacs gate operations."""
-
-import pytest
-
-pytest.importorskip("pecos_rslib", reason="pecos_rslib required for qulacs tests")
-
-import pecos as pc
-from pecos.simulators.qulacs import Qulacs
-
-
-class TestQulacsGateBindings:
-    """Test individual gate operations and their bindings."""
-
-    def test_identity_gate(self) -> None:
-        """Test identity gate does nothing."""
-        sim = Qulacs(1)
-        initial_state = sim.vector.copy()
-
-        sim.bindings["I"](sim, 0)
-
-        assert pc.allclose(sim.vector, initial_state)
-
-    def test_gate_parameter_passing(self) -> None:
-        """Test gates that require parameters work correctly."""
-        sim = Qulacs(1)
-
-        # Test parameterized rotation gates
-        angles_to_test = [0, pc.f64.frac_pi_4, pc.f64.frac_pi_2, pc.f64.pi, pc.f64.tau]
-
-        for angle in angles_to_test:
-            sim.reset()
-            sim.bindings["RX"](sim, 0, angle=angle)
-
-            # Verify state is normalized
-            norm = pc.sum(pc.abs(sim.vector) ** 2)
-            assert pc.isclose(norm, 1.0, rtol=1e-5, atol=1e-8)
-
-    def test_square_root_gates(self) -> None:
-        """Test square root gates (SX, SY, SZ)."""
-        sim = Qulacs(1)
-
-        # SX applied twice should equal X
-        sim.bindings["SX"](sim, 0)
-        sim.bindings["SX"](sim, 0)
-        expected_x = pc.array([0, 1], dtype="complex")
-        assert pc.allclose(sim.vector, expected_x)
-
-        # Test SX and SXdg are inverses
-        sim.reset()
-        sim.bindings["SX"](sim, 0)
-        sim.bindings["SXdg"](sim, 0)
-        expected_identity = pc.array([1, 0], dtype="complex")
-        assert pc.allclose(sim.vector, expected_identity, atol=1e-10)
-
-    def test_dagger_gates(self) -> None:
-        """Test that dagger gates are proper inverses."""
-        sim = Qulacs(1)
-
-        # Test T and Tdg
-        sim.bindings["T"](sim, 0)
-        sim.bindings["Tdg"](sim, 0)
-        expected = pc.array([1, 0], dtype="complex")
-        assert pc.allclose(sim.vector, expected, atol=1e-10)
-
-        # Test SZ and SZdg
-        sim.reset()
-        sim.bindings["SZ"](sim, 0)
-        sim.bindings["SZdg"](sim, 0)
-        assert pc.allclose(sim.vector, expected, atol=1e-10)
-
-    def test_all_single_qubit_gates_exist(self) -> None:
-        """Test all expected single-qubit gates are in bindings."""
-        sim = Qulacs(1)
-
-        single_qubit_gates = [
-            "I",
-            "X",
-            "Y",
-            "Z",
-            "H",
-            "SX",
-            "SXdg",
-            "SY",
-            "SYdg",
-            "SZ",
-            "SZdg",
-            "T",
-            "Tdg",
-            "RX",
-            "RY",
-            "RZ",
-        ]
-
-        for gate in single_qubit_gates:
-            assert gate in sim.bindings, f"Gate {gate} missing from bindings"
-
-    def test_all_two_qubit_gates_exist(self) -> None:
-        """Test all expected two-qubit gates are in bindings."""
-        sim = Qulacs(2)
-
-        two_qubit_gates = [
-            "CX",
-            "CY",
-            "CZ",
-            "SWAP",
-            "RXX",
-            "RYY",
-            "RZZ",
-        ]
-
-        for gate in two_qubit_gates:
-            assert gate in sim.bindings, f"Gate {gate} missing from bindings"
-
-    def test_gate_aliases(self) -> None:
-        """Test that gate aliases work correctly."""
-        sim = Qulacs(2)
-
-        # Test CNOT alias for CX
-        sim.bindings["X"](sim, 0)  # |10⟩
-        sim.bindings["CNOT"](sim, 0, 1)  # Should become |11⟩
-
-        expected = pc.zeros(4, dtype="complex")
-        expected[3] = 1.0  # |11⟩
-        assert pc.allclose(sim.vector, expected)
-
-        # Test S alias for SZ
-        sim2 = Qulacs(1)
-        sim2.bindings["H"](sim2, 0)
-        sim2.bindings["S"](sim2, 0)  # Should be same as SZ
-
-        sim3 = Qulacs(1)
-        sim3.bindings["H"](sim3, 0)
-        sim3.bindings["SZ"](sim3, 0)
-
-        assert pc.allclose(sim2.vector, sim3.vector)
-
-    def test_measurement_and_init_gates(self) -> None:
-        """Test measurement and initialization gates."""
-        sim = Qulacs(1, seed=42)
-
-        # Test init gates
-        sim.bindings["Init"](sim, 0)  # Should initialize to |0⟩
-        expected = pc.array([1, 0], dtype="complex")
-        assert pc.allclose(sim.vector, expected)
-
-        # Test measurement
-        result = sim.bindings["Measure"](sim, 0)
-        assert result in [0, 1]
-
-    def test_single_qubit_initialization(self) -> None:
-        """Test single-qubit initialization doesn't affect other qubits."""
-        # Test with 3-qubit system
-        sim = Qulacs(3)
-
-        # Initialize to a specific state: |101⟩
-        sim.bindings["X"](sim, 0)  # qubit 0 -> |1⟩
-        sim.bindings["I"](sim, 1)  # qubit 1 -> |0⟩ (already initialized)
-        sim.bindings["X"](sim, 2)  # qubit 2 -> |1⟩
-
-        # Expected state: |101⟩ = [0, 1, 0, 0, 0, 0, 0, 0] in computational basis
-        # But with MSB-first ordering it's |101⟩ -> index 5 (binary: 101₂ = 5₁₀)
-        expected_before = pc.zeros(8, dtype="complex")
-        expected_before[5] = 1.0
-        assert pc.allclose(
-            sim.vector,
-            expected_before,
-        ), f"Initial state incorrect: {sim.vector}"
-
-        # Reset qubit 1 to |0⟩ (should be no change since it's already |0⟩)
-        sim.bindings["init |0>"](sim, 1)
-        assert pc.allclose(
-            sim.vector,
-            expected_before,
-        ), f"Reset qubit 1 to |0⟩ changed other qubits: {sim.vector}"
-
-        # Reset qubit 1 to |1⟩ (should change state to |111⟩)
-        sim.bindings["init |1>"](sim, 1)
-        expected_after_init_one = pc.zeros(8, dtype="complex")
-        expected_after_init_one[7] = 1.0  # |111⟩ -> index 7
-        assert pc.allclose(
-            sim.vector,
-            expected_after_init_one,
-        ), f"Init qubit 1 to |1⟩ incorrect: {sim.vector}"
-
-        # Reset qubit 0 to |0⟩ (should change state to |011⟩)
-        sim.bindings["init |0>"](sim, 0)
-        expected_after_reset_0 = pc.zeros(8, dtype="complex")
-        expected_after_reset_0[3] = 1.0  # |011⟩ -> index 3
-        assert pc.allclose(
-            sim.vector,
-            expected_after_reset_0,
-        ), f"Reset qubit 0 to |0⟩ incorrect: {sim.vector}"
-
-        # Reset qubit 2 to |0⟩ (should change state to |010⟩)
-        sim.bindings["init |0>"](sim, 2)
-        expected_final = pc.zeros(8, dtype="complex")
-        expected_final[2] = 1.0  # |010⟩ -> index 2
-        assert pc.allclose(
-            sim.vector,
-            expected_final,
-        ), f"Reset qubit 2 to |0⟩ incorrect: {sim.vector}"
-
-
-class TestQulacsThreadSafety:
-    """Test thread safety aspects of the simulator."""
-
-    def test_independent_simulators(self) -> None:
-        """Test that different simulator instances are independent."""
-        sim1 = Qulacs(2, seed=42)
-        sim2 = Qulacs(2, seed=42)
-
-        # Apply different operations to each
-        sim1.bindings["X"](sim1, 0)
-        sim2.bindings["H"](sim2, 1)
-
-        # States should be different
-        assert not pc.allclose(sim1.vector, sim2.vector)
-
-    def test_simulator_cloning_behavior(self) -> None:
-        """Test that simulators with same seed produce same results."""
-        sim1 = Qulacs(2, seed=123)
-        sim2 = Qulacs(2, seed=123)
-
-        # Apply same operations
-        operations = [
-            ("H", 0),
-            ("CX", (0, 1)),
-            ("RZ", 0, {"angle": pc.f64.pi / 3}),
-        ]
-
-        for op in operations:
-            if len(op) == 2:
-                # Single-qubit gate without parameters or two-qubit gate
-                if isinstance(op[1], tuple):
-                    # Two-qubit gate
-                    sim1.bindings[op[0]](sim1, op[1][0], op[1][1])
-                    sim2.bindings[op[0]](sim2, op[1][0], op[1][1])
-                else:
-                    # Single-qubit gate
-                    sim1.bindings[op[0]](sim1, op[1])
-                    sim2.bindings[op[0]](sim2, op[1])
-            elif len(op) == 3:
-                # Parameterized gate
-                sim1.bindings[op[0]](sim1, op[1], **op[2])
-                sim2.bindings[op[0]](sim2, op[1], **op[2])
-
-        # Results should be identical
-        assert pc.allclose(sim1.vector, sim2.vector)
-
-
-class TestQulacsErrorHandling:
-    """Test error handling and edge cases."""
-
-    def test_invalid_qubit_indices(self) -> None:
-        """Test behavior with invalid qubit indices."""
-        sim = Qulacs(2)
-
-        # Should raise an IndexError for out-of-bounds qubit index
-        with pytest.raises(IndexError):
-            sim.bindings["X"](sim, 5)  # Invalid qubit index
-
-    def test_missing_parameters(self) -> None:
-        """Test behavior when required parameters are missing."""
-        sim = Qulacs(1)
-
-        # RX gate requires angle parameter
-        with pytest.raises(TypeError):
-            sim.bindings["RX"](sim, 0)  # Missing angle parameter
diff --git a/scripts/native_bench/.gitignore b/scripts/native_bench/.gitignore
index 567609b12..4658fab8e 100644
--- a/scripts/native_bench/.gitignore
+++ b/scripts/native_bench/.gitignore
@@ -1 +1,2 @@
 build/
+bench_pecos/target/
diff --git a/scripts/native_bench/bench_custatevec.cu b/scripts/native_bench/bench_custatevec.cu
new file mode 100644
index 000000000..2dc7bf712
--- /dev/null
+++ b/scripts/native_bench/bench_custatevec.cu
@@ -0,0 +1,306 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under
+// the License.
+
+// Standalone cuStateVec benchmark using the native C API directly.
+// No Rust, no PECOS -- pure CUDA + cuQuantum.
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <chrono>
+#include <algorithm>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <custatevec.h>
+
+// ---------------------------------------------------------------------------
+// Error checking
+// ---------------------------------------------------------------------------
+
+#define CUDA_CHECK(call) do { /* abort with code, file:line and message on any CUDA runtime error */ \
+    const cudaError_t cuda_rc_ = (call); \
+    if (cuda_rc_ == cudaSuccess) break; \
+    fprintf(stderr, "CUDA error %d at %s:%d: %s\n", \
+            cuda_rc_, __FILE__, __LINE__, cudaGetErrorString(cuda_rc_)); \
+    exit(1); \
+} while(0)
+
+#define CUSV_CHECK(x) do { /* like CUDA_CHECK: report code, location and status text, then abort */ \
+    const custatevecStatus_t err = (x); \
+    if (err == CUSTATEVEC_STATUS_SUCCESS) break; \
+    fprintf(stderr, "cuStateVec error %d at %s:%d: %s\n", \
+            static_cast<int>(err), __FILE__, __LINE__, custatevecGetErrorString(err)); \
+    exit(1); \
+} while(0)
+
+// ---------------------------------------------------------------------------
+// Timing helpers
+// ---------------------------------------------------------------------------
+
+static double now_sec() {  // seconds on the monotonic clock; only differences are meaningful
+    using mono_clock = std::chrono::steady_clock;
+    const std::chrono::duration<double> since_epoch = mono_clock::now().time_since_epoch();
+    return since_epoch.count();
+}
+
+static double median(std::vector<double>& vals) {  // NOTE: sorts `vals` in place
+    if (vals.empty()) return NAN;  // no samples (e.g. reps == 0): vals[n / 2 - 1] would be out of range
+    std::sort(vals.begin(), vals.end());
+    const size_t n = vals.size();  // odd: middle element; even: mean of the two middle elements
+    return (n % 2 == 1) ? vals[n / 2] : (vals[n / 2 - 1] + vals[n / 2]) / 2.0;
+}
+
+// ---------------------------------------------------------------------------
+// Gate matrices (row-major, as passed with CUSTATEVEC_MATRIX_LAYOUT_ROW; complex128)
+// ---------------------------------------------------------------------------
+
+struct Complex2 { double re, im; };  // one complex entry (real, imaginary); handed to cuStateVec as CUDA_C_64F
+
+static const Complex2 H_MATRIX[4] = {  // Hadamard: (1/sqrt(2)) * [[1, 1], [1, -1]]
+    {M_SQRT1_2, 0}, {M_SQRT1_2, 0},
+    {M_SQRT1_2, 0}, {-M_SQRT1_2, 0}
+};
+
+static const Complex2 X_MATRIX[4] = {  // Pauli-X: [[0, 1], [1, 0]]
+    {0, 0}, {1, 0},
+    {1, 0}, {0, 0}
+};
+
+static const Complex2 CX_MATRIX[16] = {  // 4x4: identity on indices 0 and 1, swaps indices 2 and 3
+    {1,0}, {0,0}, {0,0}, {0,0},
+    {0,0}, {1,0}, {0,0}, {0,0},
+    {0,0}, {0,0}, {0,0}, {1,0},  // index 2 -> 3: low index bit flips when the high index bit is 1
+    {0,0}, {0,0}, {1,0}, {0,0}   // index 3 -> 2
+};
+
+static void make_rz_matrix(double theta, Complex2 out[4]) {
+    // RZ(theta) = diag(e^{-i*theta/2}, e^{+i*theta/2}); both off-diagonal entries are zero.
+    const double half_angle = theta / 2.0;
+    const double re_part = cos(half_angle), im_part = sin(half_angle);
+    out[0] = {re_part, -im_part};  out[1] = {0, 0};  out[2] = {0, 0};  out[3] = {re_part, im_part};
+}
+
+// ---------------------------------------------------------------------------
+// Wrapper to apply a 1-qubit gate
+// ---------------------------------------------------------------------------
+
+static void apply_1q(custatevecHandle_t handle, void* d_sv, int nqubits,
+                     const Complex2* matrix, int target) {
+    // Dense 2x2 gate on a single target qubit: not adjointed, no control qubits,
+    // and no caller-provided workspace (nullptr / 0 bytes).
+    const int32_t targets[1] = {target};
+    const int32_t adjoint = 0;
+    CUSV_CHECK(custatevecApplyMatrix(
+        handle, d_sv, CUDA_C_64F, nqubits,
+        matrix, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint,
+        targets, 1, nullptr, nullptr, 0,
+        CUSTATEVEC_COMPUTE_64F, nullptr, 0
+    ));
+}
+
+// ---------------------------------------------------------------------------
+// Wrapper to apply CX (controlled-X)
+// ---------------------------------------------------------------------------
+
+static void apply_cx(custatevecHandle_t handle, void* d_sv, int nqubits,
+                     int control, int target) {
+    // cuStateVec orders target bits little-endian: targets[0] is the LSB of the matrix
+    // index. CX_MATRIX swaps indices 2 <-> 3, i.e. it flips the LOW index bit when the
+    // HIGH index bit is 1, so the flipped qubit must come first and the control second.
+    // Passing {control, target} would apply the gate with the two roles exchanged.
+    const int32_t tgts[2] = {target, control};
+    CUSV_CHECK(custatevecApplyMatrix(
+        handle, d_sv, CUDA_C_64F, nqubits,
+        CX_MATRIX, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, 0 /* adjoint */,
+        tgts, 2, nullptr, nullptr, 0 /* 2 targets, no separate control list */,
+        CUSTATEVEC_COMPUTE_64F, nullptr, 0 /* no extra workspace */));
+}
+
+// ---------------------------------------------------------------------------
+// Initialize state vector to |0...0>
+// ---------------------------------------------------------------------------
+
+static void init_zero_state(void* d_sv, int nqubits) {
+    const size_t sv_bytes = (1ULL << nqubits) * sizeof(Complex2);
+    CUDA_CHECK(cudaMemset(d_sv, 0, sv_bytes));  // clear all 2^nqubits amplitudes
+    const Complex2 unit_amp = {1.0, 0.0};       // amplitude 1+0i for basis state index 0
+    CUDA_CHECK(cudaMemcpy(d_sv, &unit_amp, sizeof(unit_amp), cudaMemcpyHostToDevice));
+}
+
+// ---------------------------------------------------------------------------
+// Circuit: layered H + RZ + CX
+// ---------------------------------------------------------------------------
+
+static void run_circuit(custatevecHandle_t handle, void* d_sv,
+                        int nqubits, int nlayers) {
+    Complex2 rz_gate[4];  // RZ(0.1) does not depend on the layer: build it once
+    make_rz_matrix(0.1, rz_gate);
+    const int last_qubit = nqubits - 1;
+    for (int remaining = nlayers; remaining > 0; --remaining) {
+        for (int qubit = 0; qubit < nqubits; ++qubit) {  // H then RZ on every qubit
+            apply_1q(handle, d_sv, nqubits, H_MATRIX, qubit);
+            apply_1q(handle, d_sv, nqubits, rz_gate, qubit);
+        }
+        for (int ctrl = 0; ctrl < last_qubit; ++ctrl) {  // CX on each neighbouring pair
+            apply_cx(handle, d_sv, nqubits, ctrl, ctrl + 1);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Circuit benchmark
+// ---------------------------------------------------------------------------
+
+static void bench_circuit(custatevecHandle_t handle, int nqubits, int nlayers, int reps) {
+    // One device buffer of 2^nqubits amplitudes, reused by every repetition.
+    void* d_sv;
+    CUDA_CHECK(cudaMalloc(&d_sv, (1ULL << nqubits) * sizeof(Complex2)));
+
+    std::vector<double> samples(reps);
+    for (double& elapsed : samples) {
+        // Reset the state before starting the clock; the synchronize call makes
+        // sure the reset has finished so it is not part of the measurement.
+        init_zero_state(d_sv, nqubits);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double started = now_sec();
+        run_circuit(handle, d_sv, nqubits, nlayers);
+        CUDA_CHECK(cudaDeviceSynchronize());  // wait for the device before stopping the clock
+        elapsed = now_sec() - started;
+    }
+
+    // Report the median wall time of one full circuit, in microseconds.
+    printf("circuit  %2dq %2dl  %12.3f us\n", nqubits, nlayers, median(samples) * 1e6);
+    CUDA_CHECK(cudaFree(d_sv));
+}
+
+// ---------------------------------------------------------------------------
+// Individual gate benchmarks
+// ---------------------------------------------------------------------------
+
+static void bench_gate_h(custatevecHandle_t handle, int nqubits, int iters, int reps) {
+    // Each sample times `iters` sweeps of H_MATRIX over all qubits; state is set up once.
+    void* d_sv;
+    CUDA_CHECK(cudaMalloc(&d_sv, (1ULL << nqubits) * sizeof(Complex2)));
+    init_zero_state(d_sv, nqubits);
+
+    std::vector<double> samples(reps);
+    for (double& sample : samples) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int target = 0; target < nqubits; ++target)
+                apply_1q(handle, d_sv, nqubits, H_MATRIX, target);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double stop = now_sec();
+        sample = stop - start;
+    }
+    printf("gate     H        %12.3f us\n", median(samples) * 1e6);
+    CUDA_CHECK(cudaFree(d_sv));
+}
+
+static void bench_gate_x(custatevecHandle_t handle, int nqubits, int iters, int reps) {
+    // Each sample times `iters` sweeps of X_MATRIX over all qubits; state is set up once.
+    void* d_sv;
+    CUDA_CHECK(cudaMalloc(&d_sv, (1ULL << nqubits) * sizeof(Complex2)));
+    init_zero_state(d_sv, nqubits);
+
+    std::vector<double> samples(reps);
+    for (double& sample : samples) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int target = 0; target < nqubits; ++target)
+                apply_1q(handle, d_sv, nqubits, X_MATRIX, target);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double stop = now_sec();
+        sample = stop - start;
+    }
+    printf("gate     X        %12.3f us\n", median(samples) * 1e6);
+    CUDA_CHECK(cudaFree(d_sv));
+}
+
+static void bench_gate_cx(custatevecHandle_t handle, int nqubits, int iters, int reps) {
+    // Each sample times `iters` sweeps of CX over adjacent index pairs; state is set up once.
+    void* d_sv;
+    CUDA_CHECK(cudaMalloc(&d_sv, (1ULL << nqubits) * sizeof(Complex2)));
+    init_zero_state(d_sv, nqubits);
+
+    std::vector<double> samples(reps);
+    for (double& sample : samples) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int first = 0; first < nqubits - 1; ++first)
+                apply_cx(handle, d_sv, nqubits, first, first + 1);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double stop = now_sec();
+        sample = stop - start;
+    }
+    printf("gate     CX       %12.3f us\n", median(samples) * 1e6);
+    CUDA_CHECK(cudaFree(d_sv));
+}
+
+static void bench_gate_rz(custatevecHandle_t handle, int nqubits, int iters, int reps) {
+    // Each sample times `iters` sweeps of one RZ matrix over all qubits; state is set up once.
+    void* d_sv;
+    CUDA_CHECK(cudaMalloc(&d_sv, (1ULL << nqubits) * sizeof(Complex2)));
+    init_zero_state(d_sv, nqubits);
+
+    Complex2 rz_mat[4];
+    make_rz_matrix(0.1, rz_mat);  // built outside the timed region
+
+    std::vector<double> samples(reps);
+    for (double& sample : samples) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int target = 0; target < nqubits; ++target)
+                apply_1q(handle, d_sv, nqubits, rz_mat, target);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        const double stop = now_sec();
+        sample = stop - start;
+    }
+    printf("gate     RZ       %12.3f us\n", median(samples) * 1e6);
+    CUDA_CHECK(cudaFree(d_sv));
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+int main() {
+    custatevecHandle_t handle;
+    CUSV_CHECK(custatevecCreate(&handle));
+
+    const int reps = 5;  // timed repetitions behind every reported median
+
+    printf("=== cuStateVec standalone benchmarks (f64) ===\n");
+    printf("\n-- Layered circuits (median of %d runs) --\n", reps);
+
+    // Each entry is {qubit count, layer count}, passed to bench_circuit in that order.
+    const int configs[][2] = {
+        {10, 20}, {14, 20}, {18, 20}, {20, 20}, {22, 20}, {24, 10}, {26, 5}
+    };
+    for (const auto& cfg : configs) {
+        bench_circuit(handle, cfg[0], cfg[1], reps);
+    }
+
+    // Per-gate timings all use 18 qubits and 100 iterations, matching the banner text.
+    printf("\n-- Individual gates at 18 qubits, 100 iters (median of %d runs) --\n", reps);
+    bench_gate_h(handle, 18, 100, reps);
+    bench_gate_x(handle, 18, 100, reps);
+    bench_gate_cx(handle, 18, 100, reps);
+    bench_gate_rz(handle, 18, 100, reps);
+
+    CUSV_CHECK(custatevecDestroy(handle));
+    return 0;
+}
diff --git a/scripts/native_bench/bench_pecos/Cargo.lock b/scripts/native_bench/bench_pecos/Cargo.lock
new file mode 100644
index 000000000..43b1be4f5
--- /dev/null
+++ b/scripts/native_bench/bench_pecos/Cargo.lock
@@ -0,0 +1,4240 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+
+[[package]]
+name = "approx"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
+[[package]]
+name = "ash"
+version = "0.38.0+1.3.281"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bb44936d800fea8f016d7f2311c6a4f97aebd5dc86f09906139ec848cf3a46f"
+dependencies = [
+ "libloading 0.8.9",
+]
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "aws-lc-rs"
+version = "1.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc"
+dependencies = [
+ "aws-lc-sys",
+ "zeroize",
+]
+
+[[package]]
+name = "aws-lc-sys"
+version = "0.39.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399"
+dependencies = [
+ "cc",
+ "cmake",
+ "dunce",
+ "fs_extra",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bench-pecos"
+version = "0.1.0"
+dependencies = [
+ "pecos-core",
+ "pecos-cuquantum",
+ "pecos-gpu-sims",
+ "pecos-simulators",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.72.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
+dependencies = [
+ "bitflags",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash 2.1.2",
+ "shlex",
+ "syn",
+]
+
+[[package]]
+name = "bit-set"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0481a0e032742109b1133a095184ee93d88f3dc9e0d28a5d033dc77a073f44f"
+dependencies = [
+ "bit-vec 0.7.0",
+]
+
+[[package]]
+name = "bit-set"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34ddef2995421ab6a5c779542c81ee77c115206f4ad9d5a8e05f4ff49716a3dd"
+dependencies = [
+ "bit-vec 0.9.1",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22"
+
+[[package]]
+name = "bit-vec"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b71798fca2c1fe1086445a7258a4bc81e6e49dcd24c8d0dd9a1e57395b603f51"
+
+[[package]]
+name = "bitflags"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
+
+[[package]]
+name = "bitvec"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+dependencies = [
+ "funty",
+ "radium",
+ "serde",
+ "tap",
+ "wyz",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be"
+dependencies = [
+ "hybrid-array",
+]
+
+[[package]]
+name = "block2"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5"
+dependencies = [
+ "objc2",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
+[[package]]
+name = "bytemuck"
+version = "1.25.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec"
+dependencies = [
+ "bytemuck_derive",
+]
+
+[[package]]
+name = "bytemuck_derive"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "bytes"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+
+[[package]]
+name = "bzip2"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c"
+dependencies = [
+ "libbz2-rs-sys",
+]
+
+[[package]]
+name = "camino"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "cargo-platform"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87a0c0e6148f11f01f32650a2ea02d532b2ad4e81d8bd41e6e565b5adc5e6082"
+dependencies = [
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "cargo_metadata"
+version = "0.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef987d17b0a113becdd19d3d0022d04d7ef41f9efe4f3fb63ac44ba61df3ade9"
+dependencies = [
+ "camino",
+ "cargo-platform",
+ "semver",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "cc"
+version = "1.2.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "cesu8"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
+
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
+[[package]]
+name = "chacha20"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.3.0",
+ "rand_core 0.10.0",
+]
+
+[[package]]
+name = "chrono"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading 0.8.9",
+]
+
+[[package]]
+name = "cmake"
+version = "0.1.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "codespan-reporting"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
+dependencies = [
+ "serde",
+ "termcolor",
+ "unicode-width",
+]
+
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+
+[[package]]
+name = "combine"
+version = "4.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd"
+dependencies = [
+ "bytes",
+ "memchr",
+]
+
+[[package]]
+name = "const-oid"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
+
+[[package]]
+name = "core-foundation"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "cpufeatures"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "crc"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d"
+dependencies = [
+ "crc-catalog",
+]
+
+[[package]]
+name = "crc-catalog"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
+
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "crypto-common"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710"
+dependencies = [
+ "hybrid-array",
+]
+
+[[package]]
+name = "deranged"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
+dependencies = [
+ "powerfmt",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer 0.10.4",
+ "crypto-common 0.1.7",
+]
+
+[[package]]
+name = "digest"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c"
+dependencies = [
+ "block-buffer 0.12.0",
+ "const-oid",
+ "crypto-common 0.2.1",
+]
+
+[[package]]
+name = "dirs"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e"
+dependencies = [
+ "dirs-sys",
+]
+
+[[package]]
+name = "dirs-sys"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "dispatch2"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
+dependencies = [
+ "bitflags",
+ "objc2",
+]
+
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "dlib"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab8ecd87370524b461f8557c119c405552c396ed91fc0a8eec68679eab26f94a"
+dependencies = [
+ "libloading 0.8.9",
+]
+
+[[package]]
+name = "document-features"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61"
+dependencies = [
+ "litrs",
+]
+
+[[package]]
+name = "dunce"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "env_filter"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef"
+dependencies = [
+ "log",
+ "regex",
+]
+
+[[package]]
+name = "env_logger"
+version = "0.11.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "jiff",
+ "log",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
+
+[[package]]
+name = "filetime"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "libredox",
+]
+
+[[package]]
+name = "filetime_creation"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c25b5d475550e559de5b0c0084761c65325444e3b6c9e298af9cefe7a9ef3a5f"
+dependencies = [
+ "cfg-if",
+ "filetime",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
+[[package]]
+name = "flate2"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
+[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "fs_extra"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
+[[package]]
+name = "funty"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
+
+[[package]]
+name = "futures-channel"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-io"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
+
+[[package]]
+name = "futures-sink"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-core",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "r-efi 5.3.0",
+ "wasip2",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "rand_core 0.10.0",
+ "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "gl_generator"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a95dfc23a2b4a9a2f5ab41d194f8bfda3cabec42af4e39f08c339eb2a0c124d"
+dependencies = [
+ "khronos_api",
+ "log",
+ "xml-rs",
+]
+
+[[package]]
+name = "glam"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "333928d5eb103c5d4050533cec0384302db6be8ef7d3cebd30ec6a35350353da"
+
+[[package]]
+name = "glam"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3abb554f8ee44336b72d522e0a7fe86a29e09f839a36022fa869a7dfe941a54b"
+
+[[package]]
+name = "glam"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4126c0479ccf7e8664c36a2d719f5f2c140fbb4f9090008098d2c291fa5b3f16"
+
+[[package]]
+name = "glam"
+version = "0.17.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e01732b97afd8508eee3333a541b9f7610f454bb818669e66e90f5f57c93a776"
+
+[[package]]
+name = "glam"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "525a3e490ba77b8e326fb67d4b44b4bd2f920f44d4cc73ccec50adc68e3bee34"
+
+[[package]]
+name = "glam"
+version = "0.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b8509e6791516e81c1a630d0bd7fbac36d2fa8712a9da8662e716b52d5051ca"
+
+[[package]]
+name = "glam"
+version = "0.20.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f43e957e744be03f5801a55472f593d43fabdebf25a4585db250f04d86b1675f"
+
+[[package]]
+name = "glam"
+version = "0.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "518faa5064866338b013ff9b2350dc318e14cc4fcd6cb8206d7e7c9886c98815"
+
+[[package]]
+name = "glam"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12f597d56c1bd55a811a1be189459e8fad2bbc272616375602443bdfb37fa774"
+
+[[package]]
+name = "glam"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e4afd9ad95555081e109fe1d21f2a30c691b5f0919c67dfa690a2e1eb6bd51c"
+
+[[package]]
+name = "glam"
+version = "0.24.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5418c17512bdf42730f9032c74e1ae39afc408745ebb2acf72fbc4691c17945"
+
+[[package]]
+name = "glam"
+version = "0.25.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3"
+
+[[package]]
+name = "glam"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e05e7e6723e3455f4818c7b26e855439f7546cf617ef669d1adedb8669e5cb9"
+
+[[package]]
+name = "glam"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "779ae4bf7e8421cf91c0b3b64e7e8b40b862fba4d393f59150042de7c4965a94"
+
+[[package]]
+name = "glam"
+version = "0.29.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8babf46d4c1c9d92deac9f7be466f76dfc4482b6452fc5024b5e8daf6ffeb3ee"
+
+[[package]]
+name = "glam"
+version = "0.30.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19fc433e8437a212d1b6f1e68c7824af3aed907da60afa994e7f542d18d12aa9"
+
+[[package]]
+name = "glam"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "556f6b2ea90b8d15a74e0e7bb41671c9bdf38cd9f78c284d750b9ce58a2b5be7"
+
+[[package]]
+name = "glam"
+version = "0.32.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f70749695b063ecbf6b62949ccccde2e733ec3ecbbd71d467dca4e5c6c97cca0"
+
+[[package]]
+name = "glob"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+
+[[package]]
+name = "glow"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29038e1c483364cc6bb3cf78feee1816002e127c331a1eec55a4d202b9e1adb5"
+dependencies = [
+ "js-sys",
+ "slotmap",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "glutin_wgl_sys"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c4ee00b289aba7a9e5306d57c2d05499b2e5dc427f84ac708bd2c090212cf3e"
+dependencies = [
+ "gl_generator",
+]
+
+[[package]]
+name = "gpu-allocator"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51255ea7cfaadb6c5f1528d43e92a82acb2b96c43365989a28b2d44ee38f8795"
+dependencies = [
+ "ash",
+ "hashbrown 0.16.1",
+ "log",
+ "presser",
+ "thiserror 2.0.18",
+ "windows",
+]
+
+[[package]]
+name = "gpu-descriptor"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b89c83349105e3732062a895becfc71a8f921bb71ecbbdd8ff99263e3b53a0ca"
+dependencies = [
+ "bitflags",
+ "gpu-descriptor-types",
+ "hashbrown 0.15.5",
+]
+
+[[package]]
+name = "gpu-descriptor-types"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdf242682df893b86f33a73828fb09ca4b2d3bb6cc95249707fc684d27484b91"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "num-traits",
+ "zerocopy",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.1.5",
+ "rayon",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hexf-parse"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df"
+
+[[package]]
+name = "http"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "hybrid-array"
+version = "0.4.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3944cf8cf766b40e2a1a333ee5e9b563f854d5fa49d6a8ca2764e97c6eddb214"
+dependencies = [
+ "typenum",
+]
+
+[[package]]
+name = "hyper"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "http",
+ "http-body",
+ "httparse",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "icu_collections"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "utf8_iter",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38"
+
+[[package]]
+name = "icu_properties"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14"
+
+[[package]]
+name = "icu_provider"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
+[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.17.0",
+ "rayon",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "ipnet"
+version = "2.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
+
+[[package]]
+name = "iri-string"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
+[[package]]
+name = "itertools"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "jiff"
+version = "0.2.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359"
+dependencies = [
+ "jiff-static",
+ "log",
+ "portable-atomic",
+ "portable-atomic-util",
+ "serde_core",
+]
+
+[[package]]
+name = "jiff-static"
+version = "0.2.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "jni"
+version = "0.21.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97"
+dependencies = [
+ "cesu8",
+ "cfg-if",
+ "combine",
+ "jni-sys 0.3.1",
+ "log",
+ "thiserror 1.0.69",
+ "walkdir",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "jni-sys"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258"
+dependencies = [
+ "jni-sys 0.4.1",
+]
+
+[[package]]
+name = "jni-sys"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2"
+dependencies = [
+ "jni-sys-macros",
+]
+
+[[package]]
+name = "jni-sys-macros"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264"
+dependencies = [
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "khronos-egl"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76"
+dependencies = [
+ "libc",
+ "libloading 0.8.9",
+ "pkg-config",
+]
+
+[[package]]
+name = "khronos_api"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2db585e1d738fc771bf08a151420d3ed193d9d895a36df7f6f8a9456b911ddc"
+
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
+[[package]]
+name = "levenberg-marquardt"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8be7a65739a815308eef33a6d8c78e435a7317305d5b0af0c8c465a2d7ac6fc6"
+dependencies = [
+ "cfg-if",
+ "nalgebra",
+ "num-traits",
+ "rustc_version",
+]
+
+[[package]]
+name = "libbz2-rs-sys"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7"
+
+[[package]]
+name = "libc"
+version = "0.2.184"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af"
+
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+
+[[package]]
+name = "libloading"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+
+[[package]]
+name = "libm"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+
+[[package]]
+name = "libredox"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
+dependencies = [
+ "bitflags",
+ "libc",
+ "plain",
+ "redox_syscall 0.7.4",
+]
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
+[[package]]
+name = "litemap"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
+
+[[package]]
+name = "litrs"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092"
+
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+
+[[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
+[[package]]
+name = "lzma-rust"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baab2bbbd7d75a144d671e9ff79270e903957d92fb7386fd39034c709bd2661"
+dependencies = [
+ "byteorder",
+]
+
+[[package]]
+name = "lzma-sys"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
+
+[[package]]
+name = "matrixmultiply"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
+dependencies = [
+ "autocfg",
+ "rawpointer",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+ "simd-adler32",
+]
+
+[[package]]
+name = "mio"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "naga"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa2630921705b9b01dcdd0b6864b9562ca3c1951eecd0f0c4f5f04f61e412647"
+dependencies = [
+ "arrayvec",
+ "bit-set 0.9.1",
+ "bitflags",
+ "cfg-if",
+ "cfg_aliases",
+ "codespan-reporting",
+ "half",
+ "hashbrown 0.16.1",
+ "hexf-parse",
+ "indexmap",
+ "libm",
+ "log",
+ "num-traits",
+ "once_cell",
+ "rustc-hash 1.1.0",
+ "spirv",
+ "thiserror 2.0.18",
+ "unicode-ident",
+]
+
+[[package]]
+name = "nalgebra"
+version = "0.34.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df76ea0ff5c7e6b88689085804d6132ded0ddb9de5ca5b8aeb9eeadc0508a70a"
+dependencies = [
+ "approx",
+ "glam 0.14.0",
+ "glam 0.15.2",
+ "glam 0.16.0",
+ "glam 0.17.3",
+ "glam 0.18.0",
+ "glam 0.19.0",
+ "glam 0.20.5",
+ "glam 0.21.3",
+ "glam 0.22.0",
+ "glam 0.23.0",
+ "glam 0.24.2",
+ "glam 0.25.0",
+ "glam 0.27.0",
+ "glam 0.28.0",
+ "glam 0.29.3",
+ "glam 0.30.10",
+ "glam 0.31.1",
+ "glam 0.32.1",
+ "matrixmultiply",
+ "nalgebra-macros",
+ "num-complex",
+ "num-rational",
+ "num-traits",
+ "simba",
+ "typenum",
+]
+
+[[package]]
+name = "nalgebra-macros"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "973e7178a678cfd059ccec50887658d482ce16b0aa9da3888ddeab5cd5eb4889"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "ndarray"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
+dependencies = [
+ "matrixmultiply",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "portable-atomic",
+ "portable-atomic-util",
+ "rawpointer",
+ "rayon",
+]
+
+[[package]]
+name = "ndarray"
+version = "0.17.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d"
+dependencies = [
+ "matrixmultiply",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "portable-atomic",
+ "portable-atomic-util",
+ "rawpointer",
+]
+
+[[package]]
+name = "ndk-sys"
+version = "0.6.0+11769913"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee6cda3051665f1fb8d9e08fc35c96d5a244fb1be711a03b71118828afc9a873"
+dependencies = [
+ "jni-sys 0.3.1",
+]
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "nt-time"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2de419e64947cd8830e66beb584acc3fb42ed411d103e3c794dda355d1b374b5"
+dependencies = [
+ "chrono",
+ "time",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+ "libm",
+]
+
+[[package]]
+name = "objc2"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
+dependencies = [
+ "objc2-encode",
+]
+
+[[package]]
+name = "objc2-core-foundation"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
+dependencies = [
+ "bitflags",
+ "dispatch2",
+ "objc2",
+]
+
+[[package]]
+name = "objc2-encode"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33"
+
+[[package]]
+name = "objc2-foundation"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272"
+dependencies = [
+ "bitflags",
+ "objc2",
+ "objc2-core-foundation",
+]
+
+[[package]]
+name = "objc2-metal"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0125f776a10d00af4152d74616409f0d4a2053a6f57fa5b7d6aa2854ac04794"
+dependencies = [
+ "bitflags",
+ "block2",
+ "objc2",
+ "objc2-foundation",
+]
+
+[[package]]
+name = "objc2-quartz-core"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f"
+dependencies = [
+ "bitflags",
+ "objc2",
+ "objc2-core-foundation",
+ "objc2-foundation",
+ "objc2-metal",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
+[[package]]
+name = "openssl-probe"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
+
+[[package]]
+name = "option-ext"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
+[[package]]
+name = "ordered-float"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "parking_lot"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall 0.5.18",
+ "smallvec",
+ "windows-link",
+]
+
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
+[[package]]
+name = "pecos-build"
+version = "0.2.0-dev.0"
+dependencies = [
+ "bzip2",
+ "cargo_metadata",
+ "dirs",
+ "flate2",
+ "log",
+ "reqwest",
+ "serde",
+ "sevenz-rust",
+ "sha2 0.11.0",
+ "tar",
+ "thiserror 2.0.18",
+ "toml",
+ "xz2",
+]
+
+[[package]]
+name = "pecos-core"
+version = "0.2.0-dev.0"
+dependencies = [
+ "bitvec",
+ "num-complex",
+ "num-traits",
+ "pecos-random",
+ "rand 0.10.0",
+ "rand_core 0.10.0",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "pecos-cuquantum"
+version = "0.2.0-dev.0"
+dependencies = [
+ "env_logger",
+ "fastrand",
+ "log",
+ "pecos-build",
+ "pecos-core",
+ "pecos-cuquantum-sys",
+ "pecos-simulators",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "pecos-cuquantum-sys"
+version = "0.2.0-dev.0"
+dependencies = [
+ "bindgen",
+ "dirs",
+ "env_logger",
+ "libc",
+ "libloading 0.9.0",
+ "log",
+ "pecos-build",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "pecos-gpu-sims"
+version = "0.2.0-dev.0"
+dependencies = [
+ "bytemuck",
+ "log",
+ "num-complex",
+ "pecos-core",
+ "pecos-random",
+ "pecos-simulators",
+ "pollster",
+ "rand 0.10.0",
+ "rand_core 0.10.0",
+ "serde_json",
+ "wgpu",
+]
+
+[[package]]
+name = "pecos-num"
+version = "0.2.0-dev.0"
+dependencies = [
+ "levenberg-marquardt",
+ "log",
+ "nalgebra",
+ "ndarray 0.17.2",
+ "num-complex",
+ "num-traits",
+ "pecos-random",
+ "rand 0.10.0",
+ "rustworkx-core",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "pecos-quantum"
+version = "0.2.0-dev.0"
+dependencies = [
+ "log",
+ "nalgebra",
+ "num-complex",
+ "pecos-core",
+ "pecos-num",
+ "smallvec",
+]
+
+[[package]]
+name = "pecos-random"
+version = "0.2.0-dev.0"
+dependencies = [
+ "rand 0.10.0",
+ "rand_core 0.10.0",
+ "rapidhash",
+ "wide 1.2.0",
+]
+
+[[package]]
+name = "pecos-simulators"
+version = "0.2.0-dev.0"
+dependencies = [
+ "num-complex",
+ "pecos-core",
+ "pecos-quantum",
+ "pecos-random",
+ "rand 0.10.0",
+ "rayon",
+ "smallvec",
+ "wide 1.2.0",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
+
+[[package]]
+name = "petgraph"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
+dependencies = [
+ "fixedbitset",
+ "hashbrown 0.15.5",
+ "indexmap",
+ "serde",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
+[[package]]
+name = "pollster"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f3a9f18d041e6d0e102a0a46750538147e5e8992d3b4873aaafee2520b00ce3"
+
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+
+[[package]]
+name = "portable-atomic-util"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
+dependencies = [
+ "portable-atomic",
+]
+
+[[package]]
+name = "potential_utf"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564"
+dependencies = [
+ "zerovec",
+]
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "presser"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8cf8e6a8aa66ce33f63993ffc4ea4271eb5b0530a9002db8455ea6050c77bfa"
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
+[[package]]
+name = "priority-queue"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93980406f12d9f8140ed5abe7155acb10bb1e69ea55c88960b9c2f117445ef96"
+dependencies = [
+ "equivalent",
+ "indexmap",
+ "serde",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "profiling"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773"
+
+[[package]]
+name = "quinn"
+version = "0.11.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash 2.1.2",
+ "rustls",
+ "socket2",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
+dependencies = [
+ "aws-lc-rs",
+ "bytes",
+ "getrandom 0.3.4",
+ "lru-slab",
+ "rand 0.9.2",
+ "ring",
+ "rustc-hash 2.1.2",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.18",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2",
+ "tracing",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "r-efi"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
+
+[[package]]
+name = "radium"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
+
+[[package]]
+name = "rand"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+dependencies = [
+ "rand_chacha",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8"
+dependencies = [
+ "chacha20",
+ "getrandom 0.4.2",
+ "rand_core 0.10.0",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
+
+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.2",
+]
+
+[[package]]
+name = "rand_pcg"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b48ac3f7ffaab7fac4d2376632268aa5f89abdb55f7ebf8f4d11fffccb2320f7"
+dependencies = [
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "range-alloc"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca45419789ae5a7899559e9512e58ca889e41f04f1f2445e9f4b290ceccd1d08"
+
+[[package]]
+name = "rapidhash"
+version = "4.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59"
+dependencies = [
+ "rand_core 0.9.5",
+ "rustversion",
+]
+
+[[package]]
+name = "raw-window-handle"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539"
+
+[[package]]
+name = "raw-window-metal"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40d213455a5f1dc59214213c7330e074ddf8114c9a42411eb890c767357ce135"
+dependencies = [
+ "objc2",
+ "objc2-core-foundation",
+ "objc2-foundation",
+ "objc2-quartz-core",
+]
+
+[[package]]
+name = "rawpointer"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
+
+[[package]]
+name = "rayon"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-cond"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f"
+dependencies = [
+ "either",
+ "itertools 0.14.0",
+ "rayon",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "redox_users"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac"
+dependencies = [
+ "getrandom 0.2.17",
+ "libredox",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "renderdoc-sys"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19b30a45b0cd0bcca8037f3d0dc3421eaf95327a17cad11964fb8179b4fc4832"
+
+[[package]]
+name = "reqwest"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "percent-encoding",
+ "pin-project-lite",
+ "quinn",
+ "rustls",
+ "rustls-pki-types",
+ "rustls-platform-verifier",
+ "sync_wrapper",
+ "tokio",
+ "tokio-rustls",
+ "tower",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.17",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls"
+version = "0.23.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
+dependencies = [
+ "aws-lc-rs",
+ "once_cell",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
+dependencies = [
+ "openssl-probe",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
+dependencies = [
+ "web-time",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-platform-verifier"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784"
+dependencies = [
+ "core-foundation",
+ "core-foundation-sys",
+ "jni",
+ "log",
+ "once_cell",
+ "rustls",
+ "rustls-native-certs",
+ "rustls-platform-verifier-android",
+ "rustls-webpki",
+ "security-framework",
+ "security-framework-sys",
+ "webpki-root-certs",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustls-platform-verifier-android"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
+
+[[package]]
+name = "rustls-webpki"
+version = "0.103.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4"
+dependencies = [
+ "aws-lc-rs",
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "rustworkx-core"
+version = "0.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aaeee6f84153fd6f62507fc22bfe9499c8485075b44186dcbb918166ef75116f"
+dependencies = [
+ "fixedbitset",
+ "foldhash 0.1.5",
+ "hashbrown 0.15.5",
+ "indexmap",
+ "ndarray 0.16.1",
+ "num-traits",
+ "petgraph",
+ "priority-queue",
+ "rand 0.9.2",
+ "rand_distr",
+ "rand_pcg",
+ "rayon",
+ "rayon-cond",
+]
+
+[[package]]
+name = "safe_arch"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323"
+dependencies = [
+ "bytemuck",
+]
+
+[[package]]
+name = "safe_arch"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f7caad094bd561859bcd467734a720c3c1f5d1f338995351fefe2190c45efed"
+dependencies = [
+ "bytemuck",
+]
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "schannel"
+version = "0.1.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "security-framework"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
+dependencies = [
+ "bitflags",
+ "core-foundation",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework-sys"
+version = "2.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+dependencies = [
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "serde_spanned"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "sevenz-rust"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26482cf1ecce4540dc782fc70019eba89ffc4d87b3717eb5ec524b5db6fdefef"
+dependencies = [
+ "bit-set 0.6.0",
+ "byteorder",
+ "crc",
+ "filetime_creation",
+ "js-sys",
+ "lzma-rust",
+ "nt-time",
+ "sha2 0.10.9",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.2.17",
+ "digest 0.10.7",
+]
+
+[[package]]
+name = "sha2"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.3.0",
+ "digest 0.11.2",
+]
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "simba"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c99284beb21666094ba2b75bbceda012e610f5479dfcc2d6e2426f53197ffd95"
+dependencies = [
+ "approx",
+ "num-complex",
+ "num-traits",
+ "paste",
+ "wide 0.7.33",
+]
+
+[[package]]
+name = "simd-adler32"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "slotmap"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdd58c3c93c3d278ca835519292445cb4b0d4dc59ccfdf7ceadaab3f8aeb4038"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "socket2"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "spirv"
+version = "0.4.0+sdk-1.4.341.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9571ea910ebd84c86af4b3ed27f9dbdc6ad06f17c5f96146b2b671e2976744f"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "static_assertions"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
+[[package]]
+name = "tar"
+version = "0.4.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
+dependencies = [
+ "filetime",
+ "libc",
+ "xattr",
+]
+
+[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl 2.0.18",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "time"
+version = "0.3.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
+dependencies = [
+ "deranged",
+ "num-conv",
+ "powerfmt",
+ "serde_core",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
+
+[[package]]
+name = "time-macros"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
+[[package]]
+name = "tinyvec"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.51.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "pin-project-lite",
+ "socket2",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+
+[[package]]
+name = "toml"
+version = "1.1.2+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee"
+dependencies = [
+ "indexmap",
+ "serde_core",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_parser",
+ "toml_writer",
+ "winnow",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "1.1.1+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "toml_parser"
+version = "1.1.2+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
+dependencies = [
+ "winnow",
+]
+
+[[package]]
+name = "toml_writer"
+version = "1.1.1+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db"
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
+dependencies = [
+ "bitflags",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "iri-string",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
+dependencies = [
+ "pin-project-lite",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
+[[package]]
+name = "typenum"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-width"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "url"
+version = "2.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.2+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.68"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags",
+ "hashbrown 0.15.5",
+ "indexmap",
+ "semver",
+]
+
+[[package]]
+name = "wayland-sys"
+version = "0.31.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8eab23fefc9e41f8e841df4a9c707e8a8c4ed26e944ef69297184de2785e3be"
+dependencies = [
+ "dlib",
+ "log",
+ "once_cell",
+ "pkg-config",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "webpki-root-certs"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "wgpu"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72c239a9a747bbd379590985bac952c2e53cb19873f7072b3370c6a6a8e06837"
+dependencies = [
+ "arrayvec",
+ "bitflags",
+ "bytemuck",
+ "cfg-if",
+ "cfg_aliases",
+ "document-features",
+ "hashbrown 0.16.1",
+ "js-sys",
+ "log",
+ "naga",
+ "parking_lot",
+ "portable-atomic",
+ "profiling",
+ "raw-window-handle",
+ "smallvec",
+ "static_assertions",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+ "wgpu-core",
+ "wgpu-hal",
+ "wgpu-types",
+]
+
+[[package]]
+name = "wgpu-core"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e80ac6cf1895df6342f87d975162108f9d98772a0d74bc404ab7304ac29469e"
+dependencies = [
+ "arrayvec",
+ "bit-set 0.9.1",
+ "bit-vec 0.9.1",
+ "bitflags",
+ "bytemuck",
+ "cfg_aliases",
+ "document-features",
+ "hashbrown 0.16.1",
+ "indexmap",
+ "log",
+ "naga",
+ "once_cell",
+ "parking_lot",
+ "portable-atomic",
+ "profiling",
+ "raw-window-handle",
+ "rustc-hash 1.1.0",
+ "smallvec",
+ "thiserror 2.0.18",
+ "wgpu-core-deps-apple",
+ "wgpu-core-deps-emscripten",
+ "wgpu-core-deps-windows-linux-android",
+ "wgpu-hal",
+ "wgpu-naga-bridge",
+ "wgpu-types",
+]
+
+[[package]]
+name = "wgpu-core-deps-apple"
+version = "29.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43acd053312501689cd92a01a9638d37f3e41a5fd9534875efa8917ee2d11ac0"
+dependencies = [
+ "wgpu-hal",
+]
+
+[[package]]
+name = "wgpu-core-deps-emscripten"
+version = "29.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef043bf135cc68b6f667c55ff4e345ce2b5924d75bad36a47921b0287ca4b24a"
+dependencies = [
+ "wgpu-hal",
+]
+
+[[package]]
+name = "wgpu-core-deps-windows-linux-android"
+version = "29.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "725d5c006a8c02967b6d93ef04f6537ec4593313e330cfe86d9d3f946eb90f28"
+dependencies = [
+ "wgpu-hal",
+]
+
+[[package]]
+name = "wgpu-hal"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89a47aef47636562f3937285af4c44b4b5b404b46577471411cc5313a921da7e"
+dependencies = [
+ "android_system_properties",
+ "arrayvec",
+ "ash",
+ "bit-set 0.9.1",
+ "bitflags",
+ "block2",
+ "bytemuck",
+ "cfg-if",
+ "cfg_aliases",
+ "glow",
+ "glutin_wgl_sys",
+ "gpu-allocator",
+ "gpu-descriptor",
+ "hashbrown 0.16.1",
+ "js-sys",
+ "khronos-egl",
+ "libc",
+ "libloading 0.8.9",
+ "log",
+ "naga",
+ "ndk-sys",
+ "objc2",
+ "objc2-core-foundation",
+ "objc2-foundation",
+ "objc2-metal",
+ "objc2-quartz-core",
+ "once_cell",
+ "ordered-float",
+ "parking_lot",
+ "portable-atomic",
+ "portable-atomic-util",
+ "profiling",
+ "range-alloc",
+ "raw-window-handle",
+ "raw-window-metal",
+ "renderdoc-sys",
+ "smallvec",
+ "thiserror 2.0.18",
+ "wasm-bindgen",
+ "wayland-sys",
+ "web-sys",
+ "wgpu-naga-bridge",
+ "wgpu-types",
+ "windows",
+ "windows-core",
+]
+
+[[package]]
+name = "wgpu-naga-bridge"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b4684f4410da0cf95a4cb63bb5edaac022461dedb6adf0b64d0d9b5f6890d51"
+dependencies = [
+ "naga",
+ "wgpu-types",
+]
+
+[[package]]
+name = "wgpu-types"
+version = "29.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec2675540fb1a5cfa5ef122d3d5f390e2c75711a0b946410f2d6ac3a0f77d1f6"
+dependencies = [
+ "bitflags",
+ "bytemuck",
+ "js-sys",
+ "log",
+ "raw-window-handle",
+ "web-sys",
+]
+
+[[package]]
+name = "wide"
+version = "0.7.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ce5da8ecb62bcd8ec8b7ea19f69a51275e91299be594ea5cc6ef7819e16cd03"
+dependencies = [
+ "bytemuck",
+ "safe_arch 0.7.4",
+]
+
+[[package]]
+name = "wide"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "198f6abc41fab83526d10880fa5c17e2b4ee44e763949b4bb34e2fd1e8ca48e4"
+dependencies = [
+ "bytemuck",
+ "safe_arch 1.0.0",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "windows"
+version = "0.62.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580"
+dependencies = [
+ "windows-collections",
+ "windows-core",
+ "windows-future",
+ "windows-numerics",
+]
+
+[[package]]
+name = "windows-collections"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610"
+dependencies = [
+ "windows-core",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.62.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
+dependencies = [
+ "windows-implement",
+ "windows-interface",
+ "windows-link",
+ "windows-result",
+ "windows-strings",
+]
+
+[[package]]
+name = "windows-future"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb"
+dependencies = [
+ "windows-core",
+ "windows-link",
+ "windows-threading",
+]
+
+[[package]]
+name = "windows-implement"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "windows-interface"
+version = "0.59.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-numerics"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26"
+dependencies = [
+ "windows-core",
+ "windows-link",
+]
+
+[[package]]
+name = "windows-result"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-strings"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.2",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
+dependencies = [
+ "windows-targets 0.53.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.2",
+ "windows_aarch64_msvc 0.42.2",
+ "windows_i686_gnu 0.42.2",
+ "windows_i686_msvc 0.42.2",
+ "windows_x86_64_gnu 0.42.2",
+ "windows_x86_64_gnullvm 0.42.2",
+ "windows_x86_64_msvc 0.42.2",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm 0.52.6",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.53.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
+dependencies = [
+ "windows-link",
+ "windows_aarch64_gnullvm 0.53.1",
+ "windows_aarch64_msvc 0.53.1",
+ "windows_i686_gnu 0.53.1",
+ "windows_i686_gnullvm 0.53.1",
+ "windows_i686_msvc 0.53.1",
+ "windows_x86_64_gnu 0.53.1",
+ "windows_x86_64_gnullvm 0.53.1",
+ "windows_x86_64_msvc 0.53.1",
+]
+
+[[package]]
+name = "windows-threading"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.42.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
+
+[[package]]
+name = "winnow"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5"
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck",
+ "indexmap",
+ "prettyplease",
+ "syn",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "indexmap",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
+[[package]]
+name = "writeable"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
+
+[[package]]
+name = "wyz"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
+dependencies = [
+ "tap",
+]
+
+[[package]]
+name = "xattr"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
+dependencies = [
+ "libc",
+ "rustix",
+]
+
+[[package]]
+name = "xml-rs"
+version = "0.8.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f"
+
+[[package]]
+name = "xz2"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
+dependencies = [
+ "lzma-sys",
+]
+
+[[package]]
+name = "yoke"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "zerofrom"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+
+[[package]]
+name = "zerotrie"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
diff --git a/scripts/native_bench/bench_pecos/Cargo.toml b/scripts/native_bench/bench_pecos/Cargo.toml
new file mode 100644
index 000000000..ee5e854f7
--- /dev/null
+++ b/scripts/native_bench/bench_pecos/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "bench-pecos"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+# An empty [workspace] table makes this package its own workspace root, so
+# Cargo does not try to attach it to a workspace in a parent directory.
+[workspace]
+
+# The binary's source file sits next to this manifest instead of under src/.
+[[bin]]
+name = "bench_pecos"
+path = "main.rs"
+
+# CPU-only by default; each feature enables exactly one optional backend crate.
+[features]
+default = []
+gpu = ["dep:pecos-gpu-sims"]
+cuquantum = ["dep:pecos-cuquantum"]
+
+# All PECOS crates are taken from this repository by relative path.
+[dependencies]
+pecos-simulators = { path = "../../../crates/pecos-simulators", features = ["parallel"] }
+pecos-core = { path = "../../../crates/pecos-core" }
+pecos-gpu-sims = { path = "../../../crates/pecos-gpu-sims", optional = true }
+pecos-cuquantum = { path = "../../../crates/pecos-cuquantum", optional = true }
+
+# Benchmark builds trade compile time for runtime speed: full optimization,
+# whole-program ("fat") LTO and a single codegen unit.
+[profile.release]
+opt-level = 3
+lto = "fat"
+codegen-units = 1
+
+# Custom profile (selected with `--profile native`) that currently adds no
+# overrides on top of `release`.
+# NOTE(review): presumably paired with target-specific RUSTFLAGS by the
+# calling script; confirm.
+[profile.native]
+inherits = "release"
diff --git a/scripts/native_bench/bench_pecos/main.rs b/scripts/native_bench/bench_pecos/main.rs
new file mode 100644
index 000000000..2e4ee4991
--- /dev/null
+++ b/scripts/native_bench/bench_pecos/main.rs
@@ -0,0 +1,961 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under
+// the License.
+
+//! Standalone PECOS benchmark using the same circuits and timing
+//! methodology as `bench_quest.c`, for direct apples-to-apples comparison.
+//!
+//! CPU-only by default. Enable GPU backends with feature flags:
+//!   --features gpu        (GpuStateVec via wgpu, f32)
+//!   --features cuquantum  (CuStateVec via cuQuantum, f64)
+
+use pecos_core::{Angle64, QubitId};
+use pecos_simulators::{
+    ArbitraryRotationGateable, CliffordGateable, DensityMatrix, QuantumSimulator, StateVecSoA,
+};
+use std::hint::black_box;
+use std::time::Instant;
+
+#[cfg(feature = "gpu")]
+use pecos_gpu_sims::{GpuStateVec32, GpuStateVec64};
+
+#[cfg(feature = "cuquantum")]
+use pecos_cuquantum::CuStateVec;
+
+// ---------------------------------------------------------------------------
+// Timing helpers
+// ---------------------------------------------------------------------------
+
+/// Returns the median of `vals`, sorting the slice in place as a side effect.
+///
+/// For an even number of samples the two middle values are averaged. An empty
+/// slice yields `f64::NAN` rather than panicking, and samples are ordered with
+/// `f64::total_cmp`, so a NaN sample cannot make the sort panic.
+fn median(vals: &mut [f64]) -> f64 {
+    // `total_cmp` is a total order over f64; `partial_cmp` returns `None`
+    // when a NaN is involved, which would abort the whole benchmark run.
+    vals.sort_unstable_by(f64::total_cmp);
+    let mid = vals.len() / 2;
+    match vals.len() {
+        0 => f64::NAN,
+        n if n % 2 == 1 => vals[mid],
+        _ => (vals[mid - 1] + vals[mid]) / 2.0,
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Generic circuit runner for any simulator
+// ---------------------------------------------------------------------------
+
+/// Applies `num_layers` identical layers to `sim`: an H followed by an
+/// RZ(0.1 rad) on every qubit, then a CX on each adjacent pair `(q, q + 1)`.
+///
+/// With fewer than two qubits there are no adjacent pairs, so the two-qubit
+/// step does nothing.
+fn run_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
+    sim: &mut S,
+    num_qubits: usize,
+    num_layers: usize,
+) {
+    let angle = Angle64::from_radians(0.1);
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+    for _layer in 0..num_layers {
+        for q in 0..num_qubits {
+            sim.h(&[QubitId(q)]);
+            sim.rz(angle, &[QubitId(q)]);
+        }
+        for q in 0..num_pairs {
+            sim.cx(&[(QubitId(q), QubitId(q + 1))]);
+        }
+    }
+}
+
+/// Circuit using SXX two-qubit gates (tests RXX shader path).
+///
+/// Each of the `num_layers` layers applies an H and an RZ(0.1 rad) to every
+/// qubit, then an SXX to each adjacent pair `(q, q + 1)`. With fewer than two
+/// qubits the two-qubit step does nothing.
+fn run_sxx_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
+    sim: &mut S,
+    num_qubits: usize,
+    num_layers: usize,
+) {
+    let angle = Angle64::from_radians(0.1);
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+    for _layer in 0..num_layers {
+        for q in 0..num_qubits {
+            sim.h(&[QubitId(q)]);
+            sim.rz(angle, &[QubitId(q)]);
+        }
+        for q in 0..num_pairs {
+            sim.sxx(&[(QubitId(q), QubitId(q + 1))]);
+        }
+    }
+}
+
+/// Circuit using RXX two-qubit gates (for RXX parallel path validation).
+///
+/// Each of the `num_layers` layers applies an H and an RZ(0.1 rad) to every
+/// qubit, then an RXX with the same 0.1 rad angle to each adjacent pair
+/// `(q, q + 1)`. With fewer than two qubits the two-qubit step does nothing.
+fn run_rxx_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
+    sim: &mut S,
+    num_qubits: usize,
+    num_layers: usize,
+) {
+    let angle = Angle64::from_radians(0.1);
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+    for _layer in 0..num_layers {
+        for q in 0..num_qubits {
+            sim.h(&[QubitId(q)]);
+            sim.rz(angle, &[QubitId(q)]);
+        }
+        for q in 0..num_pairs {
+            sim.rxx(angle, &[(QubitId(q), QubitId(q + 1))]);
+        }
+    }
+}
+
+/// Circuit using RYY two-qubit gates (for RYY parallel path validation).
+///
+/// Each of the `num_layers` layers applies an H and an RZ(0.1 rad) to every
+/// qubit, then an RYY with the same 0.1 rad angle to each adjacent pair
+/// `(q, q + 1)`. With fewer than two qubits the two-qubit step does nothing.
+fn run_ryy_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
+    sim: &mut S,
+    num_qubits: usize,
+    num_layers: usize,
+) {
+    let angle = Angle64::from_radians(0.1);
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+    for _layer in 0..num_layers {
+        for q in 0..num_qubits {
+            sim.h(&[QubitId(q)]);
+            sim.rz(angle, &[QubitId(q)]);
+        }
+        for q in 0..num_pairs {
+            sim.ryy(angle, &[(QubitId(q), QubitId(q + 1))]);
+        }
+    }
+}
+
+/// Circuit using CZ two-qubit gates (for CZ parallel path validation).
+///
+/// Each of the `num_layers` layers applies an H and an RZ(0.1 rad) to every
+/// qubit, then a CZ to each adjacent pair `(q, q + 1)`. With fewer than two
+/// qubits the two-qubit step does nothing.
+fn run_cz_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
+    sim: &mut S,
+    num_qubits: usize,
+    num_layers: usize,
+) {
+    let angle = Angle64::from_radians(0.1);
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+    for _layer in 0..num_layers {
+        for q in 0..num_qubits {
+            sim.h(&[QubitId(q)]);
+            sim.rz(angle, &[QubitId(q)]);
+        }
+        for q in 0..num_pairs {
+            sim.cz(&[(QubitId(q), QubitId(q + 1))]);
+        }
+    }
+}
+
+/// Circuit using RZZ two-qubit gates (for RZZ parallel path validation).
+///
+/// Each of the `num_layers` layers applies an H and an RZ(0.1 rad) to every
+/// qubit, then an RZZ with the same 0.1 rad angle to each adjacent pair
+/// `(q, q + 1)`. With fewer than two qubits the two-qubit step does nothing.
+fn run_rzz_circuit<S: CliffordGateable + ArbitraryRotationGateable>(
+    sim: &mut S,
+    num_qubits: usize,
+    num_layers: usize,
+) {
+    let angle = Angle64::from_radians(0.1);
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+    for _layer in 0..num_layers {
+        for q in 0..num_qubits {
+            sim.h(&[QubitId(q)]);
+            sim.rz(angle, &[QubitId(q)]);
+        }
+        for q in 0..num_pairs {
+            sim.rzz(angle, &[(QubitId(q), QubitId(q + 1))]);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// CPU StateVecSoA benchmarks
+// ---------------------------------------------------------------------------
+
+/// Times the layered H/RZ/CX circuit (`run_circuit`) on a CPU `StateVecSoA`
+/// and prints the median wall-clock time in microseconds.
+///
+/// The simulator is created once with the requested `fusion` and `parallel`
+/// settings and reset before every repetition; the reset is outside the
+/// timed region.
+fn bench_circuit(
+    num_qubits: usize,
+    num_layers: usize,
+    reps: usize,
+    fusion: bool,
+    parallel: bool,
+) {
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(parallel);
+    sim.set_fusion(fusion);
+
+    let mut samples: Vec<f64> = (0..reps)
+        .map(|_| {
+            sim.reset();
+            let start = Instant::now();
+            run_circuit(&mut sim, num_qubits, num_layers);
+            // Keep the optimizer from discarding the simulation work.
+            black_box(&sim);
+            start.elapsed().as_secs_f64()
+        })
+        .collect();
+
+    // Fixed-width column label for the fusion/parallel combination.
+    let tag = if fusion {
+        if parallel { "fu+par " } else { "fused  " }
+    } else if parallel {
+        "nf+par "
+    } else {
+        "nofuse "
+    };
+    let median_us = median(&mut samples) * 1e6;
+    println!("circuit  {num_qubits:2}q {num_layers:2}l  {tag}{med:12.3} us", med = median_us);
+}
+
+/// Times one two-qubit-gate circuit variant on a CPU `StateVecSoA` with
+/// fusion enabled and prints the median time in microseconds.
+///
+/// `label` is printed verbatim as the row name; `run` is the circuit function
+/// that is timed. The simulator is reset before each repetition, outside the
+/// timed region.
+fn bench_2q_circuit(
+    label: &str,
+    run: fn(&mut StateVecSoA, usize, usize),
+    num_qubits: usize,
+    num_layers: usize,
+    reps: usize,
+    parallel: bool,
+) {
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(parallel);
+    sim.set_fusion(true);
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        sim.reset();
+        let start = Instant::now();
+        run(&mut sim, num_qubits, num_layers);
+        // Keep the optimizer from discarding the simulation work.
+        black_box(&sim);
+        samples.push(start.elapsed().as_secs_f64());
+    }
+
+    let median_us = median(&mut samples) * 1e6;
+    let tag = if parallel { "fu+par " } else { "fused  " };
+    println!("{label}  {num_qubits:2}q {num_layers:2}l  {tag}{med:12.3} us", med = median_us);
+}
+
+/// Times `iters` sweeps of an H gate over every qubit on a serial, unfused
+/// `StateVecSoA` and prints the median time per repetition in microseconds.
+///
+/// The printed figure covers all `iters * num_qubits` gate calls of one
+/// repetition. The state is not reset between repetitions.
+fn bench_gate_h(num_qubits: usize, iters: usize, reps: usize) {
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(false);
+    sim.set_fusion(false);
+
+    let mut samples: Vec<f64> = (0..reps)
+        .map(|_| {
+            let start = Instant::now();
+            for _sweep in 0..iters {
+                for qubit in 0..num_qubits {
+                    sim.h(&[QubitId(qubit)]);
+                }
+            }
+            // Keep the optimizer from discarding the simulation work.
+            black_box(&sim);
+            start.elapsed().as_secs_f64()
+        })
+        .collect();
+
+    let median_us = median(&mut samples) * 1e6;
+    println!("gate     H        {med:12.3} us", med = median_us);
+}
+
+/// Times `iters` sweeps of an X gate over every qubit on a serial, unfused
+/// `StateVecSoA` and prints the median time per repetition in microseconds.
+///
+/// The printed figure covers all `iters * num_qubits` gate calls of one
+/// repetition. The state is not reset between repetitions.
+fn bench_gate_x(num_qubits: usize, iters: usize, reps: usize) {
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(false);
+    sim.set_fusion(false);
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        let start = Instant::now();
+        for _sweep in 0..iters {
+            for qubit in 0..num_qubits {
+                sim.x(&[QubitId(qubit)]);
+            }
+        }
+        // Keep the optimizer from discarding the simulation work.
+        black_box(&sim);
+        samples.push(start.elapsed().as_secs_f64());
+    }
+
+    let median_us = median(&mut samples) * 1e6;
+    println!("gate     X        {med:12.3} us", med = median_us);
+}
+
+/// Times `iters` repeated CX calls on the single qubit pair `(c, t_q)` of an
+/// unfused `StateVecSoA` and prints the median cost per call in microseconds.
+///
+/// `parallel` toggles the simulator's parallel mode and selects the `par` /
+/// `ser` tag in the output row. The state is not reset between repetitions.
+fn bench_gate_cx_pair(
+    num_qubits: usize,
+    c: usize,
+    t_q: usize,
+    iters: usize,
+    reps: usize,
+    parallel: bool,
+) {
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(parallel);
+    sim.set_fusion(false);
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        let start = Instant::now();
+        for _call in 0..iters {
+            sim.cx(&[(QubitId(c), QubitId(t_q))]);
+        }
+        // Keep the optimizer from discarding the simulation work.
+        black_box(&sim);
+        samples.push(start.elapsed().as_secs_f64());
+    }
+
+    // Median time of one repetition, then averaged over its `iters` calls.
+    let total_us = median(&mut samples) * 1e6;
+    let per_call_us = total_us / (iters as f64);
+    let tag = if parallel { "par " } else { "ser " };
+    println!("cx  N={num_qubits:2} ({c},{t_q}) {tag} per_call={per_call_us:10.2} us");
+}
+
+/// Times `iters` sweeps of CX over every adjacent pair `(q, q + 1)` on a
+/// serial, unfused `StateVecSoA` and prints the median time per repetition in
+/// microseconds.
+///
+/// The printed figure covers all CX calls of one repetition. The state is not
+/// reset between repetitions. With fewer than two qubits there are no pairs
+/// and no gate is applied.
+fn bench_gate_cx(num_qubits: usize, iters: usize, reps: usize) {
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(false);
+    sim.set_fusion(false);
+    let mut times = vec![0.0_f64; reps];
+    // `saturating_sub` keeps the pair range empty for `num_qubits == 0`
+    // instead of underflowing `usize`.
+    let num_pairs = num_qubits.saturating_sub(1);
+
+    for t in &mut times {
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            for q in 0..num_pairs {
+                sim.cx(&[(QubitId(q), QubitId(q + 1))]);
+            }
+        }
+        // Keep the optimizer from discarding the simulation work.
+        black_box(&sim);
+        *t = t0.elapsed().as_secs_f64();
+    }
+
+    println!("gate     CX       {med:12.3} us", med = median(&mut times) * 1e6);
+}
+
+/// Times `iters` sweeps of an RZ(0.1 rad) gate over every qubit on a serial,
+/// unfused `StateVecSoA` and prints the median time per repetition in
+/// microseconds.
+///
+/// The printed figure covers all `iters * num_qubits` gate calls of one
+/// repetition. The state is not reset between repetitions.
+fn bench_gate_rz(num_qubits: usize, iters: usize, reps: usize) {
+    let theta = Angle64::from_radians(0.1);
+    let mut sim = StateVecSoA::new(num_qubits);
+    sim.set_parallel(false);
+    sim.set_fusion(false);
+
+    let mut samples: Vec<f64> = (0..reps)
+        .map(|_| {
+            let start = Instant::now();
+            for _sweep in 0..iters {
+                for qubit in 0..num_qubits {
+                    sim.rz(theta, &[QubitId(qubit)]);
+                }
+            }
+            // Keep the optimizer from discarding the simulation work.
+            black_box(&sim);
+            start.elapsed().as_secs_f64()
+        })
+        .collect();
+
+    let median_us = median(&mut samples) * 1e6;
+    println!("gate     RZ       {med:12.3} us", med = median_us);
+}
+
+// ---------------------------------------------------------------------------
+// Density matrix benchmark
+// ---------------------------------------------------------------------------
+
+/// Times the layered H/RZ/CX circuit (`run_circuit`) on a `DensityMatrix`
+/// simulator and prints the median wall-clock time in microseconds.
+///
+/// When `parallel` is set, parallel mode is switched on for the simulator's
+/// underlying state vector; otherwise its default setting is left alone. The
+/// simulator is reset before each repetition, outside the timed region.
+fn bench_dm_circuit(num_qubits: usize, num_layers: usize, reps: usize, parallel: bool) {
+    let mut sim = DensityMatrix::new(num_qubits);
+    if parallel {
+        sim.state_vector_mut().set_parallel(true);
+    }
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        sim.reset();
+        let start = Instant::now();
+        run_circuit(&mut sim, num_qubits, num_layers);
+        // Keep the optimizer from discarding the simulation work.
+        black_box(&sim);
+        samples.push(start.elapsed().as_secs_f64());
+    }
+
+    let median_us = median(&mut samples) * 1e6;
+    let tag = if parallel { "par   " } else { "serial" };
+    println!("dm_circ  {num_qubits:2}q {num_layers:2}l  {tag}  {med:12.3} us", med = median_us);
+}
+
+// ---------------------------------------------------------------------------
+// GPU benchmarks: GpuStateVec (wgpu, f32)
+// ---------------------------------------------------------------------------
+
+/// Times the layered H/RZ/CX circuit (`run_circuit`) on a `GpuStateVec32`
+/// and prints the median wall-clock time in microseconds.
+///
+/// If the simulator cannot be created, the error is reported on stderr and
+/// the configuration is skipped. Each repetition resets the simulator and
+/// calls `sync()` before the timer starts, then calls `sync()` again before
+/// the timer is read.
+#[cfg(feature = "gpu")]
+fn bench_gpu_circuit(num_qubits: usize, num_layers: usize, reps: usize) {
+    let mut sim = match GpuStateVec32::new(num_qubits as u32) {
+        Ok(s) => s,
+        Err(e) => {
+            // Name the concrete type, matching the other GPU error messages.
+            eprintln!("GpuStateVec32({num_qubits}): {e}");
+            return;
+        }
+    };
+    let mut times = vec![0.0_f64; reps];
+
+    for t in &mut times {
+        sim.reset();
+        // NOTE(review): `sync()` presumably blocks until queued GPU work has
+        // finished, keeping the reset out of the timed region; confirm.
+        sim.sync();
+        let t0 = Instant::now();
+        run_circuit(&mut sim, num_qubits, num_layers);
+        sim.sync();
+        *t = t0.elapsed().as_secs_f64();
+    }
+
+    let med = median(&mut times);
+    println!("circuit  {num_qubits:2}q {num_layers:2}l  {med:12.3} us", med = med * 1e6);
+}
+
+#[cfg(feature = "gpu")]
+fn run_gpu_benchmarks(reps: usize) { // GpuStateVec32 suite (plus one f64 mz probe); prints one result line per configuration
+    println!();
+    println!("=== PECOS GpuStateVec (wgpu, f32) standalone benchmarks ===");
+    println!();
+    println!("-- Layered circuits (median of {reps} runs) --");
+
+    let configs = [
+        (10, 20),
+        (14, 20),
+        (18, 20),
+        (20, 20),
+        (22, 20),
+        (24, 10),
+        (26, 5),
+    ];
+
+    for (nq, nl) in configs {
+        bench_gpu_circuit(nq, nl, reps);
+    }
+
+    // Measurement-heavy circuit: one mz call per qubit after a layer of H (intended to exercise the CPU measurement fast path)
+    println!();
+    println!("-- Gate+Measure circuit (median of {reps} runs) --");
+    for nq in [4, 8, 10] {
+        let mut sim = match GpuStateVec32::new(nq as u32) {
+            Ok(s) => s,
+            Err(e) => {
+                eprintln!("GpuStateVec32({nq}): {e}");
+                continue;
+            }
+        };
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            // 50 rounds of: H on all qubits, then measure all qubits (one mz call per qubit)
+            for _round in 0..50 {
+                for q in 0..nq {
+                    sim.h(&[QubitId(q)]);
+                }
+                for q in 0..nq {
+                    sim.mz(&[QubitId(q)]);
+                }
+            }
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        println!("mz_circ  {nq:2}q 50r  {med:12.3} us", med = med * 1e6);
+    }
+
+    // Check if small-state workloads are limited by workgroup size (thread utilization).
+    println!();
+    println!("-- Small-state gate throughput (N=4..10, 100 gates, {reps} runs) --");
+    for nq in [4u32, 6, 8, 10] {
+        let mut sim = GpuStateVec32::new(nq).unwrap(); // NOTE(review): panics on failure, unlike the match + skip used in the sections above
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            for _iter in 0..50 {
+                for i in 0..100 {
+                    sim.h(&[QubitId((i % nq as usize))]);
+                }
+                let _s = sim.state(); // state readback after every batch of 100 gates
+            }
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        let per_iter_us = med * 1e6 / 50.0;
+        let num_amps = 1u64 << nq;
+        let threads_used = num_amps / 2; // num pairs
+        let thread_util = (threads_used as f64 / 256.0 * 100.0).min(100.0); // NOTE(review): 256 is presumably the workgroup size -- confirm
+        println!("N={nq:2}  amps={num_amps:4}  per_iter={per_iter_us:7.1}us  thread_util={thread_util:5.1}%");
+    }
+
+    // Persistent vs dispatched path: measure how the per-iteration cost scales with the number of queued gates
+    println!();
+    println!("-- Persistent kernel overhead vs gate count (N=10, persistent) ({reps} runs) --");
+    let nq = 10;
+    for num_gates in [1, 2, 5, 10, 20, 50, 100, 200] {
+        let mut sim = GpuStateVec32::new(nq).unwrap();
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            for _iter in 0..50 {
+                // Queue N gates (H on qubit i % nq)
+                for i in 0..num_gates {
+                    sim.h(&[QubitId((i % nq as usize))]);
+                }
+                // Force flush by reading state
+                let _s = sim.state();
+            }
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        let per_iter_us = med * 1e6 / 50.0;
+        let per_gate_us = per_iter_us / (num_gates as f64);
+        println!("gates={num_gates:3}  per_iter={per_iter_us:7.1}us  per_gate={per_gate_us:5.2}us");
+    }
+
+    // Same gate count test at N=14 (above persistent_max_qubits on this GPU -> uses dispatched path)
+    println!();
+    println!("-- Dispatched path: gate count overhead (N=14, dispatched) ({reps} runs) --");
+    let nq = 14;
+    for num_gates in [1, 2, 5, 10, 20, 50, 100, 200] {
+        let mut sim = GpuStateVec32::new(nq).unwrap();
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            for _iter in 0..50 {
+                for i in 0..num_gates {
+                    sim.h(&[QubitId((i % nq as usize))]);
+                }
+                let _s = sim.state();
+            }
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        let per_iter_us = med * 1e6 / 50.0;
+        let per_gate_us = per_iter_us / (num_gates as f64);
+        println!("gates={num_gates:3}  per_iter={per_iter_us:7.1}us  per_gate={per_gate_us:5.2}us");
+    }
+
+    // Measure time for a bare state readback (for calibration cost analysis); a single timing of 20 readbacks, not a median
+    println!();
+    println!("-- Calibration cost: state readback time at various N --");
+    for nq in [8, 10, 12, 14, 16, 18, 20] {
+        let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+        sim.reset();
+        sim.sync();
+        // Warm up with two untimed readbacks
+        let _ = sim.state();
+        let _ = sim.state();
+        // Measure 20 readbacks and report the average per readback
+        let t0 = Instant::now();
+        for _ in 0..20 {
+            let _state = sim.state();
+        }
+        let total = t0.elapsed().as_secs_f64();
+        let per_readback_us = total * 1e6 / 20.0;
+        let state_bytes = (1u64 << nq) * 8; // NOTE(review): 8 bytes per amplitude, presumably two f32 components -- confirm
+        let effective_gbps = (state_bytes as f64) / (per_readback_us * 1e-6) / 1e9;
+        println!("N={nq:2}  readback={per_readback_us:8.1}us  state_size={kb:7.1}KB  effective_bw={gbps:.2}GB/s",
+            kb = state_bytes as f64 / 1024.0, gbps = effective_gbps);
+    }
+
+    // (N, M) grid benchmark: probe crossover surface for path selection.
+    // Uses mz_gpu_sequential() and mz_cpu_batch() directly to bypass the
+    // path-selection lookup table -- otherwise both columns would converge.
+    println!();
+    println!("-- (N, M) f32 batch vs sequential mz probe (median of {reps} runs, 10 rounds) --");
+    for nq in [10, 12, 14, 16, 18, 20] {
+        for &m in &[1, 2, 4, 8, nq / 2, nq] { // NOTE(review): for nq = 16 the list holds 8 twice, so M=8 is benchmarked twice
+            let m = m.min(nq); // no-op for the lists above: the largest entry is nq itself
+            if m == 0 { continue; } // never taken here: nq >= 10, so every entry is >= 1
+            let ancillas: Vec<QubitId> = (0..m).map(QubitId).collect();
+
+            // Sequential GPU path (forced)
+            let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+            let mut times = vec![0.0_f64; reps];
+            for t in &mut times {
+                sim.reset();
+                sim.sync();
+                let t0 = Instant::now();
+                for _round in 0..10 {
+                    for q in 0..nq { sim.h(&[QubitId(q)]); }
+                    sim.mz_gpu_sequential(&ancillas);
+                }
+                sim.sync();
+                *t = t0.elapsed().as_secs_f64();
+            }
+            let seq_med = median(&mut times);
+
+            // CPU batch path (forced), timed on a freshly constructed simulator
+            let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+            let mut times = vec![0.0_f64; reps];
+            for t in &mut times {
+                sim.reset();
+                sim.sync();
+                let t0 = Instant::now();
+                for _round in 0..10 {
+                    for q in 0..nq { sim.h(&[QubitId(q)]); }
+                    sim.mz_cpu_batch(&ancillas);
+                }
+                sim.sync();
+                *t = t0.elapsed().as_secs_f64();
+            }
+            let batch_med = median(&mut times);
+
+            let ratio = seq_med / batch_med; // > 1.0 means the batch path was faster
+            let winner = if ratio > 1.0 { "BATCH" } else { "SEQ  " };
+            println!(
+                "N={nq:2} M={m:2}  seq={seq:10.1}us  batch={bat:10.1}us  ratio={ratio:.2}  {winner}",
+                seq = seq_med * 1e6,
+                bat = batch_med * 1e6,
+            );
+        }
+    }
+
+    // Same probe for f64 -- transfers and CPU loops are 2x f32 due to wider amps.
+    println!();
+    println!("-- (N, M) f64 batch vs sequential mz probe (median of {reps} runs, 10 rounds) --");
+    for nq in [10, 12, 14, 16, 18, 20] {
+        for &m in &[1, 2, 4, 8, nq / 2, nq] {
+            let m = m.min(nq);
+            if m == 0 { continue; }
+            let ancillas: Vec<QubitId> = (0..m).map(QubitId).collect();
+
+            let mut sim = GpuStateVec64::new(nq as u32).unwrap();
+            let mut times = vec![0.0_f64; reps];
+            for t in &mut times {
+                sim.reset();
+                sim.sync();
+                let t0 = Instant::now();
+                for _round in 0..10 {
+                    for q in 0..nq { sim.h(&[QubitId(q)]); }
+                    sim.mz_gpu_sequential(&ancillas);
+                }
+                sim.sync();
+                *t = t0.elapsed().as_secs_f64();
+            }
+            let seq_med = median(&mut times);
+
+            let mut sim = GpuStateVec64::new(nq as u32).unwrap();
+            let mut times = vec![0.0_f64; reps];
+            for t in &mut times {
+                sim.reset();
+                sim.sync();
+                let t0 = Instant::now();
+                for _round in 0..10 {
+                    for q in 0..nq { sim.h(&[QubitId(q)]); }
+                    sim.mz_cpu_batch(&ancillas);
+                }
+                sim.sync();
+                *t = t0.elapsed().as_secs_f64();
+            }
+            let batch_med = median(&mut times);
+
+            let ratio = seq_med / batch_med;
+            let winner = if ratio > 1.0 { "BATCH" } else { "SEQ  " };
+            println!(
+                "N={nq:2} M={m:2}  seq={seq:10.1}us  batch={bat:10.1}us  ratio={ratio:.2}  {winner}",
+                seq = seq_med * 1e6,
+                bat = batch_med * 1e6,
+            );
+        }
+    }
+
+    // QEC-pattern benchmark: block of gates + batch measurements + conditionals (ancillas are qubits 0..nq/2)
+    println!();
+    println!("-- QEC-pattern: gates + mz + conditional gates (median of {reps} runs) --");
+    for nq in [10, 14, 18] {
+        let ancillas: Vec<QubitId> = (0..nq/2).map(QubitId).collect();
+        let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            for _round in 0..20 {
+                // Block of gates: H on every qubit, then a CX chain over neighbouring qubits
+                for q in 0..nq { sim.h(&[QubitId(q)]); }
+                for q in 0..nq-1 { sim.cx(&[(QubitId(q), QubitId(q+1))]); }
+                // Batch measure ancillas in a single mz call
+                let outcomes = sim.mz(&ancillas);
+                // Conditional gates based on outcome (simulate classical logic)
+                for (i, r) in outcomes.iter().enumerate() { // assumes mz returns outcomes in ancilla order, so index i is qubit i -- confirm
+                    if r.outcome {
+                        sim.z(&[QubitId(i)]);
+                    }
+                }
+            }
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        println!("qec_batch{nq:2}q 20r  {med:12.3} us", med = med * 1e6);
+
+        // Sequential mz (per-ancilla) for comparison, on a freshly constructed simulator
+        let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            for _round in 0..20 {
+                for q in 0..nq { sim.h(&[QubitId(q)]); }
+                for q in 0..nq-1 { sim.cx(&[(QubitId(q), QubitId(q+1))]); }
+                for a in &ancillas {
+                    let outcomes = sim.mz(&[*a]);
+                    if outcomes[0].outcome {
+                        sim.z(&[QubitId(a.0)]);
+                    }
+                }
+            }
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        println!("qec_seq  {nq:2}q 20r  {med:12.3} us", med = med * 1e6);
+    }
+
+    // Batch measurement (measure all qubits at once vs one at a time)
+    println!();
+    println!("-- Batch vs sequential mz (median of {reps} runs) --");
+    for nq in [4, 8, 10] {
+        let all_qubits: Vec<QubitId> = (0..nq).map(QubitId).collect();
+        // Sequential: mz one qubit at a time
+        {
+            let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+            let mut times = vec![0.0_f64; reps];
+            for t in &mut times {
+                sim.reset();
+                sim.sync();
+                let t0 = Instant::now();
+                for _round in 0..50 {
+                    for q in 0..nq { sim.h(&[QubitId(q)]); }
+                    for q in 0..nq { sim.mz(&[QubitId(q)]); }
+                }
+                sim.sync();
+                *t = t0.elapsed().as_secs_f64();
+            }
+            let med = median(&mut times);
+            println!("mz_seq   {nq:2}q 50r  {med:12.3} us", med = med * 1e6);
+        }
+        // Batch: mz all qubits at once
+        {
+            let mut sim = GpuStateVec32::new(nq as u32).unwrap();
+            let mut times = vec![0.0_f64; reps];
+            for t in &mut times {
+                sim.reset();
+                sim.sync();
+                let t0 = Instant::now();
+                for _round in 0..50 {
+                    for q in 0..nq { sim.h(&[QubitId(q)]); }
+                    sim.mz(&all_qubits);
+                }
+                sim.sync();
+                *t = t0.elapsed().as_secs_f64();
+            }
+            let med = median(&mut times);
+            println!("mz_batch {nq:2}q 50r  {med:12.3} us", med = med * 1e6);
+        }
+    }
+
+    // SXX circuit (tests RXX shader vs decomposition); construction errors are logged and the size is skipped
+    println!();
+    println!("-- SXX circuit (median of {reps} runs) --");
+    for (nq, nl) in [(10, 20), (18, 20)] {
+        let mut sim = match GpuStateVec32::new(nq as u32) {
+            Ok(s) => s,
+            Err(e) => {
+                eprintln!("GpuStateVec32({nq}): {e}");
+                continue;
+            }
+        };
+        let mut times = vec![0.0_f64; reps];
+        for t in &mut times {
+            sim.reset();
+            sim.sync();
+            let t0 = Instant::now();
+            run_sxx_circuit(&mut sim, nq, nl);
+            sim.sync();
+            *t = t0.elapsed().as_secs_f64();
+        }
+        let med = median(&mut times);
+        println!("sxx_circ {nq:2}q {nl:2}l  {med:12.3} us", med = med * 1e6);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// GPU benchmarks: GpuStateVec64 (wgpu, f64)
+// ---------------------------------------------------------------------------
+
+/// Median wall-clock time of the layered circuit on `GpuStateVec64`; a failed construction is logged and skipped.
+#[cfg(feature = "gpu")]
+fn bench_gpu64_circuit(num_qubits: usize, num_layers: usize, reps: usize) {
+    let mut gpu = match GpuStateVec64::new(num_qubits as u32) {
+        Ok(created) => created,
+        Err(e) => {
+            eprintln!("GpuStateVec64({num_qubits}): {e}");
+            return;
+        }
+    };
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        gpu.reset();
+        gpu.sync(); // NOTE(review): presumably waits for the reset so it is not timed -- confirm
+        let start = Instant::now();
+        run_circuit(&mut gpu, num_qubits, num_layers);
+        gpu.sync();
+        samples.push(start.elapsed().as_secs_f64());
+    }
+    let med_us = median(&mut samples) * 1e6;
+    println!("circuit  {num_qubits:2}q {num_layers:2}l  {med:12.3} us", med = med_us);
+}
+
+/// Prints the `GpuStateVec64` layered-circuit results, then the `GpuDensityMatrix` layered-circuit
+/// results, each under its own banner. Config tables hold (qubits, layers) pairs.
+#[cfg(feature = "gpu")]
+fn run_gpu64_benchmarks(reps: usize) {
+    const SV_CONFIGS: [(usize, usize); 7] = [
+        (10, 20),
+        (14, 20),
+        (18, 20),
+        (20, 20),
+        (22, 20),
+        (24, 10),
+        (26, 5),
+    ];
+    const DM_CONFIGS: [(usize, usize); 5] = [(6, 20), (8, 20), (10, 20), (12, 10), (13, 5)];
+
+    println!();
+    println!("=== PECOS GpuStateVec64 (wgpu, f64) standalone benchmarks ===");
+    println!();
+    println!("-- Layered circuits (median of {reps} runs) --");
+    for &(num_qubits, num_layers) in &SV_CONFIGS {
+        bench_gpu64_circuit(num_qubits, num_layers, reps);
+    }
+
+    println!();
+    println!("=== PECOS GpuDensityMatrix (Choi on GpuStateVec32) benchmarks ===");
+    println!();
+    println!("-- Density matrix: layered circuits (median of {reps} runs) --");
+    for &(num_qubits, num_layers) in &DM_CONFIGS {
+        bench_gpu_dm_circuit(num_qubits, num_layers, reps);
+    }
+}
+
+/// Median wall-clock time of the layered circuit on `GpuDensityMatrix32`; a failed construction is logged and skipped.
+#[cfg(feature = "gpu")]
+fn bench_gpu_dm_circuit(num_qubits: usize, num_layers: usize, reps: usize) {
+    use pecos_gpu_sims::GpuDensityMatrix32;
+    let mut gpu_dm = match GpuDensityMatrix32::new(num_qubits) {
+        Ok(created) => created,
+        Err(e) => {
+            eprintln!("GpuDensityMatrix({num_qubits}): {e}");
+            return;
+        }
+    };
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        gpu_dm.reset();
+        gpu_dm.sync(); // NOTE(review): presumably waits for the reset so it is not timed -- confirm
+        let start = Instant::now();
+        run_circuit(&mut gpu_dm, num_qubits, num_layers);
+        gpu_dm.sync();
+        samples.push(start.elapsed().as_secs_f64());
+    }
+    let med_us = median(&mut samples) * 1e6;
+    println!("dm_circ  {num_qubits:2}q {num_layers:2}l  {med:12.3} us", med = med_us);
+}
+
+// ---------------------------------------------------------------------------
+// GPU benchmarks: CuStateVec (cuQuantum, f64)
+// ---------------------------------------------------------------------------
+
+/// Median wall-clock time of the layered circuit on `CuStateVec`; a failed construction is logged and skipped.
+#[cfg(feature = "cuquantum")]
+fn bench_cuquantum_circuit(num_qubits: usize, num_layers: usize, reps: usize) {
+    let mut cu = match CuStateVec::new(num_qubits) {
+        Ok(created) => created,
+        Err(e) => {
+            eprintln!("CuStateVec({num_qubits}): {e}");
+            return;
+        }
+    };
+
+    let mut samples: Vec<f64> = Vec::with_capacity(reps);
+    for _ in 0..reps {
+        cu.reset();
+        cu.sync(); // NOTE(review): presumably waits for the reset so it is not timed -- confirm
+        let start = Instant::now();
+        run_circuit(&mut cu, num_qubits, num_layers);
+        cu.sync();
+        samples.push(start.elapsed().as_secs_f64());
+    }
+    let med_us = median(&mut samples) * 1e6;
+    println!("circuit  {num_qubits:2}q {num_layers:2}l  {med:12.3} us", med = med_us);
+}
+
+/// Prints the `CuStateVec` banner, then benchmarks the layered circuit for each (qubits, layers) pair.
+#[cfg(feature = "cuquantum")]
+fn run_cuquantum_benchmarks(reps: usize) {
+    const CONFIGS: [(usize, usize); 7] = [
+        (10, 20),
+        (14, 20),
+        (18, 20),
+        (20, 20),
+        (22, 20),
+        (24, 10),
+        (26, 5),
+    ];
+
+    println!();
+    println!("=== PECOS CuStateVec (cuQuantum, f64) standalone benchmarks ===");
+    println!();
+    println!("-- Layered circuits (median of {reps} runs) --");
+    for &(num_qubits, num_layers) in &CONFIGS {
+        bench_cuquantum_circuit(num_qubits, num_layers, reps);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+/// Entry point: CPU `StateVecSoA` and density-matrix benchmarks, then the feature-gated GPU suites.
+fn main() {
+    let reps = 5;
+
+    println!("=== PECOS StateVecSoA standalone benchmarks ===");
+    println!();
+    println!("-- Layered circuits (median of {reps} runs) --");
+    // All four combinations of the two boolean switches; the first switch toggles fastest.
+    for (num_qubits, num_layers) in [(10, 20), (14, 20), (18, 20), (20, 20), (22, 10), (24, 5)] {
+        for second in [false, true] {
+            for first in [false, true] {
+                bench_circuit(num_qubits, num_layers, reps, first, second);
+            }
+        }
+    }
+
+    println!();
+    println!("-- CX scalar-path overhead at various N (low-qubit scalar fallback) --");
+    for n in [18, 20, 22] {
+        for (qa, qb) in [(0, 1), (2, 3)] {
+            for flag in [false, true] {
+                bench_gate_cx_pair(n, qa, qb, 20, reps, flag);
+            }
+        }
+    }
+
+    println!();
+    println!("-- 2-qubit gate circuits: fused vs fused+parallel at 22q 10l (median of {reps} runs) --");
+    // The cast on the first entry makes every element a plain function pointer of one type.
+    let two_qubit_runners = [
+        ("cz_circ ", run_cz_circuit::<StateVecSoA> as fn(&mut StateVecSoA, usize, usize)),
+        ("rzz_circ", run_rzz_circuit::<StateVecSoA>),
+        ("rxx_circ", run_rxx_circuit::<StateVecSoA>),
+        ("ryy_circ", run_ryy_circuit::<StateVecSoA>),
+    ];
+    for (label, runner) in two_qubit_runners {
+        for parallel in [false, true] {
+            bench_2q_circuit(label, runner, 22, 10, reps, parallel);
+        }
+    }
+
+    println!();
+    println!("-- Individual gates at 18 qubits, 100 iters (median of {reps} runs) --");
+    bench_gate_h(18, 100, reps);
+    bench_gate_x(18, 100, reps);
+    bench_gate_cx(18, 100, reps);
+    bench_gate_rz(18, 100, reps);
+
+    println!();
+    println!("-- Density matrix: layered circuits (median of {reps} runs) --");
+    for (num_qubits, num_layers) in [(6, 20), (8, 20), (10, 20), (12, 10), (13, 5)] {
+        for parallel in [false, true] {
+            bench_dm_circuit(num_qubits, num_layers, reps, parallel);
+        }
+    }
+
+    #[cfg(feature = "gpu")]
+    run_gpu_benchmarks(reps);
+    #[cfg(feature = "gpu")]
+    run_gpu64_benchmarks(reps);
+    #[cfg(feature = "cuquantum")]
+    run_cuquantum_benchmarks(reps);
+}
diff --git a/scripts/native_bench/bench_quest.c b/scripts/native_bench/bench_quest.c
index e5795a873..34b6b6859 100644
--- a/scripts/native_bench/bench_quest.c
+++ b/scripts/native_bench/bench_quest.c
@@ -156,6 +156,27 @@ static void bench_gate_rz(int num_qubits, int iters, int reps) {
     destroyQureg(q);
 }
 
+// ---------------------------------------------------------------------------
+// Density matrix circuit benchmark
+// ---------------------------------------------------------------------------
+
+// Times run_circuit() on a QuEST density-matrix register; prints the median in microseconds.
+static void bench_dm_circuit(int num_qubits, int num_layers, int reps) {
+    Qureg rho = createDensityQureg(num_qubits);
+    double samples[reps];
+    int rep = 0;
+    while (rep < reps) {
+        initZeroState(rho);  // re-initialised before the clock starts
+        const double start = now_sec();
+        run_circuit(rho, num_qubits, num_layers);
+        samples[rep++] = now_sec() - start;
+    }
+
+    const double median_us = median(samples, reps) * 1e6;
+    printf("dm_circ  %2dq %2dl  %12.3f us\n", num_qubits, num_layers, median_us);
+    destroyQureg(rho);
+}
+
 // ---------------------------------------------------------------------------
 // Main
 // ---------------------------------------------------------------------------
@@ -166,23 +187,34 @@ int main(void) {
     int reps = 5;
 
     printf("=== QuEST v4 standalone benchmarks ===\n");
-    printf("\n-- Layered circuits (median of %d runs) --\n", reps);
+    printf("\n-- State vector: layered circuits (median of %d runs) --\n", reps);
 
-    int configs[][2] = {
+    int sv_configs[][2] = {
         {10, 20}, {14, 20}, {18, 20}, {20, 20}, {22, 10}, {24, 5}
     };
-    int n_configs = sizeof(configs) / sizeof(configs[0]);
+    int n_sv = sizeof(sv_configs) / sizeof(sv_configs[0]);
 
-    for (int i = 0; i < n_configs; i++) {
-        bench_circuit(configs[i][0], configs[i][1], reps);
+    for (int i = 0; i < n_sv; i++) {
+        bench_circuit(sv_configs[i][0], sv_configs[i][1], reps);
     }
 
-    printf("\n-- Individual gates at 18 qubits, 100 iters (median of %d runs) --\n", reps);
+    printf("\n-- State vector: individual gates at 18 qubits, 100 iters (median of %d runs) --\n", reps);
     bench_gate_h(18, 100, reps);
     bench_gate_x(18, 100, reps);
     bench_gate_cx(18, 100, reps);
     bench_gate_rz(18, 100, reps);
 
+    printf("\n-- Density matrix: layered circuits (median of %d runs) --\n", reps);
+
+    int dm_configs[][2] = {
+        {6, 20}, {8, 20}, {10, 20}, {12, 10}, {13, 5}
+    };
+    int n_dm = sizeof(dm_configs) / sizeof(dm_configs[0]);
+
+    for (int i = 0; i < n_dm; i++) {
+        bench_dm_circuit(dm_configs[i][0], dm_configs[i][1], reps);
+    }
+
     finalizeQuESTEnv();
     return 0;
 }
diff --git a/scripts/native_bench/bench_qulacs.cpp b/scripts/native_bench/bench_qulacs.cpp
index 4106d31cc..355458949 100644
--- a/scripts/native_bench/bench_qulacs.cpp
+++ b/scripts/native_bench/bench_qulacs.cpp
@@ -21,8 +21,10 @@
 #include <vector>
 
 #include "cppsim/state.hpp"
+#include "cppsim/state_dm.hpp"
 #include "cppsim/gate_factory.hpp"
 #include "csim/update_ops.hpp"
+#include "csim/update_ops_dm.hpp"
 
 // ---------------------------------------------------------------------------
 // Timing helpers
@@ -261,6 +263,42 @@ static void bench_gate_rz_csim(int num_qubits, int iters, int reps) {
     std::printf("gate     RZ       %-10s %12.3f us\n", "csim", median(times) * 1e6);
 }
 
+// ---------------------------------------------------------------------------
+// Density matrix circuit: direct csim dm kernels
+// ---------------------------------------------------------------------------
+
+// Per layer: H then RZ(-0.1) on every qubit, followed by CNOT(q, q + 1) along the chain.
+static void run_dm_circuit_csim(DensityMatrixCpu& state, int num_qubits, int num_layers) {
+    CTYPE* const rho = state.data_c();
+    const ITYPE dim = state.dim;
+    for (int layer = num_layers; layer > 0; --layer) {
+        for (int target = 0; target < num_qubits; ++target) {
+            dm_H_gate(static_cast<UINT>(target), rho, dim);
+            dm_RZ_gate(static_cast<UINT>(target), -0.1, rho, dim);
+        }
+        for (int control = 0; control < num_qubits - 1; ++control) {
+            dm_CNOT_gate(static_cast<UINT>(control), static_cast<UINT>(control + 1), rho, dim);
+        }
+    }
+}
+
+// Times the csim density-matrix circuit `reps` times and prints the median run time in
+// microseconds. The state is reset to |0...0> before each run, outside the timed region.
+static void bench_dm_circuit(int num_qubits, int num_layers, int reps) {
+    DensityMatrixCpu rho(num_qubits);
+    std::vector<double> samples(reps);
+
+    for (double& elapsed : samples) {
+        rho.set_zero_state();
+        const double start = now_sec();
+        run_dm_circuit_csim(rho, num_qubits, num_layers);
+        elapsed = now_sec() - start;
+    }
+
+    const double median_us = median(samples) * 1e6;
+    std::printf("dm_circ  %2dq %2dl  %12.3f us\n", num_qubits, num_layers, median_us);
+}
+
 // ---------------------------------------------------------------------------
 // Main
 // ---------------------------------------------------------------------------
@@ -269,19 +307,19 @@ int main() {
     int reps = 5;
 
     std::printf("=== Qulacs standalone benchmarks ===\n");
-    std::printf("\n-- Layered circuits (median of %d runs) --\n", reps);
+    std::printf("\n-- State vector: layered circuits (median of %d runs) --\n", reps);
 
-    int configs[][2] = {
+    int sv_configs[][2] = {
         {10, 20}, {14, 20}, {18, 20}, {20, 20}, {22, 10}, {24, 5}
     };
-    int n_configs = sizeof(configs) / sizeof(configs[0]);
+    int n_sv = sizeof(sv_configs) / sizeof(sv_configs[0]);
 
-    for (int i = 0; i < n_configs; i++) {
-        bench_circuit(configs[i][0], configs[i][1], reps, "gate_api", run_circuit_gate_api);
-        bench_circuit(configs[i][0], configs[i][1], reps, "csim", run_circuit_csim);
+    for (int i = 0; i < n_sv; i++) {
+        bench_circuit(sv_configs[i][0], sv_configs[i][1], reps, "gate_api", run_circuit_gate_api);
+        bench_circuit(sv_configs[i][0], sv_configs[i][1], reps, "csim", run_circuit_csim);
     }
 
-    std::printf("\n-- Individual gates at 18 qubits, 100 iters (median of %d runs) --\n", reps);
+    std::printf("\n-- State vector: individual gates at 18 qubits, 100 iters (median of %d runs) --\n", reps);
 
     bench_gate_h_api(18, 100, reps);
     bench_gate_h_csim(18, 100, reps);
@@ -295,5 +333,16 @@ int main() {
     bench_gate_rz_api(18, 100, reps);
     bench_gate_rz_csim(18, 100, reps);
 
+    std::printf("\n-- Density matrix: layered circuits (median of %d runs) --\n", reps);
+
+    int dm_configs[][2] = {
+        {6, 20}, {8, 20}, {10, 20}, {12, 10}, {13, 5}
+    };
+    int n_dm = sizeof(dm_configs) / sizeof(dm_configs[0]);
+
+    for (int i = 0; i < n_dm; i++) {
+        bench_dm_circuit(dm_configs[i][0], dm_configs[i][1], reps);
+    }
+
     return 0;
 }
diff --git a/scripts/native_bench/bench_qulacs_gpu.cpp b/scripts/native_bench/bench_qulacs_gpu.cpp
new file mode 100644
index 000000000..d6314dca5
--- /dev/null
+++ b/scripts/native_bench/bench_qulacs_gpu.cpp
@@ -0,0 +1,194 @@
+// Copyright 2026 The PECOS Developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under
+// the License.
+
+// Standalone Qulacs GPU benchmark using QuantumStateGpu and direct GPU kernels.
+// Compiled and linked against a CUDA-enabled CMake-built Qulacs library.
+
+#include <cstdio>
+#include <cstdlib>
+#include <chrono>
+#include <algorithm>
+#include <vector>
+
+#include "cppsim/state_gpu.hpp"
+#include "gpusim/update_ops_cuda.h"
+
+// ---------------------------------------------------------------------------
+// Timing helpers
+// ---------------------------------------------------------------------------
+
+static double now_sec() {  // steady_clock reading in seconds since that clock's epoch
+    using seconds_f64 = std::chrono::duration<double>;
+    return seconds_f64(std::chrono::steady_clock::now().time_since_epoch()).count();
+}
+
+// Median of vals; sorts vals in place. Callers must pass at least one element.
+static double median(std::vector<double>& vals) {
+    std::sort(vals.begin(), vals.end());
+    const size_t mid = vals.size() / 2;
+    return (vals.size() % 2 == 1) ? vals[mid] : (vals[mid - 1] + vals[mid]) / 2.0;
+}
+
+// ---------------------------------------------------------------------------
+// Circuit: layered H + RZ + CX (direct GPU kernels)
+// ---------------------------------------------------------------------------
+
+// Per layer: H then RZ on every qubit, followed by CNOT(q, q + 1) along the chain.
+static void run_circuit_gpu(QuantumStateGpu& state, int num_qubits, int num_layers) {
+    void* const amplitudes = state.data();
+    const ITYPE dim = state.dim;
+    void* const stream = state.get_cuda_stream();
+    const UINT device = state.device_number;
+    for (int layer = num_layers; layer > 0; --layer) {
+        for (int target = 0; target < num_qubits; ++target) {
+            H_gate_host(static_cast<UINT>(target), amplitudes, dim, stream, device);
+            // The angle is negated because Qulacs uses the opposite sign convention for rotations.
+            RZ_gate_host(static_cast<UINT>(target), -0.1, amplitudes, dim, stream, device);
+        }
+        for (int control = 0; control < num_qubits - 1; ++control) {
+            CNOT_gate_host(static_cast<UINT>(control), static_cast<UINT>(control + 1), amplitudes, dim, stream, device);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Layered circuit benchmark
+// ---------------------------------------------------------------------------
+
+// Times the layered GPU circuit `reps` times and prints the median run time in microseconds.
+// NOTE(review): assumes the *_host gate wrappers return only after the kernel finished -- confirm.
+static void bench_circuit(int num_qubits, int num_layers, int reps) {
+    QuantumStateGpu state(num_qubits);
+    std::vector<double> samples(reps);
+
+    for (double& elapsed : samples) {
+        state.set_zero_state();
+        const double start = now_sec();
+        run_circuit_gpu(state, num_qubits, num_layers);
+        elapsed = now_sec() - start;
+    }
+
+    const double median_us = median(samples) * 1e6;
+    std::printf("circuit  %2dq %2dl  %12.3f us\n", num_qubits, num_layers, median_us);
+}
+
+// ---------------------------------------------------------------------------
+// Individual gate benchmarks
+// ---------------------------------------------------------------------------
+
+// Times `iters` sweeps of H over every qubit, repeated `reps` times; prints the median in us.
+// The state is zeroed once up front and is not reset between repetitions.
+static void bench_gate_h(int num_qubits, int iters, int reps) {
+    QuantumStateGpu state(num_qubits);
+    state.set_zero_state();
+    void* const amplitudes = state.data();
+    const ITYPE dim = state.dim;
+    void* const stream = state.get_cuda_stream();
+    const UINT device = state.device_number;
+    std::vector<double> samples(reps);
+    for (double& elapsed : samples) {
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int target = 0; target < num_qubits; ++target)
+                H_gate_host(static_cast<UINT>(target), amplitudes, dim, stream, device);
+        elapsed = now_sec() - start;
+    }
+    std::printf("gate     H        %12.3f us\n", median(samples) * 1e6);
+}
+
+// Times `iters` sweeps of X over every qubit, repeated `reps` times; prints the median in us.
+// The state is zeroed once up front and is not reset between repetitions.
+static void bench_gate_x(int num_qubits, int iters, int reps) {
+    QuantumStateGpu state(num_qubits);
+    state.set_zero_state();
+    void* const amplitudes = state.data();
+    const ITYPE dim = state.dim;
+    void* const stream = state.get_cuda_stream();
+    const UINT device = state.device_number;
+    std::vector<double> samples(reps);
+    for (double& elapsed : samples) {
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int target = 0; target < num_qubits; ++target)
+                X_gate_host(static_cast<UINT>(target), amplitudes, dim, stream, device);
+        elapsed = now_sec() - start;
+    }
+    std::printf("gate     X        %12.3f us\n", median(samples) * 1e6);
+}
+
+// Times `iters` sweeps of CNOT(q, q + 1) along the chain, repeated `reps` times; prints the
+// median in us. The state is zeroed once up front and is not reset between repetitions.
+static void bench_gate_cx(int num_qubits, int iters, int reps) {
+    QuantumStateGpu state(num_qubits);
+    state.set_zero_state();
+    void* const amplitudes = state.data();
+    const ITYPE dim = state.dim;
+    void* const stream = state.get_cuda_stream();
+    const UINT device = state.device_number;
+    std::vector<double> samples(reps);
+    for (double& elapsed : samples) {
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int control = 0; control < num_qubits - 1; ++control)
+                CNOT_gate_host(static_cast<UINT>(control), static_cast<UINT>(control + 1), amplitudes, dim, stream, device);
+        elapsed = now_sec() - start;
+    }
+    std::printf("gate     CX       %12.3f us\n", median(samples) * 1e6);
+}
+
+// Times `iters` sweeps of RZ(-0.1) over every qubit, repeated `reps` times; prints the median
+// in us. The state is zeroed once up front and is not reset between repetitions.
+static void bench_gate_rz(int num_qubits, int iters, int reps) {
+    QuantumStateGpu state(num_qubits);
+    state.set_zero_state();
+    void* const amplitudes = state.data();
+    const ITYPE dim = state.dim;
+    void* const stream = state.get_cuda_stream();
+    const UINT device = state.device_number;
+    std::vector<double> samples(reps);
+    for (double& elapsed : samples) {
+        const double start = now_sec();
+        for (int sweep = 0; sweep < iters; ++sweep)
+            for (int target = 0; target < num_qubits; ++target)
+                RZ_gate_host(static_cast<UINT>(target), -0.1, amplitudes, dim, stream, device);
+        elapsed = now_sec() - start;
+    }
+    std::printf("gate     RZ       %12.3f us\n", median(samples) * 1e6);
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+// Runs the layered-circuit sweep, then the four single-gate benchmarks at 18 qubits.
+int main() {
+    const int reps = 5;
+    // {qubits, layers}; the two largest registers use fewer layers.
+    static const int kLayered[][2] = {
+        {10, 20}, {14, 20}, {18, 20}, {20, 20}, {22, 20}, {24, 10}, {26, 5}
+    };
+
+    std::printf("=== Qulacs GPU standalone benchmarks ===\n");
+    std::printf("\n-- Layered circuits (median of %d runs) --\n", reps);
+    for (const auto& cfg : kLayered) {
+        bench_circuit(cfg[0], cfg[1], reps);
+    }
+
+    std::printf("\n-- Individual gates at 18 qubits, 100 iters (median of %d runs) --\n", reps);
+    const int gate_qubits = 18;
+    const int gate_iters = 100;
+    bench_gate_h(gate_qubits, gate_iters, reps);
+    bench_gate_x(gate_qubits, gate_iters, reps);
+    bench_gate_cx(gate_qubits, gate_iters, reps);
+    bench_gate_rz(gate_qubits, gate_iters, reps);
+    return 0;
+}
diff --git a/scripts/native_bench/fetch_deps.sh b/scripts/native_bench/fetch_deps.sh
new file mode 100755
index 000000000..123ce8541
--- /dev/null
+++ b/scripts/native_bench/fetch_deps.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Copyright 2026 The PECOS Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+# except in compliance with the License. You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+
+# Download QuEST + Qulacs + Eigen + Boost source archives into ~/.pecos/deps/ for
+# standalone native benchmarking. These are comparison-only vendored sources --
+# PECOS does not link against them at runtime.
+
+set -euo pipefail
+
+DEPS_DIR="$HOME/.pecos/deps"
+mkdir -p "$DEPS_DIR"
+
+# One record per dependency: name | version-dir (created under $DEPS_DIR) | url | sha256 of the archive
+DEPS=(
+    "quest|quest-v4.2.0|https://github.com/QuEST-Kit/QuEST/archive/refs/tags/v4.2.0.tar.gz|2c812a7ec4d727e0947ffd0daf05452963c3f1c10e428c8bc30c35164921fcba"
+    "qulacs|qulacs-0.6.13|https://github.com/qulacs/qulacs/archive/v0.6.13.tar.gz|9ef25a988b9f483b97ea9501554a1ce5ee23ffaf89e7ca89969f0d03fcf94af0"
+    "eigen|eigen-3.4.0|https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz|8586084f71f9bde545ee7fa6d00288b264a2b7ac3607b974e54d13e7162c1c72"
+    "boost|boost-1.83.0|https://archives.boost.io/release/1.83.0/source/boost_1_83_0.tar.bz2|6478edfe2f3305127cffe8caf73ea0176c53769f4bf1585be237eb30798c3b8e"
+)
+
+AUTO_YES="${AUTO_YES:-0}"   # environment override; 1 suppresses the per-download prompt
+if [ "${1:-}" = "-y" ] || [ "${1:-}" = "--yes" ]; then   # only the first argument is inspected
+    AUTO_YES=1
+fi
+
+# Ask "<msg> [Y/n]" on stdin: succeeds on an empty reply or one starting with y/Y,
+# fails on anything else. Always succeeds without prompting when AUTO_YES is 1.
+prompt_yes() {
+    [ "$AUTO_YES" = "1" ] && return 0
+    local msg="$1" reply=""   # reply is local so the answer does not leak into the script's globals
+    read -r -p "$msg [Y/n] " reply
+    case "$reply" in
+        ""|[Yy]*) return 0 ;;
+        *) return 1 ;;
+    esac
+}
+
+# Fetch one archive, verify its sha256 and unpack it to $DEPS_DIR/<dir>.
+# Args: name, version-dir, url, sha256. An existing target or a declined prompt is a
+# no-op; returns 1 on a checksum mismatch or an unrecognised archive extension.
+fetch_one() {
+    local name="$1" dir="$2" url="$3" sha256="$4"
+    local target="$DEPS_DIR/$dir" tmpdir archive got inner
+    if [ -d "$target" ]; then
+        echo "[skip] $name already present at $target"
+        return
+    fi
+    if ! prompt_yes "Download $name ($url)?"; then
+        echo "[skip] $name (user declined)"
+        return
+    fi
+
+    # Stage beside $target so the final mv is a same-filesystem rename (no half-copied
+    # target on interruption); EXIT, unlike RETURN, also fires when `set -e` aborts.
+    tmpdir="$(mktemp -d "$DEPS_DIR/.fetch.XXXXXX")"
+    trap "rm -rf '$tmpdir'" EXIT
+    archive="$tmpdir/download"
+    echo "[fetch] $url"
+    curl -fL --retry 3 --retry-delay 2 -o "$archive" "$url"
+
+    # Fall back to `shasum -a 256` where sha256sum is not installed (e.g. stock macOS).
+    if command -v sha256sum >/dev/null 2>&1; then got="$(sha256sum "$archive")"
+    else got="$(shasum -a 256 "$archive")"; fi
+    got="${got%% *}"   # both tools print "<hash>  <file>"; keep only the hash
+    if [ "$got" != "$sha256" ]; then
+        printf 'ERROR: sha256 mismatch for %s\n  expected: %s\n  got:      %s\n' \
+            "$name" "$sha256" "$got" >&2
+        return 1
+    fi
+
+    echo "[extract] -> $target"
+    mkdir -p "$tmpdir/extract"
+    case "$url" in
+        *.tar.bz2) tar -xjf "$archive" -C "$tmpdir/extract" ;;
+        *.tar.gz|*.tgz) tar -xzf "$archive" -C "$tmpdir/extract" ;;
+        *) echo "ERROR: unknown archive type for $url" >&2; return 1 ;;
+    esac
+    inner="$(ls "$tmpdir/extract")"   # the archive's top-level directory
+    mv "$tmpdir/extract/$inner" "$target"
+    rm -rf "$tmpdir"   # success path: clean up now rather than waiting for the EXIT trap
+    echo "[done] $name at $target"
+}
+
+echo "Native bench dependencies will be fetched into $DEPS_DIR"
+echo ""
+
+for entry in "${DEPS[@]}"; do
+    IFS='|' read -r name dir url sha256 <<<"$entry"   # split the "name|dir|url|sha256" record
+    fetch_one "$name" "$dir" "$url" "$sha256"   # a non-zero return aborts the script under `set -e`
+done
+
+echo ""
+echo "Done."
diff --git a/scripts/native_bench/run.sh b/scripts/native_bench/run.sh
index 1ba2cc5c0..72ccf62e4 100755
--- a/scripts/native_bench/run.sh
+++ b/scripts/native_bench/run.sh
@@ -24,8 +24,8 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 DEPS_DIR="$HOME/.pecos/deps"
 BUILD_DIR="$SCRIPT_DIR/build"
 
-QUEST_SRC="$DEPS_DIR/quest-v4.1.0"
-QULACS_SRC="$DEPS_DIR/qulacs-0.6.12"
+QUEST_SRC="$DEPS_DIR/quest-v4.2.0"
+QULACS_SRC="$DEPS_DIR/qulacs-0.6.13"
 
 # ---------------------------------------------------------------------------
 # Check sources exist
@@ -43,7 +43,7 @@ fi
 if [ "$missing" -eq 1 ]; then
     echo ""
     echo "Run the following to download the dependencies:"
-    echo "  cargo build -p pecos-quest -p pecos-qulacs"
+    echo "  scripts/native_bench/fetch_deps.sh"
     exit 1
 fi
 
@@ -84,7 +84,7 @@ mkdir -p "$QULACS_BUILD"
 BOOST_DIR="$DEPS_DIR/boost-1.83.0"
 if [ ! -d "$BOOST_DIR" ]; then
     echo "ERROR: Boost not found at $BOOST_DIR"
-    echo "Run: cargo build -p pecos-qulacs"
+    echo "Run: scripts/native_bench/fetch_deps.sh"
     exit 1
 fi
 
@@ -161,6 +161,17 @@ c++ -O3 -march=native -std=c++14 \
 echo "Compiled."
 echo ""
 
+# ---------------------------------------------------------------------------
+# Build PECOS standalone benchmark (pure Rust, same timing as C benchmarks)
+# ---------------------------------------------------------------------------
+
+echo "--- Building PECOS standalone benchmark (Rust, Release, -C target-cpu=native) ---"
+PECOS_BENCH_DIR="$SCRIPT_DIR/bench_pecos"
+(cd "$PECOS_BENCH_DIR" && RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native" cargo build --release 2>&1 | tail -3)
+PECOS_BIN="$PECOS_BENCH_DIR/target/release/bench_pecos"
+echo "PECOS benchmark built."
+echo ""
+
 # ---------------------------------------------------------------------------
 # Run standalone benchmarks
 # ---------------------------------------------------------------------------
@@ -173,21 +184,12 @@ echo "--- Running Qulacs benchmark ---"
 "$BUILD_DIR/bench_qulacs" | tee "$BUILD_DIR/qulacs_results.txt"
 echo ""
 
-# ---------------------------------------------------------------------------
-# Run PECOS Rust criterion benchmarks
-# ---------------------------------------------------------------------------
-
-echo "--- Running PECOS criterion benchmarks (--quick mode) ---"
-cd "$REPO_ROOT"
-
-# Capture criterion output; --quick runs minimal iterations for fast comparison
-CRITERION_OUT="$BUILD_DIR/criterion_output.txt"
-cargo bench -p benchmarks --profile native --bench benchmarks \
-    --features quest,qulacs -- "Native" --quick 2>&1 | tee "$CRITERION_OUT"
+echo "--- Running PECOS benchmark ---"
+"$PECOS_BIN" | tee "$BUILD_DIR/pecos_results.txt"
 echo ""
 
 # ---------------------------------------------------------------------------
-# Parse criterion results and print comparison table
+# Comparison summary
 # ---------------------------------------------------------------------------
 
 echo "============================================================"
@@ -200,9 +202,8 @@ echo ""
 echo "Qulacs standalone results:"
 cat "$BUILD_DIR/qulacs_results.txt"
 echo ""
-echo "PECOS criterion results (see above for full output):"
-# Extract timing lines from criterion output
-grep -E "time:.*\[" "$CRITERION_OUT" 2>/dev/null || echo "(parse criterion output above for timings)"
+echo "PECOS standalone results:"
+cat "$BUILD_DIR/pecos_results.txt"
 echo ""
 echo "============================================================"
 echo "Done. Full outputs saved in: $BUILD_DIR/"
diff --git a/scripts/native_bench/run_gpu.sh b/scripts/native_bench/run_gpu.sh
new file mode 100755
index 000000000..3f743e979
--- /dev/null
+++ b/scripts/native_bench/run_gpu.sh
@@ -0,0 +1,272 @@
+#!/usr/bin/env bash
+# Copyright 2026 The PECOS Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# Standalone GPU benchmark: PECOS (wgpu + cuQuantum) vs QuEST CUDA vs Qulacs GPU
+#
+# Builds QuEST with CUDA and Qulacs with GPU support from source,
+# compiles standalone benchmark programs, runs them, and compares
+# against PECOS Rust GPU simulators.
+
+set -euo pipefail   # abort on command failure, unset variables, and failed pipeline stages
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # absolute path of this script's directory
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"   # NOTE(review): not referenced again in this script
+DEPS_DIR="$HOME/.pecos/deps"   # same directory fetch_deps.sh downloads into
+BUILD_DIR="$SCRIPT_DIR/build"   # build trees, benchmark binaries and result files all go here
+
+QUEST_SRC="$DEPS_DIR/quest-v4.2.0"   # directory names match the version-dir column in fetch_deps.sh
+QULACS_SRC="$DEPS_DIR/qulacs-0.6.13"
+
+# ---------------------------------------------------------------------------
+# Find CUDA: a caller-supplied CUDA_PATH is tried first, then the default locations
+# ---------------------------------------------------------------------------
+
+CUDA_PATH_HINT="${CUDA_PATH:-}"   # capture the caller's override before CUDA_PATH is reset
+CUDA_PATH=""
+for candidate in "$CUDA_PATH_HINT" /usr/local/cuda "$HOME/.pecos/deps/cuda"; do
+    [ -n "$candidate" ] && [ -x "$candidate/bin/nvcc" ] || continue   # skip empty / nvcc-less entries
+    CUDA_PATH="$candidate"
+    break
+done
+
+if [ -z "$CUDA_PATH" ]; then
+    echo "ERROR: CUDA not found. Install CUDA or set CUDA_PATH." >&2
+    exit 1
+fi
+
+echo "Using CUDA at: $CUDA_PATH"
+export PATH="$CUDA_PATH/bin:$PATH"   # make the selected nvcc the one found by later commands
+echo "nvcc version: $(nvcc --version | tail -1)"
+echo ""
+
+# ---------------------------------------------------------------------------
+# Check sources exist
+# ---------------------------------------------------------------------------
+
+missing=0   # becomes 1 if either source tree is absent, so both problems are reported before exiting
+if [ ! -d "$QUEST_SRC" ]; then
+    echo "ERROR: QuEST sources not found at $QUEST_SRC"
+    missing=1
+fi
+if [ ! -d "$QULACS_SRC" ]; then
+    echo "ERROR: Qulacs sources not found at $QULACS_SRC"
+    missing=1
+fi
+if [ "$missing" -eq 1 ]; then   # point the user at the download script, then stop
+    echo ""
+    echo "Run the following to download the dependencies:"
+    echo "  scripts/native_bench/fetch_deps.sh"
+    exit 1
+fi
+
+echo "=== GPU Benchmark: PECOS vs QuEST vs Qulacs ==="
+echo ""
+
+# ---------------------------------------------------------------------------
+# Build QuEST with CUDA (GPU architecture: $CUDAARCHS if set, otherwise 89)
+# ---------------------------------------------------------------------------
+
+echo "--- Building QuEST (CMake, Release, CUDA) ---"
+QUEST_GPU_BUILD="$BUILD_DIR/quest_gpu"
+mkdir -p "$QUEST_GPU_BUILD"   # out-of-source build tree; only the last 5 output lines are shown below
+cmake -S "$QUEST_SRC" -B "$QUEST_GPU_BUILD" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_FLAGS="-march=native" \
+    -DCMAKE_CXX_FLAGS="-march=native" \
+    -DCMAKE_CUDA_FLAGS="-O3" \
+    -DENABLE_MULTITHREADING=OFF \
+    -DENABLE_CUDA=ON \
+    -DENABLE_HIP=OFF \
+    -DENABLE_DISTRIBUTION=OFF \
+    -DBUILD_SHARED_LIBS=OFF \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DCMAKE_CUDA_ARCHITECTURES="${CUDAARCHS:-89}" \
+    2>&1 | tail -5
+cmake --build "$QUEST_GPU_BUILD" -j "$(nproc)" 2>&1 | tail -5
+echo "QuEST (CUDA) built."
+echo ""
+
+# ---------------------------------------------------------------------------
+# Build Qulacs with GPU (GPU architecture: $CUDAARCHS if set, otherwise 89)
+# ---------------------------------------------------------------------------
+
+echo "--- Building Qulacs (CMake, Release, CUDA) ---"
+QULACS_GPU_BUILD="$BUILD_DIR/qulacs_gpu"
+mkdir -p "$QULACS_GPU_BUILD"
+
+BOOST_DIR="$DEPS_DIR/boost-1.83.0"   # passed to CMake as Boost_INCLUDE_DIR and later to nvcc via -I
+if [ ! -d "$BOOST_DIR" ]; then
+    echo "ERROR: Boost not found at $BOOST_DIR"
+    echo "Run: scripts/native_bench/fetch_deps.sh"
+    exit 1
+fi
+
+cmake -S "$QULACS_SRC" -B "$QULACS_GPU_BUILD" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_FLAGS="-march=native" \
+    -DCMAKE_CXX_FLAGS="-march=native -DEIGEN_NO_DEBUG" \
+    -DCMAKE_CUDA_FLAGS="-O3" \
+    -DBoost_INCLUDE_DIR="$BOOST_DIR" \
+    -DUSE_OMP=OFF \
+    -DUSE_GPU=Yes \
+    -DUSE_MPI=OFF \
+    -DUSE_TEST=OFF \
+    -DUSE_PYTHON=OFF \
+    -DCMAKE_CUDA_ARCHITECTURES="${CUDAARCHS:-89}" \
+    2>&1 | tail -5
+cmake --build "$QULACS_GPU_BUILD" -j "$(nproc)" --target csim_static cppsim_static gpusim_static 2>&1 | tail -5
+echo "Qulacs (CUDA) built."
+echo ""
+
+# ---------------------------------------------------------------------------
+# Locate built libraries
+# ---------------------------------------------------------------------------
+
+# QuEST CUDA: take the first libQuEST.a found anywhere under the QuEST GPU build tree
+QUEST_GPU_LIB="$(find "$QUEST_GPU_BUILD" -name 'libQuEST.a' | head -1)"
+if [ -z "$QUEST_GPU_LIB" ]; then
+    echo "ERROR: Could not find libQuEST.a in $QUEST_GPU_BUILD"
+    exit 1
+fi
+QUEST_GPU_LIB_DIR="$(dirname "$QUEST_GPU_LIB")"   # used as the -L directory when linking bench_quest_gpu
+
+QUEST_INC_GEN="$QUEST_GPU_BUILD/include"   # include directory inside the build tree
+QUEST_INC_SRC="$QUEST_SRC/quest/include"
+QUEST_INC_ROOT="$QUEST_SRC"
+
+# Qulacs GPU: CMake puts archives at ${PROJECT_BINARY_DIR}/../lib (i.e. build/lib/)
+# NOTE(review): the search root is all of $BUILD_DIR, so archives from any other build under it also match.
+QULACS_GPU_CSIM="$(find "$BUILD_DIR" -name 'libcsim_static.a' | head -1)"
+QULACS_GPU_CPPSIM="$(find "$BUILD_DIR" -name 'libcppsim_static.a' | head -1)"
+QULACS_GPU_GPUSIM="$(find "$BUILD_DIR" -name 'libgpusim_static.a' | head -1)"
+if [ -z "$QULACS_GPU_CSIM" ] || [ -z "$QULACS_GPU_CPPSIM" ] || [ -z "$QULACS_GPU_GPUSIM" ]; then
+    echo "ERROR: Could not find Qulacs GPU static libraries"
+    echo "  csim: $QULACS_GPU_CSIM"
+    echo "  cppsim: $QULACS_GPU_CPPSIM"
+    echo "  gpusim: $QULACS_GPU_GPUSIM"
+    exit 1
+fi
+
+QULACS_INC="$QULACS_SRC/src"
+QULACS_CPPSIM_INC="$QULACS_SRC/include"
+QULACS_EIGEN_INC="$QULACS_SRC/include"   # prefer an Eigen directory inside the Qulacs tree
+if [ ! -d "$QULACS_EIGEN_INC/Eigen" ]; then   # otherwise use the standalone Eigen under $DEPS_DIR
+    QULACS_EIGEN_INC="$DEPS_DIR/eigen-3.4.0"
+fi
+
+# ---------------------------------------------------------------------------
+# Compile GPU benchmark programs (nvcc drives both compilation and linking)
+# ---------------------------------------------------------------------------
+
+echo "--- Compiling bench_quest (CUDA) ---"
+# QuEST CUDA needs nvcc for linking since libQuEST.a contains CUDA objects
+nvcc -O3 -std=c++14 \
+    -I"$QUEST_INC_GEN" -I"$QUEST_INC_SRC" -I"$QUEST_INC_ROOT" \
+    -Xcompiler "-march=native" \
+    "$SCRIPT_DIR/bench_quest.c" \
+    -L"$QUEST_GPU_LIB_DIR" -lQuEST \
+    -lcudart -lcurand \
+    -lstdc++ -lm \
+    -o "$BUILD_DIR/bench_quest_gpu"
+echo "Compiled."
+
+echo "--- Compiling bench_qulacs_gpu ---"
+nvcc -O3 -std=c++14 \
+    -I"$QULACS_INC" -I"$QULACS_CPPSIM_INC" -I"$QULACS_EIGEN_INC" -I"$BOOST_DIR" \
+    -Xcompiler "-march=native" \
+    -D_USE_GPU -DEIGEN_NO_DEBUG \
+    "$SCRIPT_DIR/bench_qulacs_gpu.cpp" \
+    "$QULACS_GPU_CPPSIM" "$QULACS_GPU_GPUSIM" "$QULACS_GPU_CSIM" \
+    -lcudart -lcurand -lcublas \
+    -lm \
+    -o "$BUILD_DIR/bench_qulacs_gpu"
+echo "Compiled."
+
+# cuStateVec benchmark (optional): `|| true` stops `set -e` + pipefail aborting when no cuquantum-* dir exists
+CUQUANTUM_DIR="$(ls -d "$DEPS_DIR"/cuquantum-* 2>/dev/null | sort -V | tail -1 || true)"
+if [ -n "$CUQUANTUM_DIR" ] && [ -d "$CUQUANTUM_DIR" ]; then   # highest version-sorted match wins
+    echo "--- Compiling bench_custatevec ---"
+    nvcc -O3 -std=c++14 \
+        -I"$CUQUANTUM_DIR/include" \
+        -Xcompiler "-march=native" \
+        "$SCRIPT_DIR/bench_custatevec.cu" \
+        -L"$CUQUANTUM_DIR/lib" -lcustatevec \
+        -lcudart \
+        -Xlinker "-rpath,$CUQUANTUM_DIR/lib" \
+        -o "$BUILD_DIR/bench_custatevec"
+    echo "Compiled."
+    HAS_CUSTATEVEC=1
+else
+    echo "--- Skipping bench_custatevec (cuQuantum not found) ---"
+    HAS_CUSTATEVEC=0   # the run and summary sections test this flag
+fi
+echo ""
+
+# ---------------------------------------------------------------------------
+# Build PECOS GPU benchmark (Rust, wgpu + cuQuantum)
+# ---------------------------------------------------------------------------
+
+echo "--- Building PECOS GPU benchmark (Rust, wgpu + cuQuantum) ---"
+PECOS_BENCH_DIR="$SCRIPT_DIR/bench_pecos"   # NOTE(review): cuquantum feature is requested even when HAS_CUSTATEVEC=0 - confirm this builds
+(cd "$PECOS_BENCH_DIR" && RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native" \
+    cargo build --release --features gpu,cuquantum 2>&1 | tail -5)
+PECOS_BIN="$PECOS_BENCH_DIR/target/release/bench_pecos"   # binary path inside the crate-local target directory
+echo "PECOS GPU benchmark built."
+echo ""
+
+# ---------------------------------------------------------------------------
+# Run GPU benchmarks
+# ---------------------------------------------------------------------------
+
+echo "--- Running QuEST CUDA benchmark ---"
+"$BUILD_DIR/bench_quest_gpu" | tee "$BUILD_DIR/quest_gpu_results.txt"   # tee: show live and keep a copy for the summary
+echo ""
+
+echo "--- Running Qulacs GPU benchmark ---"
+"$BUILD_DIR/bench_qulacs_gpu" | tee "$BUILD_DIR/qulacs_gpu_results.txt"
+echo ""
+
+if [ "${HAS_CUSTATEVEC:-0}" -eq 1 ]; then   # only when bench_custatevec was compiled
+    echo "--- Running cuStateVec benchmark ---"
+    LD_LIBRARY_PATH="$CUQUANTUM_DIR/lib:${LD_LIBRARY_PATH:-}" \
+        "$BUILD_DIR/bench_custatevec" | tee "$BUILD_DIR/custatevec_results.txt"
+    echo ""
+fi
+
+echo "--- Running PECOS GPU benchmark ---"
+"$PECOS_BIN" | tee "$BUILD_DIR/pecos_gpu_results.txt"   # with pipefail, a failing benchmark aborts the script
+echo ""
+
+# ---------------------------------------------------------------------------
+# Comparison summary
+# ---------------------------------------------------------------------------
+
+echo "============================================================"
+echo "                  GPU COMPARISON SUMMARY"
+echo "============================================================"
+echo ""
+echo "QuEST CUDA results:"
+cat "$BUILD_DIR/quest_gpu_results.txt"   # replay the output captured by tee during the run
+echo ""
+echo "Qulacs GPU results:"
+cat "$BUILD_DIR/qulacs_gpu_results.txt"
+echo ""
+if [ "${HAS_CUSTATEVEC:-0}" -eq 1 ]; then   # results file exists only if the cuStateVec benchmark ran
+    echo "cuStateVec standalone results:"
+    cat "$BUILD_DIR/custatevec_results.txt"
+    echo ""
+fi
+echo "PECOS GPU results:"
+cat "$BUILD_DIR/pecos_gpu_results.txt"
+echo ""
+echo "============================================================"
+echo "Done. Full outputs saved in: $BUILD_DIR/"
diff --git a/uv.lock b/uv.lock
index cd194b3ec..23cac09c6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2,10 +2,14 @@ version = 1
 revision = 3
 requires-python = ">=3.10"
 resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version >= '3.12' and python_full_version < '3.14'",
-    "python_full_version == '3.11.*'",
-    "python_full_version < '3.11'",
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
 ]
 
 [manifest]
@@ -480,7 +484,8 @@ name = "contourpy"
 version = "1.3.2"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version < '3.11'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -550,9 +555,12 @@ name = "contourpy"
 version = "1.3.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version >= '3.12' and python_full_version < '3.14'",
-    "python_full_version == '3.11.*'",
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -780,38 +788,39 @@ wheels = [
 
 [[package]]
 name = "cuda-core"
-version = "0.6.0"
+version = "0.7.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "cuda-pathfinder", marker = "python_full_version >= '3.11'" },
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9d/a1/5a68e2658c9d4c2842301aca5b117b58065be673e64180edb0afdeac7ec2/cuda_core-0.6.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de9e90bba4bbd3ee4aefeaccca5f32a34926b2cfb6d7f8ee4e8a93ed34bc147c", size = 20578385, upload-time = "2026-02-23T18:59:17.324Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/c7/599f89644e3100aba9b5312ad406e9a22030f202b949ed1d55d5d459f2d8/cuda_core-0.6.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:230373596e0150cf85f2b48536f946bc3abc76bb96ba61ff47d538f42b5d6784", size = 20739096, upload-time = "2026-02-23T18:59:19.799Z" },
-    { url = "https://files.pythonhosted.org/packages/53/35/a3f7a8fd04338db7848511059341e0eb1910d5017c108647e1b4fcd08b00/cuda_core-0.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:685a8bf1267d480300969e3b8b76f3741b52080c015a7e6354d20398c2d4d2c8", size = 3064369, upload-time = "2026-02-23T18:59:22.311Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/69/8361fa2873fdc86d298a01f70ca3ea4a13f59711e75312dd0ce3d411c05f/cuda_core-0.6.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70c3cd2ae0fa82cd6681be636051b247bcd4c4c3249c35bd982034cefb5adca3", size = 21597027, upload-time = "2026-02-23T18:59:24.216Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/62/ed3039d866879872099fc855f8ad8b5e2ae9010b5e30d702fde3d66f23df/cuda_core-0.6.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:07df8dd46494bd53943759232051facd4372104f2997732e0d39c5bc12a616d5", size = 21790662, upload-time = "2026-02-23T18:59:27.064Z" },
-    { url = "https://files.pythonhosted.org/packages/40/62/09e4be962deec9f54da01edf9c069f3963b4c475a79b2a9737e3c3c939b9/cuda_core-0.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb407a2825693bb603b7c4389f5646092e5b1ff2aa6fb9b455326740238371d9", size = 3067205, upload-time = "2026-02-23T18:59:29.703Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/f9/6501286dfc636ab529d3981d346f70326b8b2841e3239c9c9e4ed84df578/cuda_core-0.6.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e10e976c8bcda7d2a6ff6337eaff4d1b771d89d56c2da3c8d785f3e3998e6cf9", size = 21538144, upload-time = "2026-02-23T18:59:31.932Z" },
-    { url = "https://files.pythonhosted.org/packages/71/16/5346a77931edd1c822bedc176c8a85360748b9f1cd4f7b3a08abcf79a557/cuda_core-0.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:19b75b36c789dd7794491a8d79f776c81e849bd7900d5a5fabed65cbabc63978", size = 21876857, upload-time = "2026-02-23T18:59:34.291Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/bc/14699c04dbcd3f9c97b0adfbec6aeda480f763510b528173d1d2deff05ef/cuda_core-0.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:e296def768e4bbe47c8f1607efd98a496bb8dbe1de70d064652e4f955fa62621", size = 3025927, upload-time = "2026-02-23T18:59:37.434Z" },
-    { url = "https://files.pythonhosted.org/packages/88/d4/7a6a3cb92b58b135157469d17298c8cc6929c6bc34a4e89eb99bef8cf41e/cuda_core-0.6.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:315ee1afaecb8e360ebd80569aad963f9f22b7e7e4745049cac187fd5f13cfac", size = 21152685, upload-time = "2026-02-23T18:59:39.539Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/1d/dca2f93578fa0925e7d5b90e2bafe7b5de3201a8f059b0b2679e374a0848/cuda_core-0.6.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a030cd81cbc625ed747d0b3678d2159e65e7c71ad1c62480ff05e07e5e05d5ab", size = 21509267, upload-time = "2026-02-23T18:59:41.832Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/bd/583befd4846331dc645a52080bdf1b3c2912377295500ae3809d9fd0f099/cuda_core-0.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:2992b4f23d57816ba3c4ee0b49a8547ff89dfccd2e3efd2dba23f965e98b3b4f", size = 3013832, upload-time = "2026-02-23T18:59:43.849Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/55/7cb18fddf6cf358915b15a1fa4bd402780101ec3c16cdec7473fafa027bc/cuda_core-0.6.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9168b5b75373b65fc882d1bab84090d90612875dfaf2c6aa6ad9edb0e6a7a92", size = 21152867, upload-time = "2026-02-23T18:59:46.011Z" },
-    { url = "https://files.pythonhosted.org/packages/25/b0/e417852c56db7a08803064d16f05a7026b28949789549adb5783c9ae9fa5/cuda_core-0.6.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85768eff013adf2258660dd50677aa35e032b9203f8b4fa7b42065501fa999ed", size = 21343024, upload-time = "2026-02-23T18:59:48.933Z" },
-    { url = "https://files.pythonhosted.org/packages/25/bb/9c7e66449a3c313833e3595553f4bb4349bf9c22253d5f237c342cdf2cb6/cuda_core-0.6.0-cp314-cp314-win_amd64.whl", hash = "sha256:7405ce6c2785112c5027ec43bf1dde6d9d14abaa3796b117c24b322a466ea423", size = 3090371, upload-time = "2026-02-23T18:59:51.003Z" },
-    { url = "https://files.pythonhosted.org/packages/53/84/c4b41518ff96cf465d7b0f402bd237f04dec2c7570869ee16f52f6861f70/cuda_core-0.6.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03fab5962ee77d3e031a002d10c2caf205910348be0e4c11ac626a61dadf6160", size = 22518958, upload-time = "2026-02-23T18:59:52.719Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/cf/612a4a0281a0af63b0a6a3441c0e0587122498b723b4e3d8a002e626fddc/cuda_core-0.6.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d083c34116699de175162f82e60d9dcb1ad7290aa1019fb9d37c27849070894", size = 22146903, upload-time = "2026-02-23T18:59:55.213Z" },
-    { url = "https://files.pythonhosted.org/packages/88/34/c2ee985bc1ebf86c49f480c90d73dbed798afbebd7efbb8027a4a14f9727/cuda_core-0.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:adfb4dac60f295e49a713a83e67b87a199ec4ae0f432e65814dd6c86e9d35421", size = 3606321, upload-time = "2026-02-23T18:59:57.299Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/b2/c9bb66a400baae14d5cd25cca9e7cf69f9328543edb37efa6e7ba1088cc7/cuda_core-0.7.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e8feaf99613cd1d025b26374b1c92ff72d9e8078a3236fc78aa550ff3b0028e3", size = 28962792, upload-time = "2026-04-08T17:03:00.541Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/9c/9b06b4f190f13ab60d85ce5eb2b4c56b3a1c42bf7c9aa84311cea962a2ee/cuda_core-0.7.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c8cf0d73c28761b63d32fbdaae8a4d2ce0f78e8dc6d376830daf1452d8e49b6d", size = 29208466, upload-time = "2026-04-08T17:03:03.94Z" },
+    { url = "https://files.pythonhosted.org/packages/13/cb/759d412d23bbbe3187d2a54b2c63b894cdc5d390bb1ac948d0e438073c4a/cuda_core-0.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:83a8b06c0ba2703f627ebb3c24f9cf8863a70445b96ac9b775d0ba2b64884628", size = 4147647, upload-time = "2026-04-08T17:03:06.77Z" },
+    { url = "https://files.pythonhosted.org/packages/59/83/03139c7d9c0425ec4824d6269cfd9e1ac8ae1cc88f12540578a405113083/cuda_core-0.7.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69946d1d5e2d96fc65b7bb36164d26c328cca75d9482b74ee7d61b2c1e1d33a7", size = 30374690, upload-time = "2026-04-08T17:03:08.962Z" },
+    { url = "https://files.pythonhosted.org/packages/97/56/c3a08515e1805370775ce088d1654f289e8a82e7c64604a34a277efe563b/cuda_core-0.7.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7dd6bed3075de55f10ef2dcd2de9f5f3bbd2b03d4650e62ae38343f68f6fa974", size = 30662935, upload-time = "2026-04-08T17:03:12.416Z" },
+    { url = "https://files.pythonhosted.org/packages/02/58/0f30ce64c5b2d6c6d0dcf3cc0b987c80512303762d66e18a73e5804d868e/cuda_core-0.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:c580a87ff824d7949883675484d04e60a23f9f26004da9ab39c88577a9271a7f", size = 4149685, upload-time = "2026-04-08T17:03:15.282Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/66/15fb7128617e8086f93782c3a80f380c44a3ca87e35102f49fba9226917d/cuda_core-0.7.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133f996d7dabe61624dab10c5ef56bb5217270fe53ca4ab26edf8ee14c67df8e", size = 30344605, upload-time = "2026-04-08T17:03:17.476Z" },
+    { url = "https://files.pythonhosted.org/packages/af/c6/a4eac2e7d4089e8d4d0060de36934199c83e2a1ea7cbc0b6084acfc05cfc/cuda_core-0.7.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6377276918cdb3fad5ace36ebc2806718435782c6ed739e91ea5d06e3fca8f05", size = 30850057, upload-time = "2026-04-08T17:03:20.709Z" },
+    { url = "https://files.pythonhosted.org/packages/58/2c/334519b2fc375a937ab81970f2c4afd2a06b8076cb1d0d86784137f5206e/cuda_core-0.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:2ecfb8d83a72505ef4c241079d2e8935adabe8330de9f2c7c9135e7116b1dcc3", size = 4094353, upload-time = "2026-04-08T17:03:23.896Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/98/ff82ac290e93c771639fd73ba9b37937a97f028169f3e8c121fc258eaca7/cuda_core-0.7.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f25a3042a73dcaa8046a7fa3b0ba9b3de15a39a05b77483dc0a8281bd182716e", size = 29873257, upload-time = "2026-04-08T17:03:26.138Z" },
+    { url = "https://files.pythonhosted.org/packages/61/21/99169dc3aa66d8fc3eaae7b69fbeaa57a672a71586364069211b7e57e08c/cuda_core-0.7.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:52d11f599ec5af622da0b7cf28506978e382aa614f8552edaddbf21bcda6c7a6", size = 30368143, upload-time = "2026-04-08T17:03:29.331Z" },
+    { url = "https://files.pythonhosted.org/packages/67/23/0ae61d9e0c78208e97c9b2b274026dead3a46034a3db24ec4568e3cda1d7/cuda_core-0.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:b4dd7c2b2d9f95acbffc9df62bd52d4bcdab72b7780fc3bd7e691e1e0cc1f071", size = 4076762, upload-time = "2026-04-08T17:03:32.059Z" },
+    { url = "https://files.pythonhosted.org/packages/89/87/f42a4b9cac013e8db14070bf5359247e3695ab1c310280333d65db247451/cuda_core-0.7.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c50c18363cb132e2925cad1f3843471c6699b8d6b3df6b8d21a3d0f0e3cf528e", size = 29888711, upload-time = "2026-04-08T17:03:34.439Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/ad/2050bea962f4bded68d3367b093c9ced51db87633fd3a2681b179d6090a7/cuda_core-0.7.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9a76af173fb686bc7579230699b011d825320e640b6963776b305a94b584c0c1", size = 30150682, upload-time = "2026-04-08T17:03:37.909Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/ed/4735a1d0832bc51f891aa3665a6ff2d11e5089e68c576705fbfb69bbd75c/cuda_core-0.7.0-cp314-cp314-win_amd64.whl", hash = "sha256:0cb4c757cc068dfd8e83b8a8fe56ffb357e12753112bc648b53e3244d7a6e63f", size = 4180956, upload-time = "2026-04-08T17:03:40.4Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/0a/e5d8eab0b0f8a7df7ba51982f0a4dcf33e79f96c0b1f867aa7379b6b2ace/cuda_core-0.7.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2301155335c9b8d50e642f49fb2c1dbf250c33a243bf05e01d54f52363b2a5aa", size = 31511256, upload-time = "2026-04-08T17:03:43.018Z" },
+    { url = "https://files.pythonhosted.org/packages/31/f1/fc7c37a3ff229d864ce0b57a1254aaa9cd7af674ed390bef5bb3b030bd23/cuda_core-0.7.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6ae3250b756f262f70bf435b9f505435f8739698884a15db482c46bf714b0de2", size = 31023677, upload-time = "2026-04-08T17:03:46.055Z" },
+    { url = "https://files.pythonhosted.org/packages/50/46/4107da6334f9f32c3c04a0b2925c78f06bfc36b0872682370234fb11a122/cuda_core-0.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:57f2391ec036185a9bf189effda3d538a6e78d97e220c12254f96ebc1db7a009", size = 4879151, upload-time = "2026-04-08T17:03:49.052Z" },
 ]
 
 [[package]]
 name = "cuda-pathfinder"
-version = "1.5.1"
+version = "1.5.2"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c4/74/8c66861b873d8eed51fde56d3091baa4906a56f0d4390cae991f2d41dda5/cuda_pathfinder-1.5.1-py3-none-any.whl", hash = "sha256:b3718097fb57cf9e8a904dd072d806f2c9a27627e35c020b06ab9454bcec08c0", size = 49861, upload-time = "2026-04-03T16:41:22.203Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa", size = 49988, upload-time = "2026-04-06T23:01:05.17Z" },
 ]
 
 [[package]]
@@ -1128,38 +1137,34 @@ wheels = [
 
 [[package]]
 name = "guppylang"
-version = "0.21.11"
+version = "0.21.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "guppylang-internals" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "pytket" },
     { name = "selene-hugr-qis-compiler" },
     { name = "selene-sim" },
     { name = "tqdm" },
     { name = "types-tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fe/6d/dcfebfca39fc8fce2f5b27fc2f411ebfcd18e4509959215ac6d6a38afb5f/guppylang-0.21.11.tar.gz", hash = "sha256:5ff823484c9e8cc2a9c13279be0aec2dc68f3aa725bd9f125d912a393983747e", size = 68353, upload-time = "2026-04-01T13:23:19.721Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0d/45/0c4cb644d10682fdf9de492911ef55ddc571469270487058655fe9e233b4/guppylang-0.21.6.tar.gz", hash = "sha256:57cdfa0c2fe7ffd80c193c830505285e0a8c779a6f817016cb8e295eaa1cae19", size = 60477, upload-time = "2025-10-30T17:37:21.435Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/48/73/7e6e9567d600cdb181273216ba82ab9b8b279da92c969685506b83181af9/guppylang-0.21.11-py3-none-any.whl", hash = "sha256:b00e8f1be52c846c349c576c4d264771fca01fe5f2f0a6a01434f92d4b7350c4", size = 65701, upload-time = "2026-04-01T13:23:18.265Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/81/3c0fffa0c28bd2bcaf37dd053d50017f8afc3794de13d9e60b7f5b9a5dab/guppylang-0.21.6-py3-none-any.whl", hash = "sha256:ecc03bf0c1f2d1146a3fdcc2e75a9a775eb11fd37b128a1febf319ab6b709619", size = 58427, upload-time = "2025-10-30T17:37:20.212Z" },
 ]
 
 [[package]]
 name = "guppylang-internals"
-version = "0.32.0"
+version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "hugr" },
-    { name = "pytket" },
-    { name = "tket" },
     { name = "tket-exts" },
     { name = "typing-extensions" },
-    { name = "wasmtime" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/58/99/ee9e0475e0597c06bd8a6a05ceb5b0a3dfb52c1830a84fd322d541db5ada/guppylang_internals-0.32.0.tar.gz", hash = "sha256:ecd074ba42903558c381d0e62891e7e749932a62003d56bda5678c06959f818e", size = 207836, upload-time = "2026-04-01T12:45:18.605Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/56/d1/1d71187602922cc70a4880137d65ea742a3f3a37be9519e23c4c5f28bc14/guppylang_internals-0.25.0.tar.gz", hash = "sha256:abbcbccb44f3404adff52046907c8c1ee993b8d779fcbef259b7bd8840c450d1", size = 180215, upload-time = "2025-10-29T15:13:20.78Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/19/52/3196a4b254d58d66c56235ff4194e519e603927753ac6a6d13179cadf59e/guppylang_internals-0.32.0-py3-none-any.whl", hash = "sha256:8d711e4a60c28b726b92ffbb383ef7f836ac2a8135838b5ad9f49f6e4e840962", size = 260494, upload-time = "2026-04-01T12:45:16.369Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/d1/48129590b4eb5e99deff8e3ddf3a039a66d96523dfb68592a6267b779b30/guppylang_internals-0.25.0-py3-none-any.whl", hash = "sha256:95473e5ef93329fc6a764fce82e0ef3931b691407a10a21520c522080405565b", size = 233971, upload-time = "2025-10-29T15:13:19.214Z" },
 ]
 
 [[package]]
@@ -1201,45 +1206,48 @@ wheels = [
 
 [[package]]
 name = "hugr"
-version = "0.15.5"
+version = "0.14.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "graphviz" },
     { name = "pydantic" },
     { name = "pydantic-extra-types" },
+    { name = "pyzstd" },
     { name = "semver" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ff/32/01b0e17e2aade67a18180af4bc2b74942c3f526851ab13e3072ee0655550/hugr-0.15.5.tar.gz", hash = "sha256:538b50c0070fc2e45f3e1394fa862ec131b22390e9acfd89f48e5c1ad122d635", size = 1115293, upload-time = "2026-03-30T12:59:53.771Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/2b/1e/e412e18a0bfd606be5e51b4bb3aba98c24d11c871e89a9a7aa59d84595e3/hugr-0.15.5-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b0959494e29cd9f48ce1e6fd11deb24b009569abb3a2c7216fd90602adf1a850", size = 3287674, upload-time = "2026-03-30T12:59:50.234Z" },
-    { url = "https://files.pythonhosted.org/packages/03/d1/7a02ea4508d4fe36a028c52efaedf8141976a98ea194b3ae1fefdfae92a9/hugr-0.15.5-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:3d968983df1b1fa89f4523b45438d3bcba4cddfa0e7365ecffc0116613a8aba7", size = 2937104, upload-time = "2026-03-30T12:59:46.14Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/28/1c15cd8c33d0dbd2bf0f63a451951c9f0eec06d2fd4b61678253197b8026/hugr-0.15.5-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5a84c326d5a16bf6cb83db3872e73fbe2a3a0134a998aa4d00364a8cb5f1e2e", size = 3259115, upload-time = "2026-03-30T12:59:04.481Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/66/e2c335fb57e5f1d2ea09782a7dbd82de8e49e3290653d3f3e79e32754af5/hugr-0.15.5-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7a106cd91076aa7a8392ca7266aec9785b92cb7bedbe3a2d06569ca5783dcc19", size = 3243950, upload-time = "2026-03-30T12:59:08.566Z" },
-    { url = "https://files.pythonhosted.org/packages/48/28/54219692e40e38cb3fac6afa336f898db033e1bf434fd452a867a40a2be1/hugr-0.15.5-cp310-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86ae8263cc5b576fb2d1403e3b1326db9f8f01e316f4588842b53f6b2fa3868b", size = 3666466, upload-time = "2026-03-30T12:59:12.096Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/0e/0e14fc7e7eb402256d906bcd6aa40cd06d3394d41680a960805c22757520/hugr-0.15.5-cp310-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92e10d5be181a49c381b11082a2a2c5973cb5f139b5eb077bb0b3aabe217be84", size = 3739167, upload-time = "2026-03-30T12:59:16.236Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/1d/7d0f934e24e8a8ad0477d941529774cf09a2f13cf1e6cace5f9bd040be56/hugr-0.15.5-cp310-abi3-manylinux_2_28_i686.whl", hash = "sha256:00d700296911b0d6dd7a9854899bd3cca7d77ebc24a999a4117d0120fead88d0", size = 3513669, upload-time = "2026-04-01T17:50:20.969Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/62/df3c0cf828757965da82abea5f3fe03e3eeb77452fdc88e31b2c35581675/hugr-0.15.5-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a0a536c5eb547b64f4d820229397ef20f0fffcb929559ec00bf4ba8a66927143", size = 3541999, upload-time = "2026-04-01T17:50:24.524Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/ca/531363852d9fe16e14453e52b1e2420deda99a84fdcfb56f622c52357d0c/hugr-0.15.5-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d1b1cc6671da1f75f04205fceaf6d2096b3c2e3a584e9bc78ad4fa6694d6cb75", size = 3467535, upload-time = "2026-03-30T12:59:26.551Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/e5/a82d0007c3136319ae8280a9b752fe224a0b6700aeafdb6b641924e2dd60/hugr-0.15.5-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:b3cbeb904291afc99ec9cd617fbf0b87e4e1813db2d5b4c7a880c5c68f380ee9", size = 3523569, upload-time = "2026-03-30T12:59:30.427Z" },
-    { url = "https://files.pythonhosted.org/packages/19/04/257cdcead28cf9279bc8864338a3d4c33dec454a6217c3ebd207f648c943/hugr-0.15.5-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:7b7fed0cf9ab91d238d42003a1eb33bf673a2005e9f838f6424fa857784f10ba", size = 3607051, upload-time = "2026-03-30T12:59:34.322Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/9a/468df655f5e44bdb56c1cfa2e2489d0cf20862a03f878403bad9cf4c9e2b/hugr-0.15.5-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2ac6c9cfc2e2c3540108723e8f9f0bfd08b767d94195a7b5e02143345336988f", size = 3763498, upload-time = "2026-03-30T12:59:38.997Z" },
-    { url = "https://files.pythonhosted.org/packages/29/63/8b01ae8d59f8220fd03504eb0ff771f40dd803fcf75160e3c8b65bacac05/hugr-0.15.5-cp310-abi3-win32.whl", hash = "sha256:5996a0d93d23ab3b192ce99c03215e456fc0754566d04694e1bb8404d7931c36", size = 2897922, upload-time = "2026-03-30T12:59:44.406Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/4d/ba3a51e3dbacaf23a0d7c4bead0b8e52e7744754d942875102b6f7d91e3f/hugr-0.15.5-cp310-abi3-win_amd64.whl", hash = "sha256:4ca15068cd32199c52cd93225686f7a1416b478f534905026593ecf8ae82d38f", size = 3120930, upload-time = "2026-03-30T12:59:42.705Z" },
-    { url = "https://files.pythonhosted.org/packages/10/aa/8791aee20794f282653b0d3d775265cdf72438c23d27e624172db92fb813/hugr-0.15.5-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:2a101e02e6288556029b269faf9338acafd30635b9832971fc8b2f9f465e9fc5", size = 3286019, upload-time = "2026-03-30T12:59:52.202Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/c3/592b85d664fc52a4ac5312c44b42e6421a02f8de1aec5f865603ec1b48b9/hugr-0.15.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e7efbd8a56ef70690ae161b292d96eb8ac766e8fa1dc466ceb8686803fb81d72", size = 2936026, upload-time = "2026-03-30T12:59:48.254Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/16/169de9863b41c6bec0493fd3cdc7c35d20d7cbf93079f234ff24bb7a9ba3/hugr-0.15.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c11e28b43569cdb049cbb21b99f8bee15be516157d0c336421b9fc5c270642a", size = 3256239, upload-time = "2026-03-30T12:59:06.557Z" },
-    { url = "https://files.pythonhosted.org/packages/69/54/83d94589c289c2a1df7d871af548e23f23a658be1f455f68f4abb744a2a9/hugr-0.15.5-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:55f9fb9863733f347374afba74ce91f3cd8a1b39e7db2c2474def16004688e62", size = 3243114, upload-time = "2026-03-30T12:59:10.41Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/07/a13d1f373a6dcc6ee8fc079b936cc450c510333dbcf4a000112343e91c15/hugr-0.15.5-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6dacc70fc00fbc69cb24145027609c136d0380cb7de27e6f60755c990c8d64e1", size = 3511629, upload-time = "2026-03-30T12:59:19.719Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/ac/173ea1a8b6c7768f3a496302a5e61581bfc1d636c1356d01b227d86ed8a1/hugr-0.15.5-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fe5b8ce5e17982adc6427ff590e9e4a535b2e449b009706ec760c2e9213f3f6", size = 3662265, upload-time = "2026-03-30T12:59:14.247Z" },
-    { url = "https://files.pythonhosted.org/packages/29/44/60e22b8d61dcc35009aa06ad63f300061ba8e24aeb9ea16193350eee1f38/hugr-0.15.5-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0b9c4ad0d637edeb8a45610e1d9a71c4137e064d4a2dc1334e2099e9ffdeeaac", size = 3737613, upload-time = "2026-03-30T12:59:18.06Z" },
-    { url = "https://files.pythonhosted.org/packages/67/49/4c648af0c5f9521fd089a8803f9f6c2dd1c31e636c21f0e701bfa2da9030/hugr-0.15.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ea20c8ab98aa0d865577a0af1b11598b782ae5e6e076b55e8421f318288f16c", size = 3534945, upload-time = "2026-03-30T12:59:23.222Z" },
-    { url = "https://files.pythonhosted.org/packages/10/b0/4f2b061e78bcc871865e698a5120bd2504a45bb0aaa0007ae6f9fa2c2b06/hugr-0.15.5-cp313-cp313t-manylinux_2_28_i686.whl", hash = "sha256:68511a127b73c10c8cab08ca6e92b2f96fd111d7e2a3bc86eda64ecc7cb8e6ad", size = 3514687, upload-time = "2026-04-01T17:50:35.44Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/36/f11007c43ea13c5b5c7f50ce3b40dad1bb4551e53dd12e3ca908c071b483/hugr-0.15.5-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:f50260b4e330d3bb06c1e7acf87194dfbf090b3f608536e77fe9562f2ed8d57c", size = 3543491, upload-time = "2026-04-01T17:50:39.132Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/be/1d61f890ed50a5ad3b826dadb127fa297a1022d43573a122a619d09a3b2c/hugr-0.15.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0376cda72bb3075a94cdfab2d0dc4e1aec79284f21855178057bdf143b4c0ddd", size = 3465437, upload-time = "2026-03-30T12:59:28.4Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/e2/3a13a9b44842baaef1efac6d470bb974d956eaa7f7ab602a34f557b2df8a/hugr-0.15.5-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:2fd3dcc9731bc357a4efd71c667e7d33ddaac78c1a6bb3de987583a4429651b6", size = 3522732, upload-time = "2026-03-30T12:59:32.328Z" },
-    { url = "https://files.pythonhosted.org/packages/52/62/a396c7c8a40acd2c8c394f25f32165cbbfbe8db989cc218b3d509844010f/hugr-0.15.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:37efbdd3cfd96a692a026bd4bd8bae2795ae3374786c57f053819c335a98f24a", size = 3607041, upload-time = "2026-03-30T12:59:36.787Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/2a/735cba4a2fd45aac30e678d196c91b9e6d8707a6a8f1a8a2157c73b11f22/hugr-0.15.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:291f48270b27e183cfca356602d63925ff1a9f51847d35de16366d04fbbfc5ca", size = 3763612, upload-time = "2026-03-30T12:59:40.945Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/20/12/6e653fb7ea01567a220f4fe0b3ae9e435f1e4d8d052d7fc6089195b1322e/hugr-0.14.4.tar.gz", hash = "sha256:0f39810cde20aa22741ceec236ce8936a50cf16b72b259e561fd205a8c02ea08", size = 1076949, upload-time = "2025-11-26T16:15:07.355Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cd/f9/3af5afddee55a5c42fc2b3ccd85a7c23b284f6570822a0171a1157622866/hugr-0.14.4-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:93c9c56ddc0b5e66e4a6a19f95e94f95c2ce764c2763a8abf3329fb1c12c7cdd", size = 3831727, upload-time = "2025-11-26T16:14:50.634Z" },
+    { url = "https://files.pythonhosted.org/packages/04/18/b854c36821819a0f96fdcbf6849f0b29a057e52bda101c2081f705dfeb64/hugr-0.14.4-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:196d1c4eafa4b86502267809b316fcc9d9f5bd30a9052991ee4cd9140a99a969", size = 3406301, upload-time = "2025-11-26T16:14:46.918Z" },
+    { url = "https://files.pythonhosted.org/packages/00/2b/335abcb77657074c7bd1d32fe78acf4bfb86243b84ca4537006aa2ed88ad/hugr-0.14.4-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8affb8819f864dcbc7b9a32bb9efbc66d45b6656d5446afb2078cf1c1c15b63", size = 3719565, upload-time = "2025-11-26T16:14:13.276Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/92/e048e852e58d948c9a3d84daf8b5f124642050a7a200e22e0923025d6dd9/hugr-0.14.4-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1bc1960987198c024300b8baf5415bda825a786329fe7d9c4b5bf4c798809d8c", size = 3774829, upload-time = "2025-11-26T16:14:20.189Z" },
+    { url = "https://files.pythonhosted.org/packages/df/3d/a537e46cf2777538b33664150a36de5688ad9b79285765d3f47c1e208319/hugr-0.14.4-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d39ea3725047505458d63a5f3f9e1b7ea248fa62f8fab3a94c3ee5fefde36197", size = 4032332, upload-time = "2025-11-26T16:14:40.093Z" },
+    { url = "https://files.pythonhosted.org/packages/92/01/108efd9b71dcf3c80863c1d12bcb39eaff1052472b62b0ff06cf73607f76/hugr-0.14.4-cp310-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dfcd7656edbf746c49a28391ffadd995eabdeeb47c79b4b6d286996d175a67f6", size = 4187215, upload-time = "2025-11-26T16:14:26.906Z" },
+    { url = "https://files.pythonhosted.org/packages/16/23/b4699cbf9e50d7657ce7e179d873980e25416ca2a56c7d97234cc9528bc2/hugr-0.14.4-cp310-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbe48b812fe2f2c0eeb8ffc38bca08d34d88449f58eb139e77b5fc0c5a3e73c0", size = 4383840, upload-time = "2025-11-26T16:14:34.108Z" },
+    { url = "https://files.pythonhosted.org/packages/37/c9/081bb993bdf097a67b7ec9292cd5027f940666972d8631580ab346ed547a/hugr-0.14.4-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdb23f2b40e56875752ed739bf1c8bc3ae14bbdacde298e1c7ed0641b27ff853", size = 4068208, upload-time = "2025-11-26T16:14:43.755Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/8b/4f6ae5ca60115afc3a925b197c3504143a454cbfb84615abe7e6a4c666f4/hugr-0.14.4-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:772d631a6ad5099021f2d95d45ade7c543889d49b36d1b03fc072d3da3caa9a2", size = 3942511, upload-time = "2025-11-26T16:14:53.466Z" },
+    { url = "https://files.pythonhosted.org/packages/11/5e/851c84aa0b15779318f57c25d19aaca46e103efb982238c7ad6771a0d49e/hugr-0.14.4-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c7fc26b710141d521951be5b5d85ec4fa0d583ca09b4c6a7e4d1e0d18864072d", size = 4053248, upload-time = "2025-11-26T16:14:56.593Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/fc/38ca1caec06bb019fc3ed8a9f86f96ce0a26db7d0ca1496cf13d96ab51c0/hugr-0.14.4-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:b0f0455ec23fbc60dc9a5770c7c996baa544d96547622de0ba3e9ae4f27ab667", size = 4078485, upload-time = "2025-11-26T16:15:00.721Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/eb/6c851799c7cf5899253178928e6579322c50793b23abf152c093a4d9952f/hugr-0.14.4-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a40bda1a592a02560d8735cc1f0514e04dfc4d6fdd95ece8a63819708b3e6b12", size = 4265564, upload-time = "2025-11-26T16:15:04.257Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/17/cecf6b74c6896ae85ed98962a69c651f988bd7208c437c68a75ff5846114/hugr-0.14.4-cp310-abi3-win32.whl", hash = "sha256:8b930de9755cf8fa649775484a3ab360271fdd29a1aa0ec5570204e80824a71a", size = 3378991, upload-time = "2025-11-26T16:15:10.638Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/2a/9a56a0da944ce7daf754cac339270acb47a15d68c7bc84f3df366be64eb5/hugr-0.14.4-cp310-abi3-win_amd64.whl", hash = "sha256:1987f8b5078dc40b674203acb5952d02f1b72728997123652a01f5fface882a0", size = 3606113, upload-time = "2025-11-26T16:15:08.762Z" },
+    { url = "https://files.pythonhosted.org/packages/28/56/caac43ee7fcc69edbc3d32ec5796ab072d973b5b21c0ce389c18e202dca6/hugr-0.14.4-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:a1cdef74e8b5bd3831f5f635dbde78debb5bd3f9c997b78747542f1256d499fb", size = 3849266, upload-time = "2025-11-26T16:14:52.047Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/b0/d5e3187560aa0d84044863e24a3df8cb1706ae86315510d79905c53b29da/hugr-0.14.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e9880f01550807a3fcb9acbb062c1ae9601328bfe4684580ca04d543d3131eb0", size = 3408154, upload-time = "2025-11-26T16:14:49.019Z" },
+    { url = "https://files.pythonhosted.org/packages/05/ff/817a4595b68c2ef56ddfea82ec20c0247ebbcad1e721b2767ff513ece73b/hugr-0.14.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6b0a67e1740fb78d66ab31561be4d6374f480cab115f6314d6bbc8d26704a73", size = 3715760, upload-time = "2025-11-26T16:14:15.307Z" },
+    { url = "https://files.pythonhosted.org/packages/54/19/53574f3dbcf6b5c3602c71d017fda201196f5ea13c365012c12e3de8d87f/hugr-0.14.4-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a3aedae6e89aaebadfbbd201e35b07be2a683bb23307b2bbdcac410a170c0d11", size = 3771058, upload-time = "2025-11-26T16:14:21.889Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/45/3094b9254030a6443b83ff70114edf90dfbaf1356d387677335107509186/hugr-0.14.4-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d32813dafd772ac067d68d5bd2ed3f1a98d8912a2908db68a137ad75d4a88e7", size = 4038863, upload-time = "2025-11-26T16:14:42.014Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/7b/4065d535076a61a9ae004b664aa8379a3c493dd768336deb927bc26f4701/hugr-0.14.4-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:983f26f1cf0bf196c3aca3d0f97d7ddf27ec7cc8753bea8d358b4afbe904029c", size = 4185665, upload-time = "2025-11-26T16:14:28.733Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/a5/d5c5c7ad832c0b817743df9a4db58c72025c155d7e78ae12262975d2e527/hugr-0.14.4-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b453bc489b5c4e4746c980c935a79ada189988c10a2f77a6c2a10e04bae0b89", size = 4379657, upload-time = "2025-11-26T16:14:35.633Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/7b/5083c0cc249e53082443323fe493ac6d18388f181b82d6bfa479b523b7d9/hugr-0.14.4-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:328eb29099b3cc1ddbc5479a6bb7e1a437f170f4e85ee27b3d35357ef2caf369", size = 4066330, upload-time = "2025-11-26T16:14:45.255Z" },
+    { url = "https://files.pythonhosted.org/packages/87/e2/46cc78aff8da4544993c49638ff801a6a9a44149bf20f7e5917b18dd9fef/hugr-0.14.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1fd38be47ac27ccb9edb124a18c3b14e545708b8bbed89c8c20d1b2e14a4e9b2", size = 3939452, upload-time = "2025-11-26T16:14:54.9Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/ba/f027273d8d3a6ebcfa9e6954502615050293e695dc8e38824164ef9abea1/hugr-0.14.4-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:b59dc1972a07a2cee4d8e53d24020634d70d7d923fe0592c977e0c3de8d538ed", size = 4048708, upload-time = "2025-11-26T16:14:58.135Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/27/bdb89dc5ac0f1dce5e8f79a47eaa30a90aed671329107f6f305330e4e043/hugr-0.14.4-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:19392043a13c53bb49cfb9ffabc6a084f6a1b563a51be3bef6d4e7f78b96aee6", size = 4088139, upload-time = "2025-11-26T16:15:02.8Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/e1/d1571d13f49d54cb4b84dbcc5d81e3548761b15abe019b97abdf7606c8a2/hugr-0.14.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c88b23a70ccd780f30148784eaad7cfd7384c459f5e221870d901922a6300492", size = 4259853, upload-time = "2025-11-26T16:15:05.828Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/75/e49bd706b9be6005da9a838e94d2a15cd96373839ddcc88fb38b7798fd29/hugr-0.14.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c29d997d8d210ebde9d07305833cd80b2425c1419b6554042d150c314ccd1c4", size = 3719146, upload-time = "2025-11-26T16:14:16.785Z" },
+    { url = "https://files.pythonhosted.org/packages/68/73/f8ec89ff697b20e244844f2bdf4aaa07b3ee063d845861659479582b0aa2/hugr-0.14.4-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57b0487a13382005d0ee0e908ae349272bdc03c3537202a0a46e9a07c4fc112c", size = 3773374, upload-time = "2025-11-26T16:14:23.602Z" },
+    { url = "https://files.pythonhosted.org/packages/16/81/a82554d06dcee43b8580156c9e5ca1bafbb2eef6cb54d674b2c49bc0a21f/hugr-0.14.4-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bd82eddfcd20d26c96aa8bc8be8ab867f1c231902ae4200b570f405f0eb7ca9", size = 4185152, upload-time = "2025-11-26T16:14:30.574Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/a8/ae3b32d52c682545b66a3e73481565050e15d47ecf374da7658e5d8f60a5/hugr-0.14.4-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea4264d9d93b9684b1b5d47a8f5d7a23e73631e4d9cbd7d34008da37f2402913", size = 4376569, upload-time = "2025-11-26T16:14:37.123Z" },
 ]
 
 [[package]]
@@ -1314,7 +1322,8 @@ name = "ipython"
 version = "8.39.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version < '3.11'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "colorama", marker = "python_full_version < '3.11' and sys_platform == 'win32'" },
@@ -1339,7 +1348,8 @@ name = "ipython"
 version = "9.10.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.11.*'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "colorama", marker = "python_full_version == '3.11.*' and sys_platform == 'win32'" },
@@ -1364,8 +1374,10 @@ name = "ipython"
 version = "9.12.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version >= '3.12' and python_full_version < '3.14'",
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "colorama", marker = "python_full_version >= '3.12' and sys_platform == 'win32'" },
@@ -1623,7 +1635,7 @@ dependencies = [
     { name = "overrides", marker = "python_full_version < '3.12'" },
     { name = "packaging" },
     { name = "prometheus-client" },
-    { name = "pywinpty", marker = "os_name == 'nt'" },
+    { name = "pywinpty", marker = "(os_name == 'nt' and platform_machine != 'x86_64') or (os_name == 'nt' and sys_platform != 'darwin')" },
     { name = "pyzmq" },
     { name = "send2trash" },
     { name = "terminado" },
@@ -1641,7 +1653,7 @@ name = "jupyter-server-terminals"
 version = "0.5.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pywinpty", marker = "os_name == 'nt'" },
+    { name = "pywinpty", marker = "(os_name == 'nt' and platform_machine != 'x86_64') or (os_name == 'nt' and sys_platform != 'darwin')" },
     { name = "terminado" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f4/a7/bcd0a9b0cbba88986fe944aaaf91bfda603e5a50bda8ed15123f381a3b2f/jupyter_server_terminals-0.5.4.tar.gz", hash = "sha256:bbda128ed41d0be9020349f9f1f2a4ab9952a73ed5f5ac9f1419794761fb87f5", size = 31770, upload-time = "2026-01-14T16:53:20.213Z" }
@@ -1905,6 +1917,62 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e3/4c/02df1befee243e4c14bf5740c391178ba4f7b4602ff08936da170341afe9/lief-0.17.6-cp314-cp314-win_arm64.whl", hash = "sha256:7dcefa6467f0f0d75413a10e7869e488344347f0c67eff5bc49ec216714f0674", size = 3462306, upload-time = "2026-03-18T06:58:54.937Z" },
 ]
 
+[[package]]
+name = "llvmlite"
+version = "0.45.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/99/8d/5baf1cef7f9c084fb35a8afbde88074f0d6a727bc63ef764fe0e7543ba40/llvmlite-0.45.1.tar.gz", hash = "sha256:09430bb9d0bb58fc45a45a57c7eae912850bedc095cd0810a57de109c69e1c32", size = 185600, upload-time = "2025-10-01T17:59:52.046Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cf/6d/585c84ddd9d2a539a3c3487792b3cf3f988e28ec4fa281bf8b0e055e1166/llvmlite-0.45.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:1b1af0c910af0978aa55fa4f60bbb3e9f39b41e97c2a6d94d199897be62ba07a", size = 43043523, upload-time = "2025-10-01T18:02:58.621Z" },
+    { url = "https://files.pythonhosted.org/packages/04/ad/9bdc87b2eb34642c1cfe6bcb4f5db64c21f91f26b010f263e7467e7536a3/llvmlite-0.45.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:60f92868d5d3af30b4239b50e1717cb4e4e54f6ac1c361a27903b318d0f07f42", size = 43043526, upload-time = "2025-10-01T18:03:15.051Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/7c/82cbd5c656e8991bcc110c69d05913be2229302a92acb96109e166ae31fb/llvmlite-0.45.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:28e763aba92fe9c72296911e040231d486447c01d4f90027c8e893d89d49b20e", size = 43043524, upload-time = "2025-10-01T18:03:30.666Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/e2/c185bb7e88514d5025f93c6c4092f6120c6cea8fe938974ec9860fb03bbb/llvmlite-0.45.1-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:d9ea9e6f17569a4253515cc01dade70aba536476e3d750b2e18d81d7e670eb15", size = 43043524, upload-time = "2025-10-01T18:03:43.249Z" },
+]
+
+[[package]]
+name = "llvmlite"
+version = "0.47.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/01/88/a8952b6d5c21e74cbf158515b779666f692846502623e9e3c39d8e8ba25f/llvmlite-0.47.0.tar.gz", hash = "sha256:62031ce968ec74e95092184d4b0e857e444f8fdff0b8f9213707699570c33ccc", size = 193614, upload-time = "2026-03-31T18:29:53.497Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/f5/a1bde3aa8c43524b0acaf3f72fb3d80a32dd29dbb42d7dc434f84584cdcc/llvmlite-0.47.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41270b0b1310717f717cf6f2a9c68d3c43bd7905c33f003825aebc361d0d1b17", size = 37232772, upload-time = "2026-03-31T18:28:12.198Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/fb/76d88fc05ee1f9c1a6efe39eb493c4a727e5d1690412469017cd23bcb776/llvmlite-0.47.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f9d118bc1dd7623e0e65ca9ac485ec6dd543c3b77bc9928ddc45ebd34e1e30a7", size = 56275179, upload-time = "2026-03-31T18:28:15.725Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/08/29da7f36217abd56a0c389ef9a18bea47960826e691ced1a36c92c6ce93c/llvmlite-0.47.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ea5cfb04a6ab5b18e46be72b41b015975ba5980c4ddb41f1975b83e19031063", size = 55128632, upload-time = "2026-03-31T18:28:19.946Z" },
+    { url = "https://files.pythonhosted.org/packages/df/f8/5e12e9ed447d65f04acf6fcf2d79cded2355640b5131a46cee4c99a5949d/llvmlite-0.47.0-cp310-cp310-win_amd64.whl", hash = "sha256:166b896a2262a2039d5fc52df5ee1659bd1ccd081183df7a2fba1b74702dd5ea", size = 38138402, upload-time = "2026-03-31T18:28:23.327Z" },
+    { url = "https://files.pythonhosted.org/packages/34/0b/b9d1911cfefa61399821dfb37f486d83e0f42630a8d12f7194270c417002/llvmlite-0.47.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74090f0dcfd6f24ebbef3f21f11e38111c4d7e6919b54c4416e1e357c3446b07", size = 37232770, upload-time = "2026-03-31T18:28:26.765Z" },
+    { url = "https://files.pythonhosted.org/packages/46/27/5799b020e4cdfb25a7c951c06a96397c135efcdc21b78d853bbd9c814c7d/llvmlite-0.47.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ca14f02e29134e837982497959a8e2193d6035235de1cb41a9cb2bd6da4eedbb", size = 56275177, upload-time = "2026-03-31T18:28:31.01Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/51/48a53fedf01cb1f3f43ef200be17ebf83c8d9a04018d3783c1a226c342c2/llvmlite-0.47.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12a69d4bb05f402f30477e21eeabe81911e7c251cecb192bed82cd83c9db10d8", size = 55128631, upload-time = "2026-03-31T18:28:36.046Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/50/59227d06bdc96e23322713c381af4e77420949d8cd8a042c79e0043096cc/llvmlite-0.47.0-cp311-cp311-win_amd64.whl", hash = "sha256:c37d6eb7aaabfa83ab9c2ff5b5cdb95a5e6830403937b2c588b7490724e05327", size = 38138400, upload-time = "2026-03-31T18:28:40.076Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/48/4b7fe0e34c169fa2f12532916133e0b219d2823b540733651b34fdac509a/llvmlite-0.47.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:306a265f408c259067257a732c8e159284334018b4083a9e35f67d19792b164f", size = 37232769, upload-time = "2026-03-31T18:28:43.735Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/4b/e3f2cd17822cf772a4a51a0a8080b0032e6d37b2dbe8cfb724eac4e31c52/llvmlite-0.47.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5853bf26160857c0c2573415ff4efe01c4c651e59e2c55c2a088740acfee51cd", size = 56275178, upload-time = "2026-03-31T18:28:48.342Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/55/a3b4a543185305a9bdf3d9759d53646ed96e55e7dfd43f53e7a421b8fbae/llvmlite-0.47.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:003bcf7fa579e14db59c1a1e113f93ab8a06b56a4be31c7f08264d1d4072d077", size = 55128632, upload-time = "2026-03-31T18:28:52.901Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/f5/d281ae0f79378a5a91f308ea9fdb9f9cc068fddd09629edc0725a5a8fde1/llvmlite-0.47.0-cp312-cp312-win_amd64.whl", hash = "sha256:f3079f25bdc24cd9d27c4b2b5e68f5f60c4fdb7e8ad5ee2b9b006007558f9df7", size = 38138692, upload-time = "2026-03-31T18:28:57.147Z" },
+    { url = "https://files.pythonhosted.org/packages/77/6f/4615353e016799f80fa52ccb270a843c413b22361fadda2589b2922fb9b0/llvmlite-0.47.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a3c6a735d4e1041808434f9d440faa3d78d9b4af2ee64d05a66f351883b6ceec", size = 37232771, upload-time = "2026-03-31T18:29:01.324Z" },
+    { url = "https://files.pythonhosted.org/packages/31/b8/69f5565f1a280d032525878a86511eebed0645818492feeb169dfb20ae8e/llvmlite-0.47.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2699a74321189e812d476a43d6d7f652f51811e7b5aad9d9bba842a1c7927acb", size = 56275178, upload-time = "2026-03-31T18:29:05.748Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/da/b32cafcb926fb0ce2aa25553bf32cb8764af31438f40e2481df08884c947/llvmlite-0.47.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c6951e2b29930227963e53ee152441f0e14be92e9d4231852102d986c761e40", size = 55128632, upload-time = "2026-03-31T18:29:11.235Z" },
+    { url = "https://files.pythonhosted.org/packages/46/9f/4898b44e4042c60fafcb1162dfb7014f6f15b1ec19bf29cfea6bf26df90d/llvmlite-0.47.0-cp313-cp313-win_amd64.whl", hash = "sha256:c2e9adf8698d813a9a5efb2d4370caf344dbc1e145019851fee6a6f319ba760e", size = 38138695, upload-time = "2026-03-31T18:29:15.43Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/d4/33c8af00f0bf6f552d74f3a054f648af2c5bc6bece97972f3bfadce4f5ec/llvmlite-0.47.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:de966c626c35c9dff5ae7bf12db25637738d0df83fc370cf793bc94d43d92d14", size = 37232773, upload-time = "2026-03-31T18:29:19.453Z" },
+    { url = "https://files.pythonhosted.org/packages/64/1d/a760e993e0c0ba6db38d46b9f48f6c7dceb8ac838824997fb9e25f97bc04/llvmlite-0.47.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ddbccff2aeaff8670368340a158abefc032fe9b3ccf7d9c496639263d00151aa", size = 56275176, upload-time = "2026-03-31T18:29:24.149Z" },
+    { url = "https://files.pythonhosted.org/packages/84/3b/e679bc3b29127182a7f4aa2d2e9e5bea42adb93fb840484147d59c236299/llvmlite-0.47.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4a7b778a2e144fc64468fb9bf509ac1226c9813a00b4d7afea5d988c4e22fca", size = 55128631, upload-time = "2026-03-31T18:29:29.536Z" },
+    { url = "https://files.pythonhosted.org/packages/be/f7/19e2a09c62809c9e63bbd14ce71fb92c6ff7b7b3045741bb00c781efc3c9/llvmlite-0.47.0-cp314-cp314-win_amd64.whl", hash = "sha256:694e3c2cdc472ed2bd8bd4555ca002eec4310961dd58ef791d508f57b5cc4c94", size = 39153826, upload-time = "2026-03-31T18:29:33.681Z" },
+    { url = "https://files.pythonhosted.org/packages/40/a1/581a8c707b5e80efdbbe1dd94527404d33fe50bceb71f39d5a7e11bd57b7/llvmlite-0.47.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:92ec8a169a20b473c1c54d4695e371bde36489fc1efa3688e11e99beba0abf9c", size = 37232772, upload-time = "2026-03-31T18:29:37.952Z" },
+    { url = "https://files.pythonhosted.org/packages/11/03/16090dd6f74ba2b8b922276047f15962fbeea0a75d5601607edb301ba945/llvmlite-0.47.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa1cbd800edd3b20bc141521f7fd45a6185a5b84109aa6855134e81397ffe72b", size = 56275178, upload-time = "2026-03-31T18:29:42.58Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/cb/0abf1dd4c5286a95ffe0c1d8c67aec06b515894a0dd2ac97f5e27b82ab0b/llvmlite-0.47.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6725179b89f03b17dabe236ff3422cb8291b4c1bf40af152826dfd34e350ae8", size = 55128632, upload-time = "2026-03-31T18:29:46.939Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/79/d3bbab197e86e0ff4f9c07122895b66a3e0d024247fcff7f12c473cb36d9/llvmlite-0.47.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6842cf6f707ec4be3d985a385ad03f72b2d724439e118fcbe99b2929964f0453", size = 39153839, upload-time = "2026-03-31T18:29:51.004Z" },
+]
+
 [[package]]
 name = "markdown"
 version = "3.10.2"
@@ -2117,26 +2185,26 @@ wheels = [
 
 [[package]]
 name = "maturin"
-version = "1.12.6"
+version = "1.13.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "tomli", marker = "python_full_version < '3.11'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/0c/18/8b2eebd3ea086a5ec73d7081f95ec64918ceda1900075902fc296ea3ad55/maturin-1.12.6.tar.gz", hash = "sha256:d37be3a811a7f2ee28a0fa0964187efa50e90f21da0c6135c27787fa0b6a89db", size = 269165, upload-time = "2026-03-01T14:54:04.21Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/39/16/b284a7bc4af3dd87717c784278c1b8cb18606ad1f6f7a671c47bfd9c3df0/maturin-1.13.1.tar.gz", hash = "sha256:9a87ff3b8e4d1c6eac33ebfe8e261e8236516d98d45c0323550621819b5a1a2f", size = 340369, upload-time = "2026-04-09T15:14:07.026Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/71/8b/9ddfde8a485489e3ebdc50ee3042ef1c854f00dfea776b951068f6ffe451/maturin-1.12.6-py3-none-linux_armv6l.whl", hash = "sha256:6892b4176992fcc143f9d1c1c874a816e9a041248eef46433db87b0f0aff4278", size = 9789847, upload-time = "2026-03-01T14:54:09.172Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/e8/5f7fd3763f214a77ac0388dbcc71cc30aec5490016bd0c8e6bd729fc7b0a/maturin-1.12.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:c0c742beeeef7fb93b6a81bd53e75507887e396fd1003c45117658d063812dad", size = 19023833, upload-time = "2026-03-01T14:53:46.743Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/7f/706ff3839c8b2046436d4c2bc97596c558728264d18abc298a1ad862a4be/maturin-1.12.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2cb41139295eed6411d3cdafc7430738094c2721f34b7eeb44f33cac516115dc", size = 9821620, upload-time = "2026-03-01T14:54:12.04Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/9c/70917fb123c8dd6b595e913616c9c72d730cbf4a2b6cac8077dc02a12586/maturin-1.12.6-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:351f3af1488a7cbdcff3b6d8482c17164273ac981378a13a4a9937a49aec7d71", size = 9849107, upload-time = "2026-03-01T14:53:48.971Z" },
-    { url = "https://files.pythonhosted.org/packages/59/ea/f1d6ad95c0a12fbe761a7c28a57540341f188564dbe8ad730a4d1788cd32/maturin-1.12.6-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:6dbddfe4dc7ddee60bbac854870bd7cfec660acb54d015d24597d59a1c828f61", size = 10242855, upload-time = "2026-03-01T14:53:44.605Z" },
-    { url = "https://files.pythonhosted.org/packages/93/1b/2419843a4f1d2fb4747f3dc3d9c4a2881cd97a3274dd94738fcdf0835e79/maturin-1.12.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:8fdb0f63e77ee3df0f027a120e9af78dbc31edf0eb0f263d55783c250c33b728", size = 9674972, upload-time = "2026-03-01T14:53:52.763Z" },
-    { url = "https://files.pythonhosted.org/packages/71/46/b60ab2fc996d904b40e55bd475599dcdccd8f7ad3e649bf95e87970df466/maturin-1.12.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:fa84b7493a2e80759cacc2e668fa5b444d55b9994e90707c42904f55d6322c1e", size = 9645755, upload-time = "2026-03-01T14:53:58.497Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/96/03f2b55a8c226805115232fc23c4a4f33f0c9d39e11efab8166dc440f80d/maturin-1.12.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:e90dc12bc6a38e9495692a36c9e231c4d7e0c9bfde60719468ab7d8673db3c45", size = 12737612, upload-time = "2026-03-01T14:54:05.393Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/c2/648667022c5b53cdccefa67c245e8a984970f3045820f00c2e23bdb2aff4/maturin-1.12.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:06fc8d089f98623ce924c669b70911dfed30f9a29956c362945f727f9abc546b", size = 10455028, upload-time = "2026-03-01T14:54:07.349Z" },
-    { url = "https://files.pythonhosted.org/packages/63/d6/5b5efe3ca0c043357ed3f8d2b2d556169fdbf1ff75e50e8e597708a359d2/maturin-1.12.6-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:75133e56274d43b9227fd49dca9a86e32f1fd56a7b55544910c4ce978c2bb5aa", size = 10014531, upload-time = "2026-03-01T14:53:54.548Z" },
-    { url = "https://files.pythonhosted.org/packages/68/d5/39c594c27b1a8b32a0cb95fff9ad60b888c4352d1d1c389ac1bd20dc1e16/maturin-1.12.6-py3-none-win32.whl", hash = "sha256:3f32e0a3720b81423c9d35c14e728cb1f954678124749776dc72d533ea1115e8", size = 8553012, upload-time = "2026-03-01T14:53:50.706Z" },
-    { url = "https://files.pythonhosted.org/packages/94/66/b262832a91747e04051e21f986bd01a8af81fbffafacc7d66a11e79aab5f/maturin-1.12.6-py3-none-win_amd64.whl", hash = "sha256:977290159d252db946054a0555263c59b3d0c7957135c69e690f4b1558ee9983", size = 9890470, upload-time = "2026-03-01T14:53:56.659Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/47/76b8ca470ddc8d7d36aa8c15f5a6aed1841806bb93a0f4ead8ee61e9a088/maturin-1.12.6-py3-none-win_arm64.whl", hash = "sha256:bae91976cdc8148038e13c881e1e844e5c63e58e026e8b9945aa2d19b3b4ae89", size = 8606158, upload-time = "2026-03-01T14:54:02.423Z" },
+    { url = "https://files.pythonhosted.org/packages/43/4d/a23fc95be881aa8c7a6ea353410417872e4d7065df03d7f3db8f0dbed4a7/maturin-1.13.1-py3-none-linux_armv6l.whl", hash = "sha256:416e4e01cb88b798e606ee43929df897e42c1647b722ef68283816cca99a8742", size = 10102444, upload-time = "2026-04-09T15:13:48.393Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/1e/65c385d65bae95cf04895d52f39dbed8b1453ae55da2903d252ade40a774/maturin-1.13.1-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:72888e87819ce546d0d2df900e4b385e4ef299077d92ee37b48923a5602dae94", size = 19576043, upload-time = "2026-04-09T15:14:08.685Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/13/f6bc868d0bfecd9314870b97f530a167e31f7878ac4945c78245c6eef69c/maturin-1.13.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:98b5fcf1a186c217830a8295ecc2989c6b1cf50945417adfc15252107b9475b7", size = 10117339, upload-time = "2026-04-09T15:13:40.559Z" },
+    { url = "https://files.pythonhosted.org/packages/51/58/279e081305c11c1c1c4fccacf77df8959646c5d4de7a57ec7e787653e270/maturin-1.13.1-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:3da18cccf2f683c0977bff9146a0908d6ffce836d600665736ac01679f588cb9", size = 10139689, upload-time = "2026-04-09T15:13:38.291Z" },
+    { url = "https://files.pythonhosted.org/packages/00/94/69391af5396c6aab723932240803f49e5f3de3dd7c57d32f02d237a0ce32/maturin-1.13.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:6b1e5916a253243e8f5f9e847b62bbc98420eec48c9ce2e2e8724c6da89d359b", size = 10551141, upload-time = "2026-04-09T15:13:42.887Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/bf/4edac2667b49e3733438062ae416413b8fc8d42e1bd499ba15e1fb02fc55/maturin-1.13.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:dc91031e0619c1e28730279ef9ee5f106c9b9ec806b013f888676b242f892eb7", size = 9983094, upload-time = "2026-04-09T15:13:56.868Z" },
+    { url = "https://files.pythonhosted.org/packages/79/94/a6d651cfe8fc6bf2e892c90e3cdbb25c06d81c9115140d03ea1a68a97575/maturin-1.13.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:001741c6cff56aa8ea59a0d78ae990c0550d0e3e82b00b683eedb4158a8ef7e6", size = 9949980, upload-time = "2026-04-09T15:13:59.185Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/d1/82c067464f848e38af9910bce55eb54302b1c1284a279d515dbfcf5994f5/maturin-1.13.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:01c845825c917c07c1d0b2c9032c59c16a7d383d1e649a46481d3e5693c2750f", size = 13186276, upload-time = "2026-04-09T15:13:45.725Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f4/25367baf1025580f047f9b37598bb3fadc416e24536afd4f28e190335c73/maturin-1.13.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f69093ed4a0e6464e52a7fc26d714f859ce15630ec8070743398c6bf41f38a9e", size = 10891837, upload-time = "2026-04-09T15:13:35.68Z" },
+    { url = "https://files.pythonhosted.org/packages/af/be/caafad8ce74974b7deafdf144d12f758993dfea4c66c9905b138f51a7792/maturin-1.13.1-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:c1490584f3c70af45466ee99065b49e6657ebdccac6b10571bb44681309c9396", size = 10351032, upload-time = "2026-04-09T15:14:01.632Z" },
+    { url = "https://files.pythonhosted.org/packages/66/0e/970a721d27cfa410e8bfa0a1e32e6ef52cb8169692110a5fdabe1af3f570/maturin-1.13.1-py3-none-win32.whl", hash = "sha256:c6a720b252c99de072922dbe4432ab19662b6f80045b0355fec23bdfccb450da", size = 8855465, upload-time = "2026-04-09T15:13:51.122Z" },
+    { url = "https://files.pythonhosted.org/packages/88/70/7c1e0d65fa147d5479055a171541c82b8cdfc1c825d85a82240470f14176/maturin-1.13.1-py3-none-win_amd64.whl", hash = "sha256:a2017d2281203d0c6570240e7d746564d766d756105823b7de68bda6ae722711", size = 10230471, upload-time = "2026-04-09T15:13:53.89Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/2a/afe0193b673a79ffd2e01ad999511b7e9e6b49af02bb3759d82a78c3043d/maturin-1.13.1-py3-none-win_arm64.whl", hash = "sha256:2839024dcd65776abb4759e5bca29941971e095574162a4d335191da4be9ff24", size = 8905575, upload-time = "2026-04-09T15:14:03.891Z" },
 ]
 
 [[package]]
@@ -2324,7 +2392,7 @@ wheels = [
 
 [[package]]
 name = "nbconvert"
-version = "7.17.0"
+version = "7.17.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "beautifulsoup4" },
@@ -2342,9 +2410,9 @@ dependencies = [
     { name = "pygments" },
     { name = "traitlets" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/38/47/81f886b699450d0569f7bc551df2b1673d18df7ff25cc0c21ca36ed8a5ff/nbconvert-7.17.0.tar.gz", hash = "sha256:1b2696f1b5be12309f6c7d707c24af604b87dfaf6d950794c7b07acab96dda78", size = 862855, upload-time = "2026-01-29T16:37:48.478Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/b1/708e53fe2e429c103c6e6e159106bcf0357ac41aa4c28772bd8402339051/nbconvert-7.17.1.tar.gz", hash = "sha256:34d0d0a7e73ce3cbab6c5aae8f4f468797280b01fd8bd2ca746da8569eddd7d2", size = 865311, upload-time = "2026-04-08T00:44:14.914Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/0d/4b/8d5f796a792f8a25f6925a96032f098789f448571eb92011df1ae59e8ea8/nbconvert-7.17.0-py3-none-any.whl", hash = "sha256:4f99a63b337b9a23504347afdab24a11faa7d86b405e5c8f9881cd313336d518", size = 261510, upload-time = "2026-01-29T16:37:46.322Z" },
+    { url = "https://files.pythonhosted.org/packages/67/f8/bb0a9d5f46819c821dc1f004aa2cc29b1d91453297dbf5ff20470f00f193/nbconvert-7.17.1-py3-none-any.whl", hash = "sha256:aa85c087b435e7bf1ffd03319f658e285f2b89eccab33bc1ba7025495ab3e7c8", size = 261927, upload-time = "2026-04-08T00:44:12.845Z" },
 ]
 
 [[package]]
@@ -2376,7 +2444,8 @@ name = "networkx"
 version = "3.4.2"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version < '3.11'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
 wheels = [
@@ -2388,9 +2457,12 @@ name = "networkx"
 version = "3.6.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version >= '3.12' and python_full_version < '3.14'",
-    "python_full_version == '3.11.*'",
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
 wheels = [
@@ -2439,7 +2511,8 @@ name = "numpy"
 version = "2.2.6"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version < '3.11'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
 wheels = [
@@ -2504,9 +2577,12 @@ name = "numpy"
 version = "2.4.4"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version >= '3.12' and python_full_version < '3.14'",
-    "python_full_version == '3.11.*'",
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587, upload-time = "2026-03-29T13:22:01.298Z" }
 wheels = [
@@ -2875,7 +2951,7 @@ dev = [
     { name = "jupyter", specifier = ">=1.1.1" },
     { name = "markdown-exec", extras = ["ansi"] },
     { name = "matplotlib", specifier = ">=2.2.0" },
-    { name = "maturin", specifier = ">=1.2,<2.0" },
+    { name = "maturin", specifier = ">=1.13.1,<2.0" },
     { name = "mkdocs" },
     { name = "mkdocs-material" },
     { name = "mkdocstrings", extras = ["python"] },
@@ -2885,8 +2961,8 @@ dev = [
     { name = "polars", specifier = ">=1.0.0" },
     { name = "pre-commit" },
     { name = "ruff" },
-    { name = "setuptools", specifier = ">=62.6" },
-    { name = "wasmtime", specifier = ">=13.0" },
+    { name = "setuptools", specifier = ">=82.0.1" },
+    { name = "wasmtime", specifier = ">=43.0.0" },
 ]
 numpy-compat = [
     { name = "numpy", specifier = ">=1.15.0" },
@@ -3024,11 +3100,11 @@ wheels = [
 
 [[package]]
 name = "platformdirs"
-version = "4.9.4"
+version = "4.9.6"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/19/56/8d4c30c8a1d07013911a8fdbd8f89440ef9f08d07a1b50ab8ca8be5a20f9/platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934", size = 28737, upload-time = "2026-03-05T18:34:13.271Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/9f/4a/0883b8e3802965322523f0b200ecf33d31f10991d0401162f4b23c698b42/platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a", size = 29400, upload-time = "2026-04-09T00:04:10.812Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" },
+    { url = "https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917", size = 21348, upload-time = "2026-04-09T00:04:09.463Z" },
 ]
 
 [[package]]
@@ -3098,11 +3174,11 @@ wheels = [
 
 [[package]]
 name = "prometheus-client"
-version = "0.24.1"
+version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f0/58/a794d23feb6b00fc0c72787d7e87d872a6730dd9ed7c7b3e954637d8f280/prometheus_client-0.24.1.tar.gz", hash = "sha256:7e0ced7fbbd40f7b84962d5d2ab6f17ef88a72504dcf7c0b40737b43b2a461f9", size = 85616, upload-time = "2026-01-14T15:26:26.965Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1b/fb/d9aa83ffe43ce1f19e557c0971d04b90561b0cfd50762aafb01968285553/prometheus_client-0.25.0.tar.gz", hash = "sha256:5e373b75c31afb3c86f1a52fa1ad470c9aace18082d39ec0d2f918d11cc9ba28", size = 86035, upload-time = "2026-04-09T19:53:42.359Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/74/c3/24a2f845e3917201628ecaba4f18bab4d18a337834c1df2a159ee9d22a42/prometheus_client-0.24.1-py3-none-any.whl", hash = "sha256:150db128af71a5c2482b36e588fc8a6b95e498750da4b17065947c16070f4055", size = 64057, upload-time = "2026-01-14T15:26:24.42Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/9b/d4b1e644385499c8346fa9b622a3f030dce14cd6ef8a1871c221a17a67e7/prometheus_client-0.25.0-py3-none-any.whl", hash = "sha256:d5aec89e349a6ec230805d0df882f3807f74fd6c1a2fa86864e3c2279059fed1", size = 64154, upload-time = "2026-04-09T19:53:41.324Z" },
 ]
 
 [[package]]
@@ -3429,15 +3505,15 @@ wheels = [
 
 [[package]]
 name = "python-discovery"
-version = "1.2.1"
+version = "1.2.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "platformdirs" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b9/88/815e53084c5079a59df912825a279f41dd2e0df82281770eadc732f5352c/python_discovery-1.2.1.tar.gz", hash = "sha256:180c4d114bff1c32462537eac5d6a332b768242b76b69c0259c7d14b1b680c9e", size = 58457, upload-time = "2026-03-26T22:30:44.496Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/de/ef/3bae0e537cfe91e8431efcba4434463d2c5a65f5a89edd47c6cf2f03c55f/python_discovery-1.2.2.tar.gz", hash = "sha256:876e9c57139eb757cb5878cbdd9ae5379e5d96266c99ef731119e04fffe533bb", size = 58872, upload-time = "2026-04-07T17:28:49.249Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/67/0f/019d3949a40280f6193b62bc010177d4ce702d0fce424322286488569cd3/python_discovery-1.2.1-py3-none-any.whl", hash = "sha256:b6a957b24c1cd79252484d3566d1b49527581d46e789aaf43181005e56201502", size = 31674, upload-time = "2026-03-26T22:30:43.396Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/db/795879cc3ddfe338599bddea6388cc5100b088db0a4caf6e6c1af1c27e04/python_discovery-1.2.2-py3-none-any.whl", hash = "sha256:e1ae95d9af875e78f15e19aed0c6137ab1bb49c200f21f5061786490c9585c7a", size = 31894, upload-time = "2026-04-07T17:28:48.09Z" },
 ]
 
 [[package]]
@@ -3729,16 +3805,102 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" },
 ]
 
+[[package]]
+name = "pyzstd"
+version = "0.18.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/47/82/7bcafbf06ee83a66990ce5badbb8f4dc32184346bab20de7e468b1a2f6ec/pyzstd-0.18.0.tar.gz", hash = "sha256:81b6851ab1ca2e5f2c709e896a1362e3065a64f271f43db77fb7d5e4a78e9861", size = 806048, upload-time = "2025-10-05T08:19:47.994Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/92/a9f14aba213e39c5994e01343f4ec679fbecc303f60cd26a65b2687826ce/pyzstd-0.18.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:79bb84d866bf57ad2c4bc6b8247628b38e965c4f66288f887bf90f546a42ae04", size = 369087, upload-time = "2025-10-05T08:17:38.272Z" },
+    { url = "https://files.pythonhosted.org/packages/63/87/99688133d11bb2fb9fdfb13e8391742c34750927b2091f3a93bc98ccaf99/pyzstd-0.18.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0576c48e2f7a2c457538414a6197397c343b1bf5bfe9332b049afd0366c0c92", size = 295161, upload-time = "2025-10-05T08:17:40.377Z" },
+    { url = "https://files.pythonhosted.org/packages/04/d7/84e4ea7f2d429454ee1abc8bbccd6d7a288edbd014fd960f914af807fe06/pyzstd-0.18.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ea7702484795ee3c16c48a03d990123e833f1e1d6baabbe9a53256238eb04cbc", size = 408550, upload-time = "2025-10-05T08:17:41.565Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/2f/84dbb3fa0f87077450e795863cc408ee94fb24f85e76428f38a98902e25d/pyzstd-0.18.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c412ac29a9ebb76c8c40f2df146327b460ce184bbbdaa5bc9257317dce4caa8", size = 514924, upload-time = "2025-10-05T08:17:43.195Z" },
+    { url = "https://files.pythonhosted.org/packages/48/07/8a326a0a050bdba20ac75254f472fa0c305cd31518c8d05451ef4ad8c256/pyzstd-0.18.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:36baae4201196c2ec6567faf4a3f19c68211efc2fca30836c885b848ed057f66", size = 573393, upload-time = "2025-10-05T08:17:44.444Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/6c/bccc029d5c304d9315d8a11d606ee50c31b822e8bce1a7e2be300969e282/pyzstd-0.18.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f6d9c8a535af243c5a19f2d66c3733595ab633e00b97237d877e70e8389edc5", size = 428790, upload-time = "2025-10-05T08:17:45.666Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/a1/a21bb405bb405f516581529eccb1b057db391246af31f8755072ed08716a/pyzstd-0.18.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a533550740ce8c721aae27b377fb1160df68a9f457f16015ec8e47547a033dfc", size = 416564, upload-time = "2025-10-05T08:17:47.185Z" },
+    { url = "https://files.pythonhosted.org/packages/38/58/9ca3eb20d047f23db0e1d99cd81e6889d9d7e7b3dde3ddae8894f7e0f75a/pyzstd-0.18.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdd76049c8ccbb98276cfa78d807b4a497ec6bad2603361eceae993c6130e5bf", size = 519751, upload-time = "2025-10-05T08:17:48.745Z" },
+    { url = "https://files.pythonhosted.org/packages/13/7c/156bcc5f499eb6ca0eafba3f716cda01d8d5119270fab7ac5ca0519b0c3a/pyzstd-0.18.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:09b73fe07a8d81898ef1575cb3063816168abb3305c1a9f30110383b61a4ee92", size = 563494, upload-time = "2025-10-05T08:17:50.298Z" },
+    { url = "https://files.pythonhosted.org/packages/11/c3/511560f2043c88dee9a87a8db750d2b805e9be8054f35414e145acade401/pyzstd-0.18.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6baf9fd75d0af4f5d677b6e2d8dd3deb359c4ec2250c8536fe5ea48fd9305199", size = 433321, upload-time = "2025-10-05T08:17:51.46Z" },
+    { url = "https://files.pythonhosted.org/packages/11/1e/6d676a5403acbc422e79637c997235d5fc1a2d1fa234a9b22dd4c240fe29/pyzstd-0.18.0-cp310-cp310-win32.whl", hash = "sha256:c0634ab42226d2ad96c94d57fd242df2ca9417350c2969eb97c8c61d9574ba69", size = 220852, upload-time = "2025-10-05T08:17:53.001Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/b5/b1389a474d04c59dc50b8546032e5eaac67718025db9efe8dabe04d5b6f9/pyzstd-0.18.0-cp310-cp310-win_amd64.whl", hash = "sha256:ec99569321a99b9868666c85a5846151f9a16b6a222b59b2570e2ddeefd4d80c", size = 249552, upload-time = "2025-10-05T08:17:54.463Z" },
+    { url = "https://files.pythonhosted.org/packages/13/2d/6a6bcee3f94a44bda87a3fc8619496ab02736979e373f2dc74d0d9b083fb/pyzstd-0.18.0-cp310-cp310-win_arm64.whl", hash = "sha256:85371149cc1d8168461981084438b9f2f139c1699e989fef44562f7504ba0632", size = 222621, upload-time = "2025-10-05T08:17:55.573Z" },
+    { url = "https://files.pythonhosted.org/packages/63/d5/c81a3b2b2ddfd534552649344f7f9dc48c05c48b0b3e4065eb12209d37b0/pyzstd-0.18.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:848914835a8a984d4c5fad2355dc66f0aca979b35ec22753c9e694be8e98403c", size = 369088, upload-time = "2025-10-05T08:17:57.063Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/00/9c9b0afba9a9bad95cd1bc16203058296a3c2b28040bb0decccd8d662c10/pyzstd-0.18.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3938fea87fe83113b5d8ec2925bb265b4c540e374bb0ec73e5528de58d68c393", size = 295160, upload-time = "2025-10-05T08:17:58.714Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/f4/d71aafa852232dec6ec0d34a45b12136958042079703e60314e4a48bab5d/pyzstd-0.18.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9af4bcde7dde46ca7e82a4c6f5fda1760bcbfd15525dbea36fe625263ef06b5e", size = 408553, upload-time = "2025-10-05T08:17:59.889Z" },
+    { url = "https://files.pythonhosted.org/packages/27/5b/6af21645f1eb9e22416297dd1bc92615b89f91e5a657d37c528872a566ad/pyzstd-0.18.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:15d9419d173d26de25342235256aba363190e48e3fd8a8988420a26221b45320", size = 514931, upload-time = "2025-10-05T08:18:01.208Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/19/38c3440b82b227d3c99f1e4283584903633bba7e5a8bf757ffce3e913efb/pyzstd-0.18.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0b84f75f0494087afad31363e80a3463d1f32a0a6265f1a24660e6422b2b6fa6", size = 573396, upload-time = "2025-10-05T08:18:02.48Z" },
+    { url = "https://files.pythonhosted.org/packages/36/b9/7ee22f141b0438c2082cde550ced698ee2490ce5c0acd461d7f8fc2db18a/pyzstd-0.18.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cfcdf0e46020bda2e98814464ca3ae830da83937c4c61776bf8835c7094214e", size = 428794, upload-time = "2025-10-05T08:18:04.063Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/8f/b6a16829534fa3eac8ef40cc95dbf44a0c2b36a366829b49486a133e5d6b/pyzstd-0.18.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8551b6bc3690fb76e730967a628b6aab0d9331c38a41f5cddb546be994771191", size = 416581, upload-time = "2025-10-05T08:18:05.357Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/dc/601f56488064483358e90005169eecdec33fdb41d10fef825ca8cea6f1ea/pyzstd-0.18.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6883b47a4d5d5489890e24e74ef14c1f16dcd68bb326b86911ae0e254e33e4b7", size = 519760, upload-time = "2025-10-05T08:18:06.614Z" },
+    { url = "https://files.pythonhosted.org/packages/70/d9/98328d76d9ce3bf5479af62dfe2f141250c60e210cd9f1a9ed9e14ebc978/pyzstd-0.18.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929dec930296362ce03fee81877fa93a68ca4de3af75fdfa96ecbe0e366b2ee3", size = 563496, upload-time = "2025-10-05T08:18:08.344Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/80/ba2cb4769035014a4d272c06414404b9652bbfc14562ab59a9b8ffd9b1eb/pyzstd-0.18.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:278c80fdeaf857b620295cc815a31f6478fcb217d476ac889985a43b2b67e9bd", size = 433325, upload-time = "2025-10-05T08:18:10.11Z" },
+    { url = "https://files.pythonhosted.org/packages/31/72/5303be53d439500067195c754ec31a8d9e8b2e95a6089ecd3f7354d9134e/pyzstd-0.18.0-cp311-cp311-win32.whl", hash = "sha256:0d1b678644894e49b5a448f02eebe0ac31bde6f51813168f5ff223d7212e1974", size = 220845, upload-time = "2025-10-05T08:18:11.706Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/4c/12f74c81d8e43e42338295347223bb37edf9d13a149690fd92e91a083664/pyzstd-0.18.0-cp311-cp311-win_amd64.whl", hash = "sha256:8285a464aed201b166bb0d2f4667485b61b607cf89f12943b1f21f7e84cb4550", size = 249578, upload-time = "2025-10-05T08:18:13.112Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/5f/64480ea0ead505f3d30dfa46de9e9f72d7eb0325612ace435f84fafca780/pyzstd-0.18.0-cp311-cp311-win_arm64.whl", hash = "sha256:942badf996589e5ab6cbdd0f7dd33f5dc2cd7ed0b65441c96b9a12ffa7700d51", size = 222617, upload-time = "2025-10-05T08:18:14.591Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/19/7c78cf4cedb812362bb77d0ad5c7e0fa843a344d5d5737a55dd1c1c2b987/pyzstd-0.18.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5eef13ee3e230e50c01b288d581664e8758f7b831271f6f32cfc29823a6ab365", size = 369768, upload-time = "2025-10-05T08:18:16.049Z" },
+    { url = "https://files.pythonhosted.org/packages/24/97/b01f76d7a9d7237d9b1dee94469d638575268635ba22c1d07acc23ee3ead/pyzstd-0.18.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f78d6ef80d2f355b5bc1a897e9aa58659e85170b3fa268f3211c4979c768264c", size = 295766, upload-time = "2025-10-05T08:18:17.234Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/ac/2f629bb68c545d2f65e070b69a4aad39418b874ad61715d5f12014a06a74/pyzstd-0.18.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:394175aeeb4e2255ff5340b32f6db79375b3ffb25514fe4c1439015a7f335ec2", size = 409439, upload-time = "2025-10-05T08:18:18.397Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/cc/b9bab32be36cf4552940c5bda281e1480b01cbf12b5852b529f69a6f33e8/pyzstd-0.18.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3250c551f526d3b966cf4a2199a8d9538dc5c7083b7a26a45f305f8f2ab20a06", size = 516327, upload-time = "2025-10-05T08:18:19.597Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/f8/c208c74ba04fa5fcef421fd728e906cd1570f8bffb77285c8f9418892bd4/pyzstd-0.18.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a99ca80053ca37be21f05f6c4152c70777e0eface72b08277cb4b10b6d286e79", size = 574893, upload-time = "2025-10-05T08:18:20.824Z" },
+    { url = "https://files.pythonhosted.org/packages/57/fa/b049d9688f46f8dd3509f5cb0a8ba629716bb9238161c119d8d06454412d/pyzstd-0.18.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc4488536e87ff0aac698b9cd65f2913ac87417b3952d80be32463c8e95cc35", size = 429851, upload-time = "2025-10-05T08:18:22.12Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/35/dff064fa704a0f335aab326318651af81aac611a8a24e7f1be3b9fb50d32/pyzstd-0.18.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c12da158f6ec1180be0a3d6f531050dfc1357a25e5d0fd8dd99d4506d2a3f448", size = 417437, upload-time = "2025-10-05T08:18:23.264Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/2d/d1731fd1213da3a41dcba84e297118c09ed6f05ff0f600c5b0a42edb0f62/pyzstd-0.18.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f9a7d6bff36dfbe87dce1730e4b70d6ab49058a6f8ea22e85b33642491a2d053", size = 521302, upload-time = "2025-10-05T08:18:24.446Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/89/0a50c7b3f71921a7df41a222283b56fbe236dc79dc142969705676a9eca1/pyzstd-0.18.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0f56086bf8019f7c809a406dcc182ce0fb0d3623a9edf351ed80dbb484514613", size = 564793, upload-time = "2025-10-05T08:18:26.122Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/8c/b1927786cd208a1271728969ba45881a823663b6621e2905c1029ad31a15/pyzstd-0.18.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1eb69217ad9b760537e93f2d578c7927b788a9cac0e2104e536855a2797b5b09", size = 434400, upload-time = "2025-10-05T08:18:27.668Z" },
+    { url = "https://files.pythonhosted.org/packages/66/f2/9780dc79d4bc9b6f5c64fe91541bf4a15dda9a9bf5eaf4a641436a50b2d0/pyzstd-0.18.0-cp312-cp312-win32.whl", hash = "sha256:05ce49412c7aef970e0a6be8e9add4748bc474a7f13533a14555642022f871e9", size = 221172, upload-time = "2025-10-05T08:18:29.464Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/d0/1b5c6e7bbe8e300159edd47556233a7756f59ee6d78b83e95e258e1ceaf9/pyzstd-0.18.0-cp312-cp312-win_amd64.whl", hash = "sha256:e951c3013b9df479cff758d578b83837b2531d02fb6c3e59166a756795697e19", size = 249751, upload-time = "2025-10-05T08:18:30.642Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/c2/7b64e5c2afd752fa18bc5a5c6e260dad14402e6f94d810f8ce2ad64ad512/pyzstd-0.18.0-cp312-cp312-win_arm64.whl", hash = "sha256:33b54781c66a86e33c93c89ae426811d0aa35a216a23116fc5d5162449284305", size = 222551, upload-time = "2025-10-05T08:18:31.774Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/55/9e200f8ad193bb9d060cd724a8f876e40be918eece578c46c33cab336cf4/pyzstd-0.18.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:65117997d1e10e9b41336c90c2c4877c8d27533f753272805ff39df15fd5298a", size = 369781, upload-time = "2025-10-05T08:18:33.37Z" },
+    { url = "https://files.pythonhosted.org/packages/64/73/28bc86e284ca5e5ea082641748f2a583c9db8744dbf2c28d54e85e101114/pyzstd-0.18.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8550efbfb5944343666d0e79d6a3687adcbeb4dbf17aa743146a25e72d12d47f", size = 295771, upload-time = "2025-10-05T08:18:34.665Z" },
+    { url = "https://files.pythonhosted.org/packages/98/3d/bb6f1316a14bd4079aa3cad13cfd7cd3a1feef4a5756bdb7a9e8758a5ab1/pyzstd-0.18.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac61854c4a77df66695540549a89f4c67039e4181a9158b8646425f1d56d947a", size = 411244, upload-time = "2025-10-05T08:18:35.879Z" },
+    { url = "https://files.pythonhosted.org/packages/be/ee/32fdd020003a3bc672b93ec1f9c2520a196bc57e6847a7ff38991dc7b4ea/pyzstd-0.18.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4c453369483f67480f86d67a7b63ef22827db65e7f0d4bec7992bb81751a94b9", size = 518289, upload-time = "2025-10-05T08:18:37.657Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/c2/0564465315ba5d3524f8bc625af2c125d4549a186bda6fa3b6eafa88d8f4/pyzstd-0.18.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ef4b757b2df808ac15058fc2aa41e07d93843ee5a95629ff51eb6e8f1950951", size = 577193, upload-time = "2025-10-05T08:18:38.891Z" },
+    { url = "https://files.pythonhosted.org/packages/de/16/8540b868a5b0fa9eaf743856f86088e3cac8477fbf2339e76c5c973622cc/pyzstd-0.18.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b42529770febd331e23c5e8a68e9899acb0cc0806ee4c970354806c0ceeec6c7", size = 429177, upload-time = "2025-10-05T08:18:40.11Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/b7/08ba7b145978b0b5c4d83e26dca8140d71516fed42f60b31a7f387c6c89f/pyzstd-0.18.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7f54d13c269cdc37d2f73c9b3e70c6d2bb168dec768a472d54c2ed830bb19fb9", size = 419281, upload-time = "2025-10-05T08:18:41.46Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/da/7f64baf48b060bdc071388c90fb6b32992a5827ab5810d6deced86219ec5/pyzstd-0.18.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e6686460ca4be536dca1b6f2f80055f383a78e92e68e03a14806428572c4fdba", size = 523202, upload-time = "2025-10-05T08:18:42.868Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/5b/f5227ebe3ba672ec1051d27546c1d1cb1fde4a84fe8c5fa8cd6bc1b35614/pyzstd-0.18.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:8da3978d7de9095cacc5089bd0c435ab84ebd127e0979cd31fa1b216111644af", size = 567497, upload-time = "2025-10-05T08:18:44.507Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/ca/d067850a8035efc68ccc0399f343e86c18111dcb9ddec2a4b8b2ebe421eb/pyzstd-0.18.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1ebc87e6e50547cff97e07c3fed9999d79b6327c9c4143c3049a7cfeacb2cdba", size = 433881, upload-time = "2025-10-05T08:18:45.761Z" },
+    { url = "https://files.pythonhosted.org/packages/64/26/6c91c0815556ab8f7ddd67e891009a4ba6481b905b84a361697f208b053a/pyzstd-0.18.0-cp313-cp313-win32.whl", hash = "sha256:2dd203f2534b16dea2761394fda4e0f3c465a5109ae6450bdaada67e6ac14a45", size = 221170, upload-time = "2025-10-05T08:18:46.986Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/8e/3f6f445ee23864988ae3cfd6cebd3fe79dd42dbaa50538275928082e3df8/pyzstd-0.18.0-cp313-cp313-win_amd64.whl", hash = "sha256:98f43488f88b859291d6bdc51cc7793d1eab17aa9382b17d762944bbb8567c98", size = 249759, upload-time = "2025-10-05T08:18:48.526Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/43/1a96bd7bae073ae57c3927ece4cd72355ad46b7153961857d6a77a5bec9d/pyzstd-0.18.0-cp313-cp313-win_arm64.whl", hash = "sha256:cff8922e25e19d8fbd95b53f451e637bc80e826ab53c8777a885d4e99d1c0c2d", size = 222548, upload-time = "2025-10-05T08:18:50.163Z" },
+    { url = "https://files.pythonhosted.org/packages/11/37/d82e2bf7aad6df47a0914b47460a242802d6107cb63e3b0f0b02fab911b0/pyzstd-0.18.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:67f795ec745cfd6930cdaf5118fcdd8d87ce02b07b254d37efe75afd33ce9917", size = 369755, upload-time = "2025-10-05T08:18:51.353Z" },
+    { url = "https://files.pythonhosted.org/packages/90/1f/b01eb168e7dd689a0df6dcdd5e8171953927330e3b40e290c7a227714e50/pyzstd-0.18.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a8a589673b9b417a084e393f18d09a16b67b87a80f80da6d3b4f84dd983c9b3d", size = 295789, upload-time = "2025-10-05T08:18:53.065Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/70/581b01d765833eb12f8b04e048d4d5f803007e491c9ecb57bc1ca810317f/pyzstd-0.18.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fdaee8c33f96a6568225e821e6cc33045917628ae0bc7d8d3855332085c1aa7c", size = 411367, upload-time = "2025-10-05T08:18:54.295Z" },
+    { url = "https://files.pythonhosted.org/packages/32/ff/9896a01d1eef51e352f6d9d8175f9ff13b394a8239f337c6e0ec930fe3ee/pyzstd-0.18.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:42bf45d8e835d7c9c0bef98ff703143a5129edf09ef6c3b757037cbf79eabcaa", size = 518386, upload-time = "2025-10-05T08:18:55.472Z" },
+    { url = "https://files.pythonhosted.org/packages/26/24/60cebe2101a3915535a480f77c0acc1e2b9d690377ef72e0237594def37e/pyzstd-0.18.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2f4dff2a15e2047baea9359d3a547dee80f61887f17e0f23190b4b932fd617e4", size = 577251, upload-time = "2025-10-05T08:18:56.789Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/d0/86beb2bf4b112c46eb39c24389561f979672f262dcb76f3b500c7ab47fb1/pyzstd-0.18.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ed87932d6c534fc8921f7d44a4dadb32881e10ebc68935175a2cba254f5cc83", size = 429102, upload-time = "2025-10-05T08:18:57.973Z" },
+    { url = "https://files.pythonhosted.org/packages/be/a4/1037092d5ff6fc11db687ae71413c51eef59b3fc6899a4777328eafcc7b3/pyzstd-0.18.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7d08a372b2b7fa1fd24217424e13d3d794e01299c43c8bd55f50934ef0785779", size = 419324, upload-time = "2025-10-05T08:18:59.297Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/97/13855620caa4f00262b819b0962dbf66e798f17523fd770edea5eb471e52/pyzstd-0.18.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:e8403108172e24622f51732a336a89fe32bf3842965e0dc677c65df3a562f3ad", size = 523380, upload-time = "2025-10-05T08:19:01.022Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/20/7f0b24d7da75637094191847df620e6a3de954742a2899c89c0f80fd9f71/pyzstd-0.18.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5604eeb7f00ec308b7e878dae92abfc4eee2e5d238765a62d4fadc0d57bbbff3", size = 567596, upload-time = "2025-10-05T08:19:02.408Z" },
+    { url = "https://files.pythonhosted.org/packages/58/87/389186d0ba732ba4f53dbf3dc82ac08a783e7ea2ab06de21c98ae4ae5a65/pyzstd-0.18.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d6b300c5240409f1e7ab9972ab2a880a1949447d8414dbc11d89c10bfcb31aa5", size = 433862, upload-time = "2025-10-05T08:19:05.258Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/6f/bedcdbbfecae9f253eaeaa3776a2876cbd4af66ce710d1e2492546debc61/pyzstd-0.18.0-cp314-cp314-win32.whl", hash = "sha256:83f4fe1409a59c45a5e6fccb4d451e1e3dd03a5fabebd2dd6ba651468f54025e", size = 225481, upload-time = "2025-10-05T08:19:06.525Z" },
+    { url = "https://files.pythonhosted.org/packages/df/04/85bc2d0d906762c84f6e2f2be7228c1f610b8d42dfbb0602b281ee14bd9d/pyzstd-0.18.0-cp314-cp314-win_amd64.whl", hash = "sha256:73c3dcd9a16f1669ed6eef0dad1d840b7dd6070ab7d48719171ca691101e7975", size = 255101, upload-time = "2025-10-05T08:19:07.807Z" },
+    { url = "https://files.pythonhosted.org/packages/64/5c/19e973213762479698dd551b39e7841fcc16104f492fdf941daa42637633/pyzstd-0.18.0-cp314-cp314-win_arm64.whl", hash = "sha256:61333bbb337b9746284624ed14f6238838dfae1e395691ba49f227015374f760", size = 229465, upload-time = "2025-10-05T08:19:09.084Z" },
+    { url = "https://files.pythonhosted.org/packages/59/79/10e69fe241968e1993b6ea4c09c4f4cb5783a7f639d680958a9835310ed4/pyzstd-0.18.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:35934369fcdfde6fb932f88fa441337c8ddaf4b08e7b0b12952010f0ba2082f7", size = 354239, upload-time = "2025-10-05T08:19:28.84Z" },
+    { url = "https://files.pythonhosted.org/packages/73/a2/f0e6b6a816693d37840342d1e6a81fa1504c9c0c825f504f80c2ebc9dd91/pyzstd-0.18.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:55b8e12c9657359a697440e88a8535d1a771025e5d8f1c3087ad69ba11bee6d2", size = 283761, upload-time = "2025-10-05T08:19:30.494Z" },
+    { url = "https://files.pythonhosted.org/packages/99/c3/3f22f0ded2cd19e506eee9ea270854919d4d2df5006fbb5faef3d561b0f6/pyzstd-0.18.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:134d33d3e56b5083c8f827b63254c2abf85d6ace2b323e69d28e3954b5b71883", size = 338432, upload-time = "2025-10-05T08:19:31.83Z" },
+    { url = "https://files.pythonhosted.org/packages/22/62/ac46ca28d6dd56e22207a86e1315b5b2a2c1c1d23b08e1079c07ebbff1e7/pyzstd-0.18.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6c4bffa0157ef9e5cfa32413a5a79448e5affadece4982df274f1b5aae3a680", size = 358420, upload-time = "2025-10-05T08:19:33.059Z" },
+    { url = "https://files.pythonhosted.org/packages/38/ce/ec54eeca984a67d4785f0d1fad5179de1781a38adee80a7ecb51a1c5edf9/pyzstd-0.18.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8c36824d94cf77997a899b60886cc2be3ac969083f1d74eb4dd4127234ba50a4", size = 244901, upload-time = "2025-10-05T08:19:34.321Z" },
+    { url = "https://files.pythonhosted.org/packages/25/cd/2dafdf61d6d092cc331aa5310ba34a42fa528ae3f3ab0e505b6b5728a56a/pyzstd-0.18.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:788e0889db436cd6d16a3b490006ab80a913d8ce6f46db127f1888066ff4560b", size = 354192, upload-time = "2025-10-05T08:19:35.549Z" },
+    { url = "https://files.pythonhosted.org/packages/af/ed/89a2c9144da69dcf8113fa2f3b6d234a17e19ab37f599cc2270bb1ee394e/pyzstd-0.18.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5e70b7c36a40d7f946bf6391a206374b057299735d366fad6524d3b9f392441f", size = 283718, upload-time = "2025-10-05T08:19:36.819Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/b1/179601c6972cea615af936de9ba8225be9d53c2cfd0a2a7ceacf32a22857/pyzstd-0.18.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:571c5f71622943387370f76de8cc0de3d5c6217ab0f38386cb127665e4e09275", size = 338432, upload-time = "2025-10-05T08:19:38.073Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/57/067cf55f86a4baf8cd9e91f7e9596038cabc2b296ac3e0f9be14310c435f/pyzstd-0.18.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de0b730f374b583894d58b79cff76569540baf1e84bc493be191d3128b58e559", size = 358419, upload-time = "2025-10-05T08:19:39.34Z" },
+    { url = "https://files.pythonhosted.org/packages/49/88/53d1ec8c639305fb96944b3a1e7f60b6e6af80781d970036c3cf2d6d2316/pyzstd-0.18.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b32184013f33dba2fabcdda89f2a83289f5b717a0c2477cda764e53fdafec7ee", size = 244902, upload-time = "2025-10-05T08:19:40.607Z" },
+]
+
 [[package]]
 name = "qir-qis"
-version = "0.1.4"
+version = "0.1.3"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/43/98/0ff34bb2016bf67d4ec708067df4c083bba41c6b616334866b19027989fc/qir_qis-0.1.4-cp310-abi3-macosx_15_0_arm64.whl", hash = "sha256:5889f93960216f1f66f7e05d37873edcbf21a1f5c6f24cdb667a16cc90bef202", size = 21862662, upload-time = "2026-03-17T20:02:50.094Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/07/c40232233d5d831927020d1f634ca815cc172726aa7635cd245810c05f19/qir_qis-0.1.4-cp310-abi3-macosx_15_0_x86_64.whl", hash = "sha256:373c5c258605648bee53b00fc0f1acda9c40543242b93ce25982eba5f034e136", size = 23224260, upload-time = "2026-03-17T20:02:52.895Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/69/1ee1d966b4703988c066d4bfabc56e338ac285dc61654198110310882ba8/qir_qis-0.1.4-cp310-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:a7bdd4cd5c98c42ffbbb790279708fd871c09b5f52cfee36a25aaed608eedcc1", size = 28695899, upload-time = "2026-03-17T20:02:55.753Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/ad/f89d038322062fa35881932ab0828ce73db7ea06aa54df356c411928ed02/qir_qis-0.1.4-cp310-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:390b5abc1be0154f9b623e75528ffc9cbf41aac4cd6173bfec6088db50777d8e", size = 26034079, upload-time = "2026-03-17T20:02:58.376Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/81/f25de0a11e190dce0c7e5e53785fc998ead74c9666d87272cc37c1bb0ef4/qir_qis-0.1.4-cp310-abi3-win_amd64.whl", hash = "sha256:b596b3f6c644c6d519c97747d18c6df599098a6ea18b1545e295deaf6f756e71", size = 20328023, upload-time = "2026-03-17T20:03:01.094Z" },
+    { url = "https://files.pythonhosted.org/packages/52/c9/24906128a455d2de1e08ad05b6de7a0b25002e1cc1db941b7ad4a9314f6e/qir_qis-0.1.3-cp310-abi3-macosx_13_0_arm64.whl", hash = "sha256:e1704efcafea5983d686b8658f4c8dff9110229af6f47bd2d5b5213a7256aeb3", size = 15959593, upload-time = "2026-02-24T22:56:11.581Z" },
+    { url = "https://files.pythonhosted.org/packages/65/02/bd01b83fe4a811d1e2e0c20ccd49e92289e561a74480cafdfc7c00ef98f1/qir_qis-0.1.3-cp310-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9a0e488bdd4015330602645aa77002f9f970764ba4ccb7b8548490aa7c3de5ed", size = 17550477, upload-time = "2026-02-24T22:56:14.353Z" },
+    { url = "https://files.pythonhosted.org/packages/81/d0/817ee7e71154d79be5e7f0c6fda45f925261b22b0b5abaf3d9932366f1ec/qir_qis-0.1.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eff1fc1bc282e33707658c65ca483bc9f5558c618f746920004d74dca9ba48c6", size = 17527772, upload-time = "2026-02-24T22:56:16.882Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/18/43aaac65f8d6637db0dc18ce6fa7e5458f7924dae9a1b22b9ec84b985bcb/qir_qis-0.1.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5864b1165a08270a327b80ccb5c02b28544e6e40860a1ca3ec4976b99183f7e8", size = 18805051, upload-time = "2026-02-24T22:56:19.111Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/16/25aa2d6ac5dba9d052c4b58049452d0174f4b2a049e6a38e4540c4a72a46/qir_qis-0.1.3-cp310-abi3-win_amd64.whl", hash = "sha256:855bc462e4f31d0dc05cba063f7632610d16a1c66e40ef99ace215355ce76faa", size = 15688465, upload-time = "2026-02-24T22:56:21.464Z" },
 ]
 
 [[package]]
@@ -3875,15 +4037,15 @@ wheels = [
 
 [[package]]
 name = "rich"
-version = "14.3.3"
+version = "15.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "markdown-it-py" },
     { name = "pygments" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" },
+    { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" },
 ]
 
 [[package]]
@@ -4010,27 +4172,27 @@ wheels = [
 
 [[package]]
 name = "ruff"
-version = "0.15.9"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e6/97/e9f1ca355108ef7194e38c812ef40ba98c7208f47b13ad78d023caa583da/ruff-0.15.9.tar.gz", hash = "sha256:29cbb1255a9797903f6dde5ba0188c707907ff44a9006eb273b5a17bfa0739a2", size = 4617361, upload-time = "2026-04-02T18:17:20.829Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/0b/1f/9cdfd0ac4b9d1e5a6cf09bedabdf0b56306ab5e333c85c87281273e7b041/ruff-0.15.9-py3-none-linux_armv6l.whl", hash = "sha256:6efbe303983441c51975c243e26dff328aca11f94b70992f35b093c2e71801e1", size = 10511206, upload-time = "2026-04-02T18:16:41.574Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/f6/32bfe3e9c136b35f02e489778d94384118bb80fd92c6d92e7ccd97db12ce/ruff-0.15.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:4965bac6ac9ea86772f4e23587746f0b7a395eccabb823eb8bfacc3fa06069f7", size = 10923307, upload-time = "2026-04-02T18:17:08.645Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/25/de55f52ab5535d12e7aaba1de37a84be6179fb20bddcbe71ec091b4a3243/ruff-0.15.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf05aad70ca5b5a0a4b0e080df3a6b699803916d88f006efd1f5b46302daab8", size = 10316722, upload-time = "2026-04-02T18:16:44.206Z" },
-    { url = "https://files.pythonhosted.org/packages/48/11/690d75f3fd6278fe55fff7c9eb429c92d207e14b25d1cae4064a32677029/ruff-0.15.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9439a342adb8725f32f92732e2bafb6d5246bd7a5021101166b223d312e8fc59", size = 10623674, upload-time = "2026-04-02T18:16:50.951Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/ec/176f6987be248fc5404199255522f57af1b4a5a1b57727e942479fec98ad/ruff-0.15.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9c5e6faf9d97c8edc43877c3f406f47446fc48c40e1442d58cfcdaba2acea745", size = 10351516, upload-time = "2026-04-02T18:16:57.206Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/fc/51cffbd2b3f240accc380171d51446a32aa2ea43a40d4a45ada67368fbd2/ruff-0.15.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b34a9766aeec27a222373d0b055722900fbc0582b24f39661aa96f3fe6ad901", size = 11150202, upload-time = "2026-04-02T18:17:06.452Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/d4/25292a6dfc125f6b6528fe6af31f5e996e19bf73ca8e3ce6eb7fa5b95885/ruff-0.15.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89dd695bc72ae76ff484ae54b7e8b0f6b50f49046e198355e44ea656e521fef9", size = 11988891, upload-time = "2026-04-02T18:17:18.575Z" },
-    { url = "https://files.pythonhosted.org/packages/13/e1/1eebcb885c10e19f969dcb93d8413dfee8172578709d7ee933640f5e7147/ruff-0.15.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce187224ef1de1bd225bc9a152ac7102a6171107f026e81f317e4257052916d5", size = 11480576, upload-time = "2026-04-02T18:16:52.986Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/6b/a1548ac378a78332a4c3dcf4a134c2475a36d2a22ddfa272acd574140b50/ruff-0.15.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b0c7c341f68adb01c488c3b7d4b49aa8ea97409eae6462d860a79cf55f431b6", size = 11254525, upload-time = "2026-04-02T18:17:02.041Z" },
-    { url = "https://files.pythonhosted.org/packages/42/aa/4bb3af8e61acd9b1281db2ab77e8b2c3c5e5599bf2a29d4a942f1c62b8d6/ruff-0.15.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:55cc15eee27dc0eebdfcb0d185a6153420efbedc15eb1d38fe5e685657b0f840", size = 11204072, upload-time = "2026-04-02T18:17:13.581Z" },
-    { url = "https://files.pythonhosted.org/packages/69/48/d550dc2aa6e423ea0bcc1d0ff0699325ffe8a811e2dba156bd80750b86dc/ruff-0.15.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a6537f6eed5cda688c81073d46ffdfb962a5f29ecb6f7e770b2dc920598997ed", size = 10594998, upload-time = "2026-04-02T18:16:46.369Z" },
-    { url = "https://files.pythonhosted.org/packages/63/47/321167e17f5344ed5ec6b0aa2cff64efef5f9e985af8f5622cfa6536043f/ruff-0.15.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:6d3fcbca7388b066139c523bda744c822258ebdcfbba7d24410c3f454cc9af71", size = 10359769, upload-time = "2026-04-02T18:17:10.994Z" },
-    { url = "https://files.pythonhosted.org/packages/67/5e/074f00b9785d1d2c6f8c22a21e023d0c2c1817838cfca4c8243200a1fa87/ruff-0.15.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:058d8e99e1bfe79d8a0def0b481c56059ee6716214f7e425d8e737e412d69677", size = 10850236, upload-time = "2026-04-02T18:16:48.749Z" },
-    { url = "https://files.pythonhosted.org/packages/76/37/804c4135a2a2caf042925d30d5f68181bdbd4461fd0d7739da28305df593/ruff-0.15.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:8e1ddb11dbd61d5983fa2d7d6370ef3eb210951e443cace19594c01c72abab4c", size = 11358343, upload-time = "2026-04-02T18:16:55.068Z" },
-    { url = "https://files.pythonhosted.org/packages/88/3d/1364fcde8656962782aa9ea93c92d98682b1ecec2f184e625a965ad3b4a6/ruff-0.15.9-py3-none-win32.whl", hash = "sha256:bde6ff36eaf72b700f32b7196088970bf8fdb2b917b7accd8c371bfc0fd573ec", size = 10583382, upload-time = "2026-04-02T18:17:04.261Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/56/5c7084299bd2cacaa07ae63a91c6f4ba66edc08bf28f356b24f6b717c799/ruff-0.15.9-py3-none-win_amd64.whl", hash = "sha256:45a70921b80e1c10cf0b734ef09421f71b5aa11d27404edc89d7e8a69505e43d", size = 11744969, upload-time = "2026-04-02T18:16:59.611Z" },
-    { url = "https://files.pythonhosted.org/packages/03/36/76704c4f312257d6dbaae3c959add2a622f63fcca9d864659ce6d8d97d3d/ruff-0.15.9-py3-none-win_arm64.whl", hash = "sha256:0694e601c028fd97dc5c6ee244675bc241aeefced7ef80cd9c6935a871078f53", size = 11005870, upload-time = "2026-04-02T18:17:15.773Z" },
+version = "0.15.10"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/d9/aa3f7d59a10ef6b14fe3431706f854dbf03c5976be614a9796d36326810c/ruff-0.15.10.tar.gz", hash = "sha256:d1f86e67ebfdef88e00faefa1552b5e510e1d35f3be7d423dc7e84e63788c94e", size = 4631728, upload-time = "2026-04-09T14:06:09.884Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/00/a1c2fdc9939b2c03691edbda290afcd297f1f389196172826b03d6b6a595/ruff-0.15.10-py3-none-linux_armv6l.whl", hash = "sha256:0744e31482f8f7d0d10a11fcbf897af272fefdfcb10f5af907b18c2813ff4d5f", size = 10563362, upload-time = "2026-04-09T14:06:21.189Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/15/006990029aea0bebe9d33c73c3e28c80c391ebdba408d1b08496f00d422d/ruff-0.15.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b1e7c16ea0ff5a53b7c2df52d947e685973049be1cdfe2b59a9c43601897b22e", size = 10951122, upload-time = "2026-04-09T14:06:02.236Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/c0/4ac978fe874d0618c7da647862afe697b281c2806f13ce904ad652fa87e4/ruff-0.15.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:93cc06a19e5155b4441dd72808fdf84290d84ad8a39ca3b0f994363ade4cebb1", size = 10314005, upload-time = "2026-04-09T14:06:00.026Z" },
+    { url = "https://files.pythonhosted.org/packages/da/73/c209138a5c98c0d321266372fc4e33ad43d506d7e5dd817dd89b60a8548f/ruff-0.15.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e1dd04312997c99ea6965df66a14fb4f03ba978564574ffc68b0d61fd3989e", size = 10643450, upload-time = "2026-04-09T14:05:42.137Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/76/0deec355d8ec10709653635b1f90856735302cb8e149acfdf6f82a5feb70/ruff-0.15.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8154d43684e4333360fedd11aaa40b1b08a4e37d8ffa9d95fee6fa5b37b6fab1", size = 10379597, upload-time = "2026-04-09T14:05:49.984Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/be/86bba8fc8798c081e28a4b3bb6d143ccad3fd5f6f024f02002b8f08a9fa3/ruff-0.15.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab88715f3a6deb6bde6c227f3a123410bec7b855c3ae331b4c006189e895cef", size = 11146645, upload-time = "2026-04-09T14:06:12.246Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/89/140025e65911b281c57be1d385ba1d932c2366ca88ae6663685aed8d4881/ruff-0.15.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a768ff5969b4f44c349d48edf4ab4f91eddb27fd9d77799598e130fb628aa158", size = 12030289, upload-time = "2026-04-09T14:06:04.776Z" },
+    { url = "https://files.pythonhosted.org/packages/88/de/ddacca9545a5e01332567db01d44bd8cf725f2db3b3d61a80550b48308ea/ruff-0.15.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ee3ef42dab7078bda5ff6a1bcba8539e9857deb447132ad5566a038674540d0", size = 11496266, upload-time = "2026-04-09T14:05:55.485Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/bb/7ddb00a83760ff4a83c4e2fc231fd63937cc7317c10c82f583302e0f6586/ruff-0.15.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51cb8cc943e891ba99989dd92d61e29b1d231e14811db9be6440ecf25d5c1609", size = 11256418, upload-time = "2026-04-09T14:05:57.69Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/8d/55de0d35aacf6cd50b6ee91ee0f291672080021896543776f4170fc5c454/ruff-0.15.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:e59c9bdc056a320fb9ea1700a8d591718b8faf78af065484e801258d3a76bc3f", size = 11288416, upload-time = "2026-04-09T14:05:44.695Z" },
+    { url = "https://files.pythonhosted.org/packages/68/cf/9438b1a27426ec46a80e0a718093c7f958ef72f43eb3111862949ead3cc1/ruff-0.15.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:136c00ca2f47b0018b073f28cb5c1506642a830ea941a60354b0e8bc8076b151", size = 10621053, upload-time = "2026-04-09T14:05:52.782Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/50/e29be6e2c135e9cd4cb15fbade49d6a2717e009dff3766dd080fcb82e251/ruff-0.15.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8b80a2f3c9c8a950d6237f2ca12b206bccff626139be9fa005f14feb881a1ae8", size = 10378302, upload-time = "2026-04-09T14:06:14.361Z" },
+    { url = "https://files.pythonhosted.org/packages/18/2f/e0b36a6f99c51bb89f3a30239bc7bf97e87a37ae80aa2d6542d6e5150364/ruff-0.15.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:e3e53c588164dc025b671c9df2462429d60357ea91af7e92e9d56c565a9f1b07", size = 10850074, upload-time = "2026-04-09T14:06:16.581Z" },
+    { url = "https://files.pythonhosted.org/packages/11/08/874da392558ce087a0f9b709dc6ec0d60cbc694c1c772dab8d5f31efe8cb/ruff-0.15.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b0c52744cf9f143a393e284125d2576140b68264a93c6716464e129a3e9adb48", size = 11358051, upload-time = "2026-04-09T14:06:18.948Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/46/602938f030adfa043e67112b73821024dc79f3ab4df5474c25fa4c1d2d14/ruff-0.15.10-py3-none-win32.whl", hash = "sha256:d4272e87e801e9a27a2e8df7b21011c909d9ddd82f4f3281d269b6ba19789ca5", size = 10588964, upload-time = "2026-04-09T14:06:07.14Z" },
+    { url = "https://files.pythonhosted.org/packages/25/b6/261225b875d7a13b33a6d02508c39c28450b2041bb01d0f7f1a83d569512/ruff-0.15.10-py3-none-win_amd64.whl", hash = "sha256:28cb32d53203242d403d819fd6983152489b12e4a3ae44993543d6fe62ab42ed", size = 11745044, upload-time = "2026-04-09T14:05:39.473Z" },
+    { url = "https://files.pythonhosted.org/packages/58/ed/dea90a65b7d9e69888890fb14c90d7f51bf0c1e82ad800aeb0160e4bacfd/ruff-0.15.10-py3-none-win_arm64.whl", hash = "sha256:601d1610a9e1f1c2165a4f561eeaa2e2ea1e97f3287c5aa258d3dab8b57c6188", size = 11035607, upload-time = "2026-04-09T14:05:47.593Z" },
 ]
 
 [[package]]
@@ -4038,7 +4200,8 @@ name = "scipy"
 version = "1.15.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version < '3.11'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -4097,9 +4260,12 @@ name = "scipy"
 version = "1.17.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14'",
-    "python_full_version >= '3.12' and python_full_version < '3.14'",
-    "python_full_version == '3.11.*'",
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
+    "(python_full_version >= '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and sys_platform != 'darwin')",
+    "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and python_full_version < '3.14' and sys_platform != 'darwin')",
+    "(python_full_version == '3.11.*' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and sys_platform != 'darwin')",
 ]
 dependencies = [
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -4170,11 +4336,13 @@ wheels = [
 
 [[package]]
 name = "selene-core"
-version = "0.2.6"
+version = "0.2.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "hugr" },
     { name = "lief" },
+    { name = "llvmlite", version = "0.45.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'darwin'" },
+    { name = "llvmlite", version = "0.47.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'x86_64' or sys_platform != 'darwin'" },
     { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pydot" },
@@ -4184,7 +4352,7 @@ dependencies = [
     { name = "ziglang" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ec/33/28002766e84074564e6b6d9c91912c7882d56cde18dfc324bb183f5b5813/selene_core-0.2.6-py3-none-any.whl", hash = "sha256:1ae54f599bab727673c42316ea5269dd0e9df874dac21f312c2db9c68aad5554", size = 29073, upload-time = "2026-03-02T17:18:28.663Z" },
+    { url = "https://files.pythonhosted.org/packages/df/e2/46d0a50c46e15ff604e643951f7b72ca99c3aaad549acbf3908c2ec754bd/selene_core-0.2.7-py3-none-any.whl", hash = "sha256:22f4ca6435eb328079ebfe1c73dc64506665e2d64ee6b44079e4246534ea7aa8", size = 30361, upload-time = "2026-04-10T20:55:12.363Z" },
 ]
 
 [[package]]
@@ -4201,7 +4369,7 @@ wheels = [
 
 [[package]]
 name = "selene-sim"
-version = "0.2.12"
+version = "0.2.13"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -4212,11 +4380,11 @@ dependencies = [
     { name = "tqdm" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c0/16/64621e391f1e375982b3da9bc92bc826f55be867fb8debb69feaeb47b4eb/selene_sim-0.2.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2b39acb614b087cc48685b8d98d5374f23dd4940eb67fcd83c8941d257f5de8c", size = 3867309, upload-time = "2026-03-02T17:43:29.205Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/55/68c6f521c614e7f457f336d79037149e85a7c46a87d507f3caa5f0956f0a/selene_sim-0.2.12-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:da17278b63c21615203857a313a8d9636a0759210378cf297b5dc35797480f68", size = 3968891, upload-time = "2026-03-02T17:43:31.207Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/fa/6169a52d9e2dd2e7c66a34d8b06cbd084947c359d592755e56bb2a128616/selene_sim-0.2.12-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:7b49ca939d2fef88715d6fd294babc636e8e9cf184e6cbcaf83f83fd00a08de7", size = 4331609, upload-time = "2026-03-02T17:43:33.44Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/06/b16d1f345e1b100f4575cbdecb6feb6d2760d361b091500c00e3a5c59149/selene_sim-0.2.12-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:c264b0184a2e1339fef23195353e6304e2963c2dec0c1766f5ea3407c7379b95", size = 4370694, upload-time = "2026-03-02T17:43:34.936Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/15/4e4e633c9cf61ab23a2f8f5149b8f842eb38952f3ec5cd57e71ea2be614d/selene_sim-0.2.12-py3-none-win_amd64.whl", hash = "sha256:d43ca8c72f6575d70795425c93cf8e45f9cf875348ea9c9af11a5b4a8b72aaf2", size = 2746664, upload-time = "2026-03-02T17:43:36.05Z" },
+    { url = "https://files.pythonhosted.org/packages/df/15/7f58330010c407dd5ebc57cbe5a2e72e14bbac380a6f68a78303bd3fe039/selene_sim-0.2.13-py3-none-macosx_11_0_arm64.whl", hash = "sha256:731947866ba0578782b4ef8511dc05b5226215d7f36e48699ab1a7ca01317fb4", size = 3675784, upload-time = "2026-04-10T21:33:57.549Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/e8/d19055240dac113e02abcd947f3215df5b989f75f75ad66ee722824b6cc8/selene_sim-0.2.13-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:677c634b157d374d188600b221c4ddf20a4a61861ec05c8c31feb06ccc59bf0b", size = 3810091, upload-time = "2026-04-10T21:33:59.246Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/c0/0d7fd02e43721185439dc5900885ad82be2e1e9e2b7a3ead0df62856b710/selene_sim-0.2.13-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:d8b9fe4e9480e48f549b5fdb11521946df28117578d57f404618108352eba6ff", size = 4096184, upload-time = "2026-04-10T21:34:01.333Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/fe/60c61a9116f6aa37766518f881c18afb630a61369326c88bf620c193810f/selene_sim-0.2.13-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:462aab4384e1a9c39b6e6b350c581bc97159cabb42b4107efb3083e2d5ad3b96", size = 4242866, upload-time = "2026-04-10T21:34:02.77Z" },
+    { url = "https://files.pythonhosted.org/packages/35/c5/5d326369f9695952f94dbf215f16b0e0164151e3b33062b64c952c0e896a/selene_sim-0.2.13-py3-none-win_amd64.whl", hash = "sha256:cad7fa64a91d2b0a2b90ee87ee4ee5b016cad3a9720755b6386f8ccb91fe954d", size = 8912980, upload-time = "2026-04-10T21:34:04.78Z" },
 ]
 
 [[package]]
@@ -4341,7 +4509,7 @@ version = "0.18.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ptyprocess", marker = "os_name != 'nt'" },
-    { name = "pywinpty", marker = "os_name == 'nt'" },
+    { name = "pywinpty", marker = "(os_name == 'nt' and platform_machine != 'x86_64') or (os_name == 'nt' and sys_platform != 'darwin')" },
     { name = "tornado" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/8a/11/965c6fd8e5cc254f1fe142d547387da17a8ebfd75a3455f637c663fb38a0/terminado-0.18.1.tar.gz", hash = "sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e", size = 32701, upload-time = "2024-03-12T14:34:39.026Z" }
@@ -4363,7 +4531,7 @@ wheels = [
 
 [[package]]
 name = "tket"
-version = "0.12.15"
+version = "0.12.13"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "hugr" },
@@ -4371,13 +4539,13 @@ dependencies = [
     { name = "tket-eccs" },
     { name = "tket-exts" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/43/5d/0d063d5b0f90fe197c68fcda5bee384606a368666abe961cd1227ae1afee/tket-0.12.15.tar.gz", hash = "sha256:1ce7c0877c510810ae91be9c0a7a1b638de8b7f5008f065939aa200ca7cde5a5", size = 464051, upload-time = "2026-01-16T16:40:41.838Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/9f/b80e8f9f93e4f6f5af27170a9484f36de1171c939a55b132ad08b43317e7/tket-0.12.13.tar.gz", hash = "sha256:d6b394f3b27e2d9e67438a007592d1c980bee19b0544a7735972aa162543adeb", size = 460359, upload-time = "2025-12-10T18:07:02.743Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f9/6f/1ca616db59681f08db1577eb7592ea5b2f3bcdeab423bea61cff30676a3c/tket-0.12.15-cp310-abi3-macosx_13_0_arm64.whl", hash = "sha256:a802b785a09c58a468cb0d7c605837eed328ac441e1ef9fade0913b3082a1a49", size = 10082145, upload-time = "2026-01-16T16:40:29.858Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/37/ba7e9d2b15c7d89ce33930640ac9f7fb86f2c1ffd8d63ba62e5819463c22/tket-0.12.15-cp310-abi3-macosx_15_0_x86_64.whl", hash = "sha256:8cd363d59e5d7bad90baf458bfc231db4e0a36329275e4c5cafb8052f3490c23", size = 11022474, upload-time = "2026-01-16T16:40:33.042Z" },
-    { url = "https://files.pythonhosted.org/packages/52/1a/ce41490837bb9e4177104ca6e4f7afa837fbb1a32468bc8b57a2b46361eb/tket-0.12.15-cp310-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:7fda0d91a4281a3a9825676f12c8b8ae466fa6e39fdf9a79b04b3305df34ccad", size = 13088827, upload-time = "2026-01-16T16:21:11.12Z" },
-    { url = "https://files.pythonhosted.org/packages/69/39/43dddc3e0143d56af058af612c863b5c3bb09823ed35d915f64a33d4597f/tket-0.12.15-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:730cd5f8103db95ba3387caa2588f1925dfb4473cf3cc383d16b4bea69c6b47f", size = 14678972, upload-time = "2026-01-16T16:40:36.576Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/12/7f9fdff4543bc28228b20bb359f2518ec0067b4ab08ddff1fe6b10e5b9eb/tket-0.12.15-cp310-abi3-win_amd64.whl", hash = "sha256:eac19fade07aa07a2967b3775ac65506e197ae1b8723daba8666b1cc27f6abef", size = 9866960, upload-time = "2026-01-16T16:40:39.837Z" },
+    { url = "https://files.pythonhosted.org/packages/09/30/a755e7b4365c4c89294f152e342734c3d4e9ad3e2ba5ba9b8bbd2f516b11/tket-0.12.13-cp310-abi3-macosx_13_0_arm64.whl", hash = "sha256:8a0ba6c6e7624ba66b908bba1ef88dc9f8165e147a12c69374c121a392e71acc", size = 10291588, upload-time = "2025-12-10T18:06:51.074Z" },
+    { url = "https://files.pythonhosted.org/packages/de/09/22f00a9c639e61619980008f8f50cbc6966c55790f309160915e139f54a3/tket-0.12.13-cp310-abi3-macosx_15_0_x86_64.whl", hash = "sha256:c7ec567d8d13fff896f06540c6f5ba61ef0cd2da2850a21b444850dc9b677de5", size = 11275557, upload-time = "2025-12-10T18:06:53.498Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/ee/a25579314ec8058d0e96b3e26d7f6862383d0bcf1d150302bb2be543f416/tket-0.12.13-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f60f2a412da68e34328b101b135f807334cd830e97e9b3992b62e708b14c8b1", size = 13376604, upload-time = "2025-12-10T18:06:55.756Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/57/bef91d6aba87ad78778a211255bfd64d9a942ad991b17c64f4bc577fd07c/tket-0.12.13-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:cfff346a679334a8f21035abe8867ef36335ffe28e0edbab965a667310294fd5", size = 14891058, upload-time = "2025-12-10T18:06:58.363Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/ae/da9b17cedf6f8080a0edec5654b687ae967aca4d6bd0d5f1806b625fbea7/tket-0.12.13-cp310-abi3-win_amd64.whl", hash = "sha256:fb9b5592f58462aaf8769fda6355350a935c0061a344d2bae0a5ed4a16ea907e", size = 10183807, upload-time = "2025-12-10T18:07:00.726Z" },
 ]
 
 [[package]]
@@ -4495,26 +4663,26 @@ wheels = [
 
 [[package]]
 name = "types-requests"
-version = "2.33.0.20260402"
+version = "2.33.0.20260408"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/c1/7b/a06527d20af1441d813360b8e0ce152a75b7d8e4aab7c7d0a156f405d7ec/types_requests-2.33.0.20260402.tar.gz", hash = "sha256:1bdd3ada9b869741c5c4b887d2c8b4e38284a1449751823b5ebbccba3eefd9da", size = 23851, upload-time = "2026-04-02T04:19:55.942Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/69/6a/749dc53a54a3f35842c1f8197b3ca6b54af6d7458a1bfc75f6629b6da666/types_requests-2.33.0.20260408.tar.gz", hash = "sha256:95b9a86376807a216b2fb412b47617b202091c3ea7c078f47cc358d5528ccb7b", size = 23882, upload-time = "2026-04-08T04:34:49.33Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/51/65/3853bb6bac5ae789dc7e28781154705c27859eccc8e46282c3f36780f5f5/types_requests-2.33.0.20260402-py3-none-any.whl", hash = "sha256:c98372d7124dd5d10af815ee25c013897592ff92af27b27e22c98984102c3254", size = 20739, upload-time = "2026-04-02T04:19:54.955Z" },
+    { url = "https://files.pythonhosted.org/packages/90/b8/78fd6c037de4788c040fdd323b3369804400351b7827473920f6c1d03c10/types_requests-2.33.0.20260408-py3-none-any.whl", hash = "sha256:81f31d5ea4acb39f03be7bc8bed569ba6d5a9c5d97e89f45ac43d819b68ca50f", size = 20739, upload-time = "2026-04-08T04:34:48.325Z" },
 ]
 
 [[package]]
 name = "types-tqdm"
-version = "4.67.3.20260402"
+version = "4.67.3.20260408"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "types-requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/54/42/e9e6688891d8db77b5795ec02b329524170892ff81bec63c4c4ca7425b30/types_tqdm-4.67.3.20260402.tar.gz", hash = "sha256:e0739f3bc5d1c801999a202f0537280aa1bc2e669c49f5be91bfb99376690624", size = 18077, upload-time = "2026-04-02T04:22:23.049Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/42/2e2968e68a694d3dac3a47aa0df06e46be1a6eef498e5bd15f4c54674eb9/types_tqdm-4.67.3.20260408.tar.gz", hash = "sha256:fd849a79891ae7136ed47541aface15c35bd9a13160fa8a93e42e10f60cf4c8d", size = 18119, upload-time = "2026-04-08T04:36:52.488Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4f/73/a6cf75de5be376d7b57ce6c934ae9bc90aa5be6ada4ac50a99ecbdf9763e/types_tqdm-4.67.3.20260402-py3-none-any.whl", hash = "sha256:b5d1a65fe3286e1a855e51ddebf63d3641daf9bad285afd1ec56808eb59df76e", size = 24562, upload-time = "2026-04-02T04:22:22.114Z" },
+    { url = "https://files.pythonhosted.org/packages/14/5d/7dedddc32ab7bc2344ece772b5e0f03ec63a1d47ad259696689713c1cf50/types_tqdm-4.67.3.20260408-py3-none-any.whl", hash = "sha256:3b9ed74ebef04df8f53d470ffdc84348e93496d8acafa08bf79fafce0f2f5b5d", size = 24561, upload-time = "2026-04-08T04:36:51.538Z" },
 ]
 
 [[package]]
@@ -4567,7 +4735,7 @@ wheels = [
 
 [[package]]
 name = "virtualenv"
-version = "21.2.0"
+version = "21.2.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "distlib" },
@@ -4576,28 +4744,28 @@ dependencies = [
     { name = "python-discovery" },
     { name = "typing-extensions", marker = "python_full_version < '3.11'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/aa/92/58199fe10049f9703c2666e809c4f686c54ef0a68b0f6afccf518c0b1eb9/virtualenv-21.2.0.tar.gz", hash = "sha256:1720dc3a62ef5b443092e3f499228599045d7fea4c79199770499df8becf9098", size = 5840618, upload-time = "2026-03-09T17:24:38.013Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/97/c5/aff062c66b42e2183201a7ace10c6b2e959a9a16525c8e8ca8e59410d27a/virtualenv-21.2.1.tar.gz", hash = "sha256:b66ffe81301766c0d5e2208fc3576652c59d44e7b731fc5f5ed701c9b537fa78", size = 5844770, upload-time = "2026-04-09T18:47:11.482Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c6/59/7d02447a55b2e55755011a647479041bc92a82e143f96a8195cb33bd0a1c/virtualenv-21.2.0-py3-none-any.whl", hash = "sha256:1bd755b504931164a5a496d217c014d098426cddc79363ad66ac78125f9d908f", size = 5825084, upload-time = "2026-03-09T17:24:35.378Z" },
+    { url = "https://files.pythonhosted.org/packages/20/0e/f083a76cb590e60dff3868779558eefefb8dfb7c9ed020babc7aa014ccbf/virtualenv-21.2.1-py3-none-any.whl", hash = "sha256:bd16b49c53562b28cf1a3ad2f36edb805ad71301dee70ddc449e5c88a9f919a2", size = 5828326, upload-time = "2026-04-09T18:47:09.331Z" },
 ]
 
 [[package]]
 name = "wasmtime"
-version = "42.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/54/cd/1f110419ed006f91624010f4df4da82490220bd5527650284c97fc758a6c/wasmtime-42.0.0.tar.gz", hash = "sha256:90485655d6e541b817a7baa1b3071b4525d03f76bcb6ad04661774f06a3b02d4", size = 117133, upload-time = "2026-02-24T19:12:53.321Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/20/cb/f206f7a839d6843b01c041000056bf7aad23cf72fe2333a0c5dad144e0f2/wasmtime-42.0.0-py3-none-android_26_arm64_v8a.whl", hash = "sha256:214e7d294ce1b5adb94f09a870a2ab6759173dc0194bdde74ee4492b477d8392", size = 6829706, upload-time = "2026-02-24T19:12:36.637Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/97/d4f5f46eef74e013c3a0caa9b8625bb1c4162e2b9817258596ee6932c019/wasmtime-42.0.0-py3-none-android_26_x86_64.whl", hash = "sha256:cdd9710fad242dde7cb0eacbe48bf902bb1bac6ecbecd3e743c31af463a795c6", size = 7699640, upload-time = "2026-02-24T19:12:38.471Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/d2/5b2bf901b0a9b8050d966dff61e353de7cd86dd58679a79e48372ff8b3a6/wasmtime-42.0.0-py3-none-any.whl", hash = "sha256:7a166bd262608806f3295343fcd07ee3e037f931f6d3b0a24ab1cfc7ccc3e8eb", size = 6403639, upload-time = "2026-02-24T19:12:39.777Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/6f/a40322bdd55809441bab7e1ac707aa38ced3572904a700f1dfb4b2520dcd/wasmtime-42.0.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:21e3dafd74704de0e7ed7668ab76cc5a9df130b4306befbfcb08ddb29673c784", size = 7483525, upload-time = "2026-02-24T19:12:41.422Z" },
-    { url = "https://files.pythonhosted.org/packages/47/04/ef61af9fe9e5c0a8d782c8662302535ee6e6dba1a6929191fa3ea371a491/wasmtime-42.0.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:411bf05df47c8a36c6b31b6012720ac1251b95fdd155e389b25eb6fbbd7e181c", size = 6493225, upload-time = "2026-02-24T19:12:42.9Z" },
-    { url = "https://files.pythonhosted.org/packages/44/54/a774313c19c1c0ae2c1897af697c12178904d67911f42c4a9bdddba68640/wasmtime-42.0.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:ca12269ee88aac6b1f64b5f324abf3c6370ff853338d991292f10cb17b906667", size = 7740997, upload-time = "2026-02-24T19:12:44.453Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/5d/fae28526b1d42f0365e4fd6c2a212c7c000e47d7320632018fa45735a06e/wasmtime-42.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:78f9353b9fdc2f6e7ed13e28ce0394533f5a62710b75c00434ac82681f738924", size = 6785820, upload-time = "2026-02-24T19:12:45.777Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/ae/5c5e96273a36c70753e8ba4db323dd9b1ccf6fcea4ccad99d458ad2ecf13/wasmtime-42.0.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ba317e879aab71c407e7012f4dc10b221c6daf737496c501005612e11d26e8ee", size = 6810021, upload-time = "2026-02-24T19:12:47.453Z" },
-    { url = "https://files.pythonhosted.org/packages/46/68/5c129389f67219a90c3ba0dcf85555249bde9797760f2d715bec03bc198a/wasmtime-42.0.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e9ef6dbd1a2cff21694ba64f27b90a7ab0af61a54d911a59682005830683dc8a", size = 7779984, upload-time = "2026-02-24T19:12:48.642Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/e5/6650c9e7ad904c9a6730c4b762b1dfed4f7d7b0e981e3624a6ecd7abb7ed/wasmtime-42.0.0-py3-none-win_amd64.whl", hash = "sha256:3a360a1285457021efe24369490cd719996596f2cbe1aa62dae6ad68179cf0f9", size = 6403647, upload-time = "2026-02-24T19:12:50.373Z" },
-    { url = "https://files.pythonhosted.org/packages/44/b2/e93046661deef4d8fee2f40080a28e5ff201cc98d4fb1929a46367c34778/wasmtime-42.0.0-py3-none-win_arm64.whl", hash = "sha256:8caa13a6ee264969449c008da1dcb8f9f6c954800853527714e7fcddbdda9166", size = 5397896, upload-time = "2026-02-24T19:12:51.639Z" },
+version = "43.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/0e/967542865d59d9529bab604b9b88f09a92636e69cc4b1d30c5013e854493/wasmtime-43.0.0.tar.gz", hash = "sha256:eb98b8e2bc35d03dd69c9dd095a388044323622526fc94a9406b8efc48ddc259", size = 117449, upload-time = "2026-03-31T19:26:23.663Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/a9/5e598c9ae8791375fa47b0dad377e0030dcd6da1be527a639670c5a3f9d6/wasmtime-43.0.0-py3-none-android_26_arm64_v8a.whl", hash = "sha256:c52d7bd47481958494b6ef9f0ed56d01ba6d7088cc9adbc1414be899b75bc04d", size = 6895231, upload-time = "2026-03-31T19:26:01.774Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/aa/ce764724dcede88f9010963ca7d70d0a79655174599ea85074cb2c656d59/wasmtime-43.0.0-py3-none-android_26_x86_64.whl", hash = "sha256:f65b287290f06751b2c87da3cdb2381b045ac93bc3ee0e3b805c2a6dc5327bc6", size = 7775074, upload-time = "2026-03-31T19:26:04.741Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/ca/67db17c3f098894be798457ce261816fb67c0c1b80c1a53ed1dfa8ed4ff1/wasmtime-43.0.0-py3-none-any.whl", hash = "sha256:9441349d9346230420ed24d357d6f8330fe7251ac5938bb892147728bbe731d7", size = 6472597, upload-time = "2026-03-31T19:26:06.61Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/87/b9727ac8ecf02d2bd9af838fe6004c028034ce3f38215a22f8e94705b83d/wasmtime-43.0.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:0ff3815f63122d2f59e58c626aad3c4592f1cabc0b6bd7dcc1edc3890eb46783", size = 7564987, upload-time = "2026-03-31T19:26:08.492Z" },
+    { url = "https://files.pythonhosted.org/packages/08/42/d9588fa6dad9a609e5acaa72d1d5b346b2913f87c2e95d0c7ddadf5e919b/wasmtime-43.0.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5a03c7aa03519df58fed5115ad8093d6deac46386115add715e725448e89ab25", size = 6615055, upload-time = "2026-03-31T19:26:10.506Z" },
+    { url = "https://files.pythonhosted.org/packages/48/a9/25b27545ad916a169583dbea41a6a03c58fe04c1d05fa39797dc43bd50b9/wasmtime-43.0.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:341542e87caf1f2ef7ff648a78827fcef5751e3e9be2ee07a1fcf3a04413c213", size = 7819110, upload-time = "2026-03-31T19:26:12.335Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/9a/4d8760f827931b5b265b83e52316d40b8e0eb999bb8e2d457c2ae172d5cc/wasmtime-43.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:30b042fd4a05d0f8a320baed53fcb971aff8a3789ed6967f4521f87931ace717", size = 6910375, upload-time = "2026-03-31T19:26:14.207Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/19/81c748c089a693b102f9a6239f2558a0ffd55fc721fcdd139361aaede1a1/wasmtime-43.0.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:34ff18384ad62625cb1438fd0266f6c74b4a72ddcb8ba30c60a66be3632db44b", size = 6938286, upload-time = "2026-03-31T19:26:15.898Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/fa/c37e77c907567a8802696f9ab839b719ea811cf3d59ffc815cc95d894339/wasmtime-43.0.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c7025d477d807df30dad07c9318ea747c6cfc99764c7cb2a8e44e75b8c43e3be", size = 7852033, upload-time = "2026-03-31T19:26:17.915Z" },
+    { url = "https://files.pythonhosted.org/packages/69/67/57c7e361049554cdedd9253e732a6eace5c643488a0e3886ac3f471a4be7/wasmtime-43.0.0-py3-none-win_amd64.whl", hash = "sha256:7e6b0d0641d78012bdf7d3622ca4bc969462dcf1d0a6c147dc5d7aae2f5093a9", size = 6472603, upload-time = "2026-03-31T19:26:19.724Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/27/8ecf7dbbb16dc3ab32fcb205f4d798e77cab264118bc1ac52145a76e38fb/wasmtime-43.0.0-py3-none-win_arm64.whl", hash = "sha256:5ddb2ba4b354fc4f055c8ce9285e7bc4cb259c339e5834bb4d0739d644042b8e", size = 5455362, upload-time = "2026-03-31T19:26:21.746Z" },
 ]
 
 [[package]]