QuEST/quest/src/gpu/gpu_subroutines.cpp at main · InvictusWingsSRL/QuEST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/** @file
 * CUDA GPU-accelerated definitions of the subroutines called by
 * accelerator.cpp. This file is always compiled, even when GPU
 * acceleration is disabled and when parsed by a non-CUDA compiler,
 * and so uses precompiler guards to disable CUDA-only code.
 * This file contains host definitions and associated memory and
 * thread management, and invokes custom kernels defined in
 * gpu_kernels.cuh which is never parsed by non-CUDA compilers.
 * This file also invokes Thrust and cuQuantum routines, defined in
 * gpu_thrust.cuh and gpu_cuquantum.cuh respectively, which are
 * also never parsed by non-CUDA compilers.
 *
 * Note that some custom kernels are templated in order to apply
 * compile-time optimisations like automatic loop unrolling. So
 * too are their calling host definitions in this file, which are
 * called by accelerator.cpp which chooses the template parameter.
 * This unnecessarily duplicates other parts of the host functions
 * responsible for dispatching to Thrust or cuQuantum, bloating the
 * compiled binary size; we accept this design wart over having
 * non-templated host functions because this requires duplicating
 * the template-dispatch logic (which would then also be defined in
 * cpu_subroutines.cpp) and moving it out of the aptly-named
 * accelerator.cpp file.
 *
 * Despite COMPILE_CUDA=1 whenever COMPILE_CUQUANTUM=1, we will
 * still use superfluous (COMPILE_CUDA || COMPILE_CUQUANTUM) guards
 * to communicate when there is no bespoke cuQuantum routine.
 *
 * When compiling for AMD GPUs, the CUDA symbols invoked herein are
 * mapped to HIP symbols by cuda_to_hip.h
 *
 * @author Tyson Jones
 */

// obtain preprocessors from config.h prior to validation
#include "quest/include/config.h"

#if (COMPILE_CUQUANTUM && ! COMPILE_CUDA)
    #error "Cannot define COMPILE_CUQUANTUM=1 without simultaneously defining COMPILE_CUDA=1"
#endif

#include "quest/include/types.h"
#include "quest/include/qureg.h"
#include "quest/include/paulis.h"
#include "quest/include/matrices.h"

#include "quest/src/core/errors.hpp"
#include "quest/src/core/bitwise.hpp"
#include "quest/src/core/utilities.hpp"
#include "quest/src/core/accelerator.hpp"
#include "quest/src/comm/comm_indices.hpp"
#include "quest/src/gpu/gpu_config.hpp"
#include "quest/src/gpu/gpu_subroutines.hpp"

#if COMPILE_CUDA
    #include "quest/src/gpu/gpu_types.cuh"
    #include "quest/src/gpu/gpu_kernels.cuh"
    #include "quest/src/gpu/gpu_thrust.cuh"
#endif

#if COMPILE_CUQUANTUM
    #include "quest/src/gpu/gpu_cuquantum.cuh"
#endif

#include <vector>
using std::vector;


/*
 * GETTERS
 */


/**
 * Returns the element at index @p ind of qureg.gpuAmps, obtained by
 * copying that single element from GPU memory into host memory via
 * gpu_copyGpuToCpu().
 *
 * This dedicated function exists (rather than the caller invoking
 * gpu_copyGpuToCpu() itself) mostly for symmetry with the CPU
 * equivalent, which is motivated by performance, and so that it can
 * be revised should single-scalar random access of GPU memory ever
 * become possible to accelerate.
 *
 * When neither COMPILE_CUDA nor COMPILE_CUQUANTUM is set, this calls
 * error_gpuCopyButGpuNotCompiled() and then returns -1.
 */
qcomp gpu_statevec_getAmp_sub(Qureg qureg, qindex ind) {

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuCopyButGpuNotCompiled();
    return -1;

#else

    // host destination of the single copied element
    qcomp hostAmp;
    auto deviceAmpPtr = &qureg.gpuAmps[ind];

    // the compiler guards above are harmlessly repeated inside gpu_copyGpuToCpu()
    gpu_copyGpuToCpu(deviceAmpPtr, &hostAmp, 1);

    return hostAmp;

#endif
}


/*
 * SETTERS
 */


/**
 * GPU entry point which forwards @p qureg and @p sum, unchanged, to
 * thrust_densmatr_setAmpsToPauliStrSum_sub(). There is no bespoke
 * cuQuantum path; both GPU build configurations use Thrust.
 *
 * When neither COMPILE_CUDA nor COMPILE_CUQUANTUM is set, this instead
 * calls error_gpuSimButGpuNotCompiled().
 */
void gpu_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) {

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    thrust_densmatr_setAmpsToPauliStrSum_sub(qureg, sum);

#endif
}


/**
 * GPU entry point which forwards @p out and @p in, unchanged, to
 * thrust_fullstatediagmatr_setElemsToPauliStrSum(). There is no bespoke
 * cuQuantum path; both GPU build configurations use Thrust.
 *
 * When neither COMPILE_CUDA nor COMPILE_CUQUANTUM is set, this instead
 * calls error_gpuSimButGpuNotCompiled().
 */
void gpu_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliStrSum in) {

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    thrust_fullstatediagmatr_setElemsToPauliStrSum(out, in);

#endif
}


/*
 * COMMUNICATION BUFFER PACKING
 */


/**
 * Launches kernel_statevec_packAmpsIntoBuffer<NumQubits>, which is given
 * qureg.gpuAmps as input and qureg.gpuCommBuffer (beginning at the offset
 * returned by getSubBufferSendInd()) as output.
 *
 * @param qureg        register whose GPU amplitudes and GPU buffer are used
 * @param qubits       qubits passed (sorted) to the kernel
 * @param qubitStates  bit values which, with @p qubits, form the kernel's state mask
 * @returns the number of packed amplitudes, which is
 *          qureg.numAmpsPerNode / 2^(qubits.size()); or 0 (after calling
 *          error_gpuSimButGpuNotCompiled()) in non-GPU builds
 */
template <int NumQubits>
qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<int> qubitStates) {

    assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();
    return 0;

#else

    // one thread is dispatched per amplitude to be packed
    qindex numPacked = qureg.numAmpsPerNode / powerOf2(qubits.size());
    qindex gridSize = getNumBlocks(numPacked);

    // packed amps are written into the buffer from this offset onward
    qindex bufferOffset = getSubBufferSendInd(qureg);
    auto bufferPtr = &toCuQcomps(qureg.gpuCommBuffer)[bufferOffset];

    // device copy of the sorted qubits, and a mask encoding their required bits
    devints deviceQubits = util_getSorted(qubits);
    qindex stateMask = util_getBitMask(qubits, qubitStates);

    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), bufferPtr, numPacked,
        getPtr(deviceQubits), qubits.size(), stateMask
    );

    return numPacked;

#endif
}


/**
 * Launches kernel_statevec_packPairSummedAmpsIntoBuffer, which is given
 * qureg.gpuAmps as input and qureg.gpuCommBuffer (beginning at the offset
 * returned by getSubBufferSendInd()) as output, along with the three
 * qubits and @p bit2.
 *
 * The qubits are first validated by assert_bufferPackerGivenIncreasingQubits().
 *
 * @returns the number of packed amplitudes, which is qureg.numAmpsPerNode / 8;
 *          or 0 (after calling error_gpuSimButGpuNotCompiled()) in non-GPU builds
 */
qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {

    assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();
    return 0;

#else

    // one thread is dispatched per packed amplitude
    qindex numPacked = qureg.numAmpsPerNode / 8;
    qindex gridSize = getNumBlocks(numPacked);

    // packed amps are written into the buffer from this offset onward
    qindex bufferOffset = getSubBufferSendInd(qureg);
    auto bufferPtr = &toCuQcomps(qureg.gpuCommBuffer)[bufferOffset];

    kernel_statevec_packPairSummedAmpsIntoBuffer <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), bufferPtr, numPacked,
        qubit1, qubit2, qubit3, bit2
    );

    return numPacked;

#endif
}


// Instantiates the templated gpu_statevec_packAmpsIntoBuffer() defined above, using a
// macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which NumQubits values it instantiates is decided by the macro.
// NOTE(review): presumably this emits the definitions that accelerator.cpp links against — confirm.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, gpu_statevec_packAmpsIntoBuffer, (Qureg, vector<int>, vector<int>) )


/*
 * SWAPS
 */


/**
 * Applies the 'subA' stage of a controlled swap to qureg.gpuAmps,
 * using whichever GPU backend was compiled:
 *  - COMPILE_CUQUANTUM: delegates to cuquantum_statevec_anyCtrlSwap_subA()
 *  - COMPILE_CUDA only: launches kernel_statevec_anyCtrlSwap_subA<NumCtrls>
 *  - neither:           calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targ1       first swapped qubit
 * @param targ2       second swapped qubit
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if COMPILE_CUQUANTUM

    cuquantum_statevec_anyCtrlSwap_subA(qureg, ctrls, ctrlStates, targ1, targ2);

#elif COMPILE_CUDA

    // the kernel receives the controls and both targets merged and sorted,
    // and a mask fixing the control bits plus targ2=0 and targ1=1
    devints deviceQubits = util_getSorted(ctrls, {targ2, targ1});
    qindex stateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1});

    // one thread per assignment of the remaining (non-control, non-target) qubits
    qindex numIts = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), numIts,
        getPtr(deviceQubits), ctrls.size(), stateMask, targ1, targ2
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


/**
 * Launches kernel_statevec_anyCtrlSwap_subB<NumCtrls>, which is given
 * qureg.gpuAmps and qureg.gpuCommBuffer (beginning at the offset returned
 * by getBufferRecvInd()), along with the sorted controls and their state
 * mask. There is no bespoke cuQuantum routine, so both GPU build
 * configurations use this kernel.
 *
 * In non-GPU builds, this calls error_gpuSimButGpuNotCompiled().
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    // one thread per assignment of the non-control qubits
    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    // the kernel accesses the buffer from this offset onward
    qindex bufferOffset = getBufferRecvInd();
    auto bufferPtr = &toCuQcomps(qureg.gpuCommBuffer)[bufferOffset];

    devints deviceCtrls = util_getSorted(ctrls);
    qindex stateMask = util_getBitMask(ctrls, ctrlStates);

    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), bufferPtr, numIts,
        getPtr(deviceCtrls), ctrls.size(), stateMask
    );

#endif
}


/**
 * Launches kernel_statevec_anyCtrlSwap_subC<NumCtrls>, which is given
 * qureg.gpuAmps and qureg.gpuCommBuffer (beginning at the offset returned
 * by getBufferRecvInd()), along with the sorted controls-and-target and a
 * mask fixing the control bits and target bit. There is no bespoke
 * cuQuantum routine, so both GPU build configurations use this kernel.
 *
 * In non-GPU builds, this calls error_gpuSimButGpuNotCompiled().
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targ        target qubit, treated alongside the controls
 * @param targState   bit value of @p targ encoded into the mask
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    // one thread per assignment of the qubits other than the controls and target
    qindex numIts = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    // the kernel accesses the buffer from this offset onward
    qindex bufferOffset = getBufferRecvInd();
    auto bufferPtr = &toCuQcomps(qureg.gpuCommBuffer)[bufferOffset];

    devints deviceQubits = util_getSorted(ctrls, {targ});
    qindex stateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState});

    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), bufferPtr, numIts,
        getPtr(deviceQubits), ctrls.size(), stateMask
    );

#endif
}


// Instantiates the three templated swap routines defined above, using a macro defined
// outside this file. The macro is passed the return type, function name and parameter
// list; which NumCtrls values it instantiates is decided by the macro.
// NOTE(review): unlike the other instantiations in this file, these parameter lists
// include parameter names; presumably harmless, but confirm against the macro definition.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subA, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) )
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subB, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) )
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subC, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) )


/*
 * ONE-TARGET DENSE MATRIX
 */


/**
 * Applies the one-target dense matrix @p matr, with any number of controls,
 * to qureg.gpuAmps using whichever GPU backend was compiled:
 *  - COMPILE_CUQUANTUM: delegates to cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA()
 *                       with the single target and without requesting the adjoint
 *  - COMPILE_CUDA only: launches kernel_statevec_anyCtrlOneTargDenseMatr_subA<NumCtrls>
 *                       passing the four matrix elements as kernel arguments
 *  - neither:           calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targ        target qubit
 * @param matr        matrix, unpacked via unpackMatrixToCuQcomps()
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if COMPILE_CUQUANTUM

    // the matrix is applied as given; the adjoint is not requested
    bool useAdjoint = false;
    auto flatElems = unpackMatrixToCuQcomps(matr);
    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, flatElems.data(), useAdjoint);

#elif COMPILE_CUDA

    // flat matrix elements, forwarded to the kernel in the order they are unpacked
    auto elems = unpackMatrixToCuQcomps(matr);

    // the controls and target are merged and sorted, with the target's mask bit fixed to 0
    devints deviceQubits = util_getSorted(ctrls, {targ});
    qindex stateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});

    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
    qindex gridSize = getNumBlocks(numIts);

    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), numIts,
        getPtr(deviceQubits), ctrls.size(), stateMask, targ,
        elems[0], elems[1], elems[2], elems[3]
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


/**
 * Launches kernel_statevec_anyCtrlOneTargDenseMatr_subB<NumCtrls>, which is
 * given qureg.gpuAmps, qureg.gpuCommBuffer (beginning at the offset returned
 * by getBufferRecvInd()), the sorted controls with their state mask, and the
 * two scalar factors. There is no bespoke cuQuantum routine, so both GPU
 * build configurations use this kernel.
 *
 * In non-GPU builds, this calls error_gpuSimButGpuNotCompiled().
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param fac0        first factor, converted with toCuQcomp() for the kernel
 * @param fac1        second factor, converted with toCuQcomp() for the kernel
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    // one thread per assignment of the non-control qubits
    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    // the kernel accesses the buffer from this offset onward
    qindex bufferOffset = getBufferRecvInd();
    auto bufferPtr = &toCuQcomps(qureg.gpuCommBuffer)[bufferOffset];

    devints deviceCtrls = util_getSorted(ctrls);
    qindex stateMask = util_getBitMask(ctrls, ctrlStates);

    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), bufferPtr, numIts,
        getPtr(deviceCtrls), ctrls.size(), stateMask,
        toCuQcomp(fac0), toCuQcomp(fac1)
    );

#endif
}


// Instantiates the two templated one-target dense-matrix routines defined above, using
// a macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which NumCtrls values it instantiates is decided by the macro.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDenseMatr_subA, (Qureg, vector<int>, vector<int>, int, CompMatr1) )
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDenseMatr_subB, (Qureg, vector<int>, vector<int>, qcomp, qcomp) )


/*
 * TWO-TARGET DENSE MATRIX
 */


/**
 * Applies the two-target dense matrix @p matr, with any number of controls,
 * to qureg.gpuAmps using whichever GPU backend was compiled:
 *  - COMPILE_CUQUANTUM: delegates to cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA()
 *                       with both targets and without requesting the adjoint
 *  - COMPILE_CUDA only: launches kernel_statevec_anyCtrlTwoTargDenseMatr_sub<NumCtrls>
 *                       passing all sixteen matrix elements as kernel arguments
 *  - neither:           calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targ1       first target qubit
 * @param targ2       second target qubit
 * @param matr        matrix, unpacked via unpackMatrixToCuQcomps()
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if COMPILE_CUQUANTUM

    // the matrix is applied as given; the adjoint is not requested
    bool useAdjoint = false;
    auto flatElems = unpackMatrixToCuQcomps(matr);
    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, flatElems.data(), useAdjoint);

#elif COMPILE_CUDA

    // matrix elements are handed to the kernel as individual arguments, which
    // is (perhaps) more efficient for the kernel to access than shared memory
    auto e = unpackMatrixToCuQcomps(matr);

    // the controls and targets are merged and sorted, with both target mask bits fixed to 0
    devints deviceQubits = util_getSorted(ctrls, {targ1,targ2});
    qindex stateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0});

    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
    qindex gridSize = getNumBlocks(numIts);

    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), numIts,
        getPtr(deviceQubits), ctrls.size(), stateMask, targ1, targ2,
        e[0],  e[1],  e[2],  e[3],
        e[4],  e[5],  e[6],  e[7],
        e[8],  e[9],  e[10], e[11],
        e[12], e[13], e[14], e[15]
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}

// Instantiates the templated two-target dense-matrix routine defined above, using a
// macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which NumCtrls values it instantiates is decided by the macro.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, int, int, CompMatr2) )


/*
 * MANY-TARGET DENSE MATRIX
 */


/**
 * Applies the dense matrix @p matr (optionally conjugated or transposed, per
 * the template flags) upon any number of targets with any number of controls,
 * to qureg.gpuAmps, using whichever GPU backend was compiled:
 *  - COMPILE_CUQUANTUM: delegates to cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(),
 *                       temporarily conjugating matr.gpuElemsFlat in-place when needed
 *  - COMPILE_CUDA only: launches kernel_statevec_anyCtrlFewTargDenseMatr when NumTargs
 *                       is compile-time known (not -1), else
 *                       kernel_statevec_anyCtrlManyTargDenseMatr with a global-memory cache
 *  - neither:           calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targs       target qubits (count must match NumTargs, as asserted)
 * @param matr        matrix whose GPU elements (matr.gpuElemsFlat) are read
 */
template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
    assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);

#if COMPILE_CUQUANTUM

    auto gpuElems = toCuQcomps(matr.gpuElemsFlat);
    auto numElems = matr.numRows * matr.numRows;

    // the scheme below presupposes that conj and transpose are never both requested
    if (ApplyConj && ApplyTransp)
        error_gpuDenseMatrixConjugatedAndTransposed();

    // cuStateVec can apply the adjoint but not a lone conjugate or a lone transpose,
    // and we only ever need one of them at a time (since applying a matrix upon the
    // bra-qubits of a vectorised density matrix effectively transposes it). Ergo
    // a conjugate is effected by manually conjugating the elements, and a transpose
    // by manually conjugating and then asking cuQuantum for the adjoint.
    constexpr bool mustConjElems = ApplyConj || ApplyTransp;

    if (mustConjElems)
        thrust_setElemsToConjugate(gpuElems, numElems);

    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, targs, gpuElems, ApplyTransp);

    // restore the elements (skipped only if cuQuantum hits a non-recoverable internal error)
    if (mustConjElems)
        thrust_setElemsToConjugate(gpuElems, numElems);

#elif COMPILE_CUDA

    // a 'batch' is a group of 2^N amps which the matrix mixes together; it is
    // distinguished from a thread because one thread may process several batches
    qindex numBatches = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size());

    devints deviceTargs = targs;
    devints deviceQubits = util_getSorted(ctrls, targs);
    qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector<int>(targs.size(),0));

    // name the kernel arguments up-front, to make the launches below easier to compare
    auto ampsPtr   = toCuQcomps(qureg.gpuAmps);
    auto matrPtr   = toCuQcomps(matr.gpuElemsFlat);
    auto qubitsPtr = getPtr(deviceQubits);
    auto targsPtr  = getPtr(deviceTargs);
    auto nCtrls    = ctrls.size();

    // Each batch of 2^NumTargs amps is overwritten with distinct mixtures of its own
    // prior values, which must therefore be cached. So every thread needs private
    // memory; this is either fast registers (when NumTargs is compile-time known) or
    // slow global memory (needed when blockDim * 2^NumTargs exceeds the total shared
    // memory), the latter strided for coalesced access by warp threads.

    if constexpr (NumTargs == -1) {

        // When NumTargs > 5, each thread's private workspace of 2^NumTargs elements
        // must live in global memory. Allocating it for every thread of a one-batch-
        // per-thread launch is untenable, since all threads are potentially concurrent
        // and the total cache would equal the Qureg size (when nctrls=0)! Instead the
        // parallelisation granularity is coarsened; each thread handles more batches,
        // re-using its private cache, which reduces the number of potentially-concurrent
        // threads and ergo the total cache. The granularity is chosen by upper-bounding
        // the number of concurrent threads, assuming one block per multiprocessor since
        // we are anyway memory-bandwidth bound (so few interweaved blocks are expected).
        qindex numThreads = gpu_getMaxNumConcurrentThreads();

        // all kernels assume a power-of-2 number of threads
        if (!isPowerOf2(numThreads))
            numThreads = util_getNextPowerOf2(numThreads);

        // surplus threads beyond the number of batches would be idle
        if (numThreads > numBatches)
            numThreads = numBatches;

        // batches divide evenly between threads; threads divide unevenly between blocks
        qindex numBatchesPerThread = numBatches / numThreads;
        qindex numBlocks = getNumBlocks(numThreads);

        // grow the cache if needed, sized for every launched thread
        auto numTargAmps = powerOf2(targs.size());
        qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
        qcomp* cache = gpu_getCacheOfSize(numTargAmps, numKernelInvocations);

        kernel_statevec_anyCtrlManyTargDenseMatr
            <NumCtrls, ApplyConj, ApplyTransp>
            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
                toCuQcomps(cache),
                ampsPtr, numThreads, numBatchesPerThread,
                qubitsPtr, nCtrls, qubitStateMask,
                targsPtr, targs.size(), numTargAmps, matrPtr
        );

    } else {

        // When NumTargs <= 5, each thread keeps its private array in registers, which
        // permits rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the per-block size of
        // these arrays is at most 16 * 128 * 2^5 B = 64 KiB, which exceeds shared
        // memory capacity but does NOT exceed maximum register capacity.

        /// @todo
        /// The above claims should really be checked, lest the thread-private arrays
        /// silently "spill" from registers into "local memory" (which is really slow,
        /// global memory) and greatly sabotage performance on some GPUs.

        // one thread per batch
        qindex numThreads = numBatches;
        qindex numBlocks = getNumBlocks(numThreads);

        kernel_statevec_anyCtrlFewTargDenseMatr
            <NumCtrls, NumTargs, ApplyConj, ApplyTransp>
            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
                ampsPtr, numThreads,
                qubitsPtr, nCtrls, qubitStateMask,
                targsPtr, matrPtr
        );
    }

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Instantiates the templated any-target dense-matrix routine defined above, using a
// macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which combinations of NumCtrls, NumTargs and the two bool
// flags (ApplyConj, ApplyTransp) it instantiates is decided by the macro.
INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )


/*
 * ONE-TARGET DIAG MATRIX
 */


/**
 * Applies the one-target diagonal matrix @p matr, with any number of
 * controls, to qureg.gpuAmps.
 *
 * The backend is chosen at both compile-time and runtime:
 *  - with COMPILE_CUQUANTUM, and when util_isQubitInSuffix(targ, qureg),
 *    this delegates to cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub()
 *  - otherwise, with COMPILE_CUDA, this launches
 *    kernel_statevec_anyCtrlOneTargDiagMatr_sub<NumCtrls>
 *  - otherwise, this calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targ        target qubit
 * @param matr        diagonal matrix, never conjugated by this function
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

    // Diagonal matrices are always embarrassingly parallel, yet cuQuantum's API
    // cannot accept prefix target qubits; they appear to it to be larger than the
    // local statevector, which it does not know is distributed. It would require
    // that we needlessly swap such qubits into the suffix before invocation. So
    // cuQuantum is used only when the (here, sole) target lies within the suffix
    // substate, and otherwise we fall back to our custom kernel which never
    // requires communication.

#if COMPILE_CUQUANTUM

    if (util_isQubitInSuffix(targ, qureg)) {

        // the caller has already conjugated the DiagMatr1 if required, so we never do so here
        bool applyConj = false;

        // cuQuantum recognises host pointers, so the 1D CPU .elems array is passed directly
        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, toCuQcomps(matr.elems), applyConj);

        // simulation is complete; do not fall through to the custom kernel
        return;
    }

#endif

// these guards are deliberately not mutually exclusive with the above
#if COMPILE_CUDA

    /// @todo
    /// when NumCtrls==0, a Thrust functor would undoubtedly be more
    /// efficient (because of improved parallelisation granularity)

    auto diagElems = unpackMatrixToCuQcomps(matr);

    devints deviceCtrls = util_getSorted(ctrls);
    qindex stateMask = util_getBitMask(ctrls, ctrlStates);

    // one thread per assignment of the non-control qubits
    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), numIts, qureg.rank, qureg.logNumAmpsPerNode,
        getPtr(deviceCtrls), ctrls.size(), stateMask, targ, diagElems[0], diagElems[1]
    );

    // simulation is complete; do not reach the error below
    return;

#endif

    // reached only when no backend above performed the simulation
    error_gpuSimButGpuNotCompiled();
}


// Instantiates the templated one-target diagonal-matrix routine defined above, using a
// macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which NumCtrls values it instantiates is decided by the macro.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, int, DiagMatr1) )


/*
 * TWO-TARGET DIAG MATRIX
 */


/**
 * Applies the two-target diagonal matrix @p matr, with any number of
 * controls, to qureg.gpuAmps.
 *
 * The backend is chosen at both compile-time and runtime:
 *  - with COMPILE_CUQUANTUM, and when util_areAllQubitsInSuffix({targ1,targ2}, qureg),
 *    this delegates to cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub()
 *  - otherwise, with COMPILE_CUDA, this launches
 *    kernel_statevec_anyCtrlTwoTargDiagMatr_sub<NumCtrls>
 *  - otherwise, this calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targ1       first target qubit
 * @param targ2       second target qubit
 * @param matr        diagonal matrix, never conjugated by this function
 */
template <int NumCtrls>
void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

    // Diagonal matrices are always embarrassingly parallel, yet cuQuantum's API
    // cannot accept prefix target qubits; they appear to it to be larger than the
    // local statevector, which it does not know is distributed. It would require
    // that we needlessly swap such qubits into the suffix before invocation. So
    // cuQuantum is used only when both targets lie within the suffix substate,
    // and otherwise we fall back to our custom kernel which never requires
    // communication.

#if COMPILE_CUQUANTUM

    if (util_areAllQubitsInSuffix({targ1,targ2}, qureg)) {

        // the caller has already conjugated the DiagMatr2 if required, so we never do so here
        bool applyConj = false;

        // cuQuantum recognises host pointers, so the 1D CPU array is passed directly
        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, toCuQcomps(matr.elems), applyConj);

        // simulation is complete; do not fall through to the custom kernel
        return;
    }

#endif

// these guards are deliberately not mutually exclusive with the above
#if COMPILE_CUDA

    /// @todo
    /// when NumCtrls==0, a Thrust functor would undoubtedly be more
    /// efficient (because of improved parallelisation granularity)

    auto diagElems = unpackMatrixToCuQcomps(matr);

    devints deviceCtrls = util_getSorted(ctrls);
    qindex stateMask = util_getBitMask(ctrls, ctrlStates);

    // one thread per assignment of the non-control qubits
    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), numIts, qureg.rank, qureg.logNumAmpsPerNode,
        getPtr(deviceCtrls), ctrls.size(), stateMask, targ1, targ2,
        diagElems[0], diagElems[1], diagElems[2], diagElems[3]
    );

    // simulation is complete; do not reach the error below
    return;

#endif

    // reached only when no backend above performed the simulation
    error_gpuSimButGpuNotCompiled();
}


// Instantiates the templated two-target diagonal-matrix routine defined above, using a
// macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which NumCtrls values it instantiates is decided by the macro.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, int, int, DiagMatr2) )


/*
 * ANY-TARGET DIAG MATRIX
 */


/**
 * Applies the diagonal matrix @p matr (optionally conjugated and/or raised
 * to @p exponent, per the template flags) upon any number of targets with
 * any number of controls, to qureg.gpuAmps.
 *
 * The backend is chosen at both compile-time and runtime:
 *  - with COMPILE_CUQUANTUM, and when HasPower is false and
 *    util_areAllQubitsInSuffix(targs, qureg), this delegates to
 *    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub()
 *  - otherwise, with COMPILE_CUDA, this launches
 *    kernel_statevec_anyCtrlAnyTargDiagMatr_sub<NumCtrls, NumTargs, ApplyConj, HasPower>
 *  - otherwise, this calls error_gpuSimButGpuNotCompiled()
 *
 * @param ctrls       control qubits (count must match NumCtrls, as asserted)
 * @param ctrlStates  bit values paired with @p ctrls
 * @param targs       target qubits (count must match NumTargs, as asserted)
 * @param matr        diagonal matrix whose GPU memory (util_getGpuMemPtr) is read
 * @param exponent    power passed to the kernel (must be consistent with HasPower, as asserted)
 */
template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower>
void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
    assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
    assert_exponentMatchesTemplateParam(exponent, HasPower);

    // Diagonal matrices are always embarrassingly parallel, yet cuQuantum's API
    // cannot accept prefix target qubits; they appear to it to be larger than the
    // local statevector, which it does not know is distributed. It would require
    // that we needlessly swap such qubits into the suffix before invocation. So
    // cuQuantum is used only when all targets lie within the suffix substate,
    // and otherwise we fall back to our custom kernel which never requires
    // communication. cuQuantum also cannot handle exponent != 1, in which case
    // we likewise fall back to the custom kernel.

#if COMPILE_CUQUANTUM

    // HasPower is unsupported by cuQuantum and so rules it out
    bool canUseCuQuantum = !HasPower && util_areAllQubitsInSuffix(targs, qureg);

    if (canUseCuQuantum) {
        auto gpuElems = toCuQcomps(util_getGpuMemPtr(matr));
        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, gpuElems, ApplyConj);

        // simulation is complete; do not fall through to the custom kernel
        return;
    }

#endif

// these guards are deliberately not mutually exclusive with the above
#if COMPILE_CUDA

    /// @todo
    /// when NumCtrls==0, a Thrust functor would undoubtedly be more
    /// efficient (because of improved parallelisation granularity)

    // one thread per assignment of the non-control qubits
    qindex numIts = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(numIts);

    // targets are copied to the device in their given order; controls are sorted
    devints deviceTargs = targs;
    devints deviceCtrls = util_getSorted(ctrls);
    qindex stateMask = util_getBitMask(ctrls, ctrlStates);

    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        toCuQcomps(qureg.gpuAmps), numIts, qureg.rank, qureg.logNumAmpsPerNode,
        getPtr(deviceCtrls), ctrls.size(), stateMask, getPtr(deviceTargs), targs.size(),
        toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent)
    );

    // simulation is complete; do not reach the error below
    return;

#endif

    // reached only when no backend above performed the simulation
    error_gpuSimButGpuNotCompiled();
}


// Instantiates the templated any-target diagonal-matrix routine defined above, using a
// macro defined outside this file. The macro is passed the return type, function name
// and parameter type list; which combinations of NumCtrls, NumTargs and the two bool
// flags (ApplyConj, HasPower) it instantiates is decided by the macro.
INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )


/*
 * ALL-TARGS DIAGONAL MATRIX
 */


/**
 * GPU entry point which forwards @p qureg, @p matr and @p exponent (converted
 * with toCuQcomp()) to thrust_statevec_allTargDiagMatr_sub<HasPower>().
 *
 * Thrust is used in both GPU build configurations, because it is doubtful
 * that cuQuantum's diagonal-matrix facilities are optimised for the
 * all-qubit case.
 *
 * When neither COMPILE_CUDA nor COMPILE_CUQUANTUM is set, this instead
 * calls error_gpuSimButGpuNotCompiled().
 *
 * @param exponent  power, which must be consistent with HasPower (as asserted)
 */
template <bool HasPower>
void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {

    assert_exponentMatchesTemplateParam(exponent, HasPower);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    auto gpuExponent = toCuQcomp(exponent);
    thrust_statevec_allTargDiagMatr_sub<HasPower>(qureg, matr, gpuExponent);

#endif
}


/**
 * Launches kernel_densmatr_allTargDiagMatr_sub<HasPower, ApplyLeft, ApplyRight, ConjRight>
 * with one thread per element of qureg.gpuAmps (qureg.numAmpsPerNode in total),
 * passing it the GPU elements of @p matr, matr.numElems and @p exponent.
 * There is no bespoke cuQuantum routine, so both GPU build configurations
 * use this kernel.
 *
 * When neither COMPILE_CUDA nor COMPILE_CUQUANTUM is set, this instead
 * calls error_gpuSimButGpuNotCompiled().
 *
 * @param exponent  power, which must be consistent with HasPower (as asserted)
 */
template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight>
void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {

    assert_exponentMatchesTemplateParam(exponent, HasPower);

#if ! (COMPILE_CUDA || COMPILE_CUQUANTUM)

    error_gpuSimButGpuNotCompiled();

#else

    // one thread per local amplitude
    qindex numIts = qureg.numAmpsPerNode;
    qindex gridSize = getNumBlocks(numIts);

    auto ampsPtr = toCuQcomps(qureg.gpuAmps);
    auto elemsPtr = toCuQcomps(util_getGpuMemPtr(matr));

    kernel_densmatr_allTargDiagMatr_sub
        <HasPower, ApplyLeft, ApplyRight, ConjRight>
        <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
            ampsPtr, numIts, qureg.rank, qureg.logNumAmpsPerNode,
            elemsPtr, matr.numElems, toCuQcomp(exponent)
    );

#endif
}


// explicit instantiations of the statevector routine, for each value of HasPower
template void gpu_statevec_allTargDiagMatr_sub<false>(Qureg, FullStateDiagMatr, qcomp);
template void gpu_statevec_allTargDiagMatr_sub<true >(Qureg, FullStateDiagMatr, qcomp);

// explicit instantiations of the density-matrix routine, with template arguments
// <HasPower, ApplyLeft, ApplyRight, ConjRight>; only the six combinations below exist

// HasPower = false
template void gpu_densmatr_allTargDiagMatr_sub<false, true,  false, false>(Qureg, FullStateDiagMatr, qcomp); // matr qureg
template void gpu_densmatr_allTargDiagMatr_sub<false, false, true,  false>(Qureg, FullStateDiagMatr, qcomp); //      qureg matr
template void gpu_densmatr_allTargDiagMatr_sub<false, true,  true,  true >(Qureg, FullStateDiagMatr, qcomp); // matr qureg conj(matr)

// HasPower = true
template void gpu_densmatr_allTargDiagMatr_sub<true,  true,  false, false>(Qureg, FullStateDiagMatr, qcomp); // matr^P qureg
template void gpu_densmatr_allTargDiagMatr_sub<true,  false, true,  false>(Qureg, FullStateDiagMatr, qcomp); //        qureg matr^P
template void gpu_densmatr_allTargDiagMatr_sub<true,  true,  true,  true >(Qureg, FullStateDiagMatr, qcomp); // matr^P qureg conj(matr^P)


/*
 * PAULI TENSOR AND GADGET
 */


// Launches kernel_statevector_anyCtrlPauliTensorOrGadget_subA upon the GPU amplitudes
// of qureg. The qubits of x and y (concatenated) are the kernel's targets, and so their
// total number must match NumTargs, just as the number of ctrls must match NumCtrls.
// When neither CUDA nor cuQuantum is compiled, it instead calls
// error_gpuSimButGpuNotCompiled().
template <int NumCtrls, int NumTargs>
void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
    assert_numTargsMatchesTemplateParam(x.size() + y.size(), NumTargs);

    // why cuQuantum is not used by this function:
    //  - custatevecApplyGeneralizedPermutationMatrix() could effect a pauli tensor, but
    //    we wish to avoid creating the (2^#paulis) large permutation matrix it needs
    //  - custatevecApplyPauliRotation() cannot handle Pauli operators upon the prefix
    //    substate as our singly-communicating method does. This is true even if we
    //    passed down the gadget phase to this function; cuStateVec would effect
    //    amp -> a amp + b other_amp for the wrong b, which we cannot thereafter remedy

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // the kernel's targets are the qubits of x and y
    auto xyQubits = util_getConcatenated(x, y);
    devints devTargs = xyQubits;

    // the ctrls and xy qubits are passed together (via util_getSorted), alongside a
    // mask wherein ctrls are given ctrlStates and every xy qubit is given state 0
    devints devSortedQubits = util_getSorted(ctrls, xyQubits);
    vector<int> zeroStates(xyQubits.size(), 0);
    qindex stateMask = util_getBitMask(ctrls, ctrlStates, xyQubits, zeroStates);

    // remaining kernel scalars
    auto xyMask = util_getBitMask(xyQubits);
    auto yzMask = util_getBitMask(util_getConcatenated(y, z));
    qcomp iPower = util_getPowerOfI(y.size());

    // in contrast to the analogous cpu routine, there is only one parallelisation
    // granularity here; each pair-of-amps is modified by its own independent thread,
    // even though many threads share a common i0 value (appearing in the kernel).
    // This turns out faster than giving each thread many pair-amps to modify, due
    // to memory movements

    auto numAmpsAfterCtrls = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex totalThreads = numAmpsAfterCtrls / 2; // divides evenly
    qindex gridSize = getNumBlocks(totalThreads);

    kernel_statevector_anyCtrlPauliTensorOrGadget_subA
        <NumCtrls, NumTargs>
        <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
            toCuQcomps(qureg.gpuAmps), totalThreads,
            getPtr(devSortedQubits), ctrls.size(), stateMask,
            getPtr(devTargs), devTargs.size(),
            xyMask, yzMask, toCuQcomp(iPower), toCuQcomp(ampFac), toCuQcomp(pairAmpFac)
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Launches kernel_statevector_anyCtrlPauliTensorOrGadget_subB upon the GPU amplitudes
// of qureg together with its GPU communication buffer, the latter being offset by
// getBufferRecvInd(). The number of ctrls must match NumCtrls. When neither CUDA nor
// cuQuantum is compiled, it instead calls error_gpuSimButGpuNotCompiled().
template <int NumCtrls>
void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // control qubits (copied to device memory) and a mask of their states
    devints devSortedCtrls = util_getSorted(ctrls);
    qindex ctrlMask = util_getBitMask(ctrls, ctrlStates);

    // masks and factor derived from the x, y and z qubit lists
    auto xyMask = util_getBitMask(util_getConcatenated(x, y));
    auto yzMask = util_getBitMask(util_getConcatenated(y, z));
    qcomp iPower = util_getPowerOfI(y.size());

    // the kernel reads the buffer beginning at the receive index
    qindex recvInd = getBufferRecvInd();
    auto recvBuffer = toCuQcomps(qureg.gpuCommBuffer) + recvInd;

    // launch configuration
    qindex totalThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(totalThreads);

    kernel_statevector_anyCtrlPauliTensorOrGadget_subB
        <NumCtrls>
        <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
            toCuQcomps(qureg.gpuAmps), recvBuffer, totalThreads,
            getPtr(devSortedCtrls), ctrls.size(), ctrlMask,
            xyMask, yzMask, bufferMaskXY,
            toCuQcomp(iPower), toCuQcomp(ampFac), toCuQcomp(pairAmpFac)
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Instantiates the two templated Pauli tensor/gadget routines for the signatures given in
// the final macro arguments; subA is templated upon <NumCtrls, NumTargs> while subB is
// templated upon <NumCtrls> alone. NOTE(review): both macros are defined elsewhere, and
// presumably emit one instantiation per compile-time-optimised parameter value - confirm.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevector_anyCtrlPauliTensorOrGadget_subA, (Qureg, vector<int>, vector<int>, vector<int>, vector<int>, vector<int>, qcomp, qcomp) )
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlPauliTensorOrGadget_subB, (Qureg, vector<int>, vector<int>, vector<int>, vector<int>, vector<int>, qcomp, qcomp, qindex) )


/*
 * PHASE TENSOR AND GADGET
 */


// Launches kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub upon the GPU amplitudes
// of qureg, passing it the ctrls, a mask of their states, a mask of targs, and the two
// factors fac0 and fac1. The number of ctrls must match NumCtrls. When neither CUDA nor
// cuQuantum is compiled, it instead calls error_gpuSimButGpuNotCompiled().
template <int NumCtrls>
void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp fac0, qcomp fac1) {

    assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // control qubits (copied to device memory) and a mask of their states
    devints devSortedCtrls = util_getSorted(ctrls);
    qindex ctrlMask = util_getBitMask(ctrls, ctrlStates);

    // targets are communicated to the kernel only through their mask
    qindex targetMask = util_getBitMask(targs);

    // launch configuration
    qindex totalThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
    qindex gridSize = getNumBlocks(totalThreads);

    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub
        <NumCtrls>
        <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
            toCuQcomps(qureg.gpuAmps), totalThreads,
            getPtr(devSortedCtrls), ctrls.size(), ctrlMask, targetMask,
            toCuQcomp(fac0), toCuQcomp(fac1)
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Instantiates the templated gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub() for the
// signature given in the final argument. NOTE(review): the macro is defined elsewhere, and
// presumably emits one instantiation per compile-time-optimised NumCtrls value - confirm.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub, (Qureg, vector<int>, vector<int>, vector<int>, qcomp, qcomp) )


/*
 * QUREG COMBINATION
 */


// Launches kernel_statevec_setQuregToWeightedSum_sub upon the GPU amplitudes of
// outQureg, passing it device copies of coeffs and of the GPU amplitude pointers
// of every qureg in inQuregs. The launch is sized by outQureg.numAmpsPerNode.
// When neither CUDA nor cuQuantum is compiled, it instead calls
// error_gpuSimButGpuNotCompiled().
template <int NumQuregs>
void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // collect the GPU amp pointer of each input qureg, preserving list order
    vector<cu_qcomp*> ampPtrs(inQuregs.size());
    for (size_t i=0; i<inQuregs.size(); i++)
        ampPtrs[i] = toCuQcomps(inQuregs[i].gpuAmps);

    // copy the pointer and coeff lists into GPU memory
    devcuqcompptrs devAmpPtrs = ampPtrs;
    devcomps devCoeffs = coeffs;

    // launch configuration
    qindex totalThreads = outQureg.numAmpsPerNode;
    qindex gridSize = getNumBlocks(totalThreads);

    kernel_statevec_setQuregToWeightedSum_sub
        <NumQuregs>
        <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
            toCuQcomps(outQureg.gpuAmps), totalThreads,
            getPtr(devCoeffs), getPtr(devAmpPtrs), inQuregs.size()
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// GPU entry-point which delegates entirely to thrust_densmatr_mixQureg_subA(),
// forwarding all four arguments unchanged. When neither CUDA nor cuQuantum is
// compiled, it instead calls error_gpuSimButGpuNotCompiled().
void gpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inQureg) {

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // no bespoke kernel is launched here; the Thrust routine does all the work
    thrust_densmatr_mixQureg_subA(outProb, outQureg, inProb, inQureg);

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Launches kernel_densmatr_mixQureg_subB upon the GPU amplitudes of both outQureg
// and inQureg, alongside their respective probabilities and inQureg.numAmps. The
// launch is sized by outQureg.numAmpsPerNode. When neither CUDA nor cuQuantum is
// compiled, it instead calls error_gpuSimButGpuNotCompiled().
void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qureg inQureg) {

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // device pointers handed to the kernel
    auto outAmps = toCuQcomps(outQureg.gpuAmps);
    auto inAmps = toCuQcomps(inQureg.gpuAmps);

    // launch configuration
    qindex totalThreads = outQureg.numAmpsPerNode;
    qindex gridSize = getNumBlocks(totalThreads);

    kernel_densmatr_mixQureg_subB <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        outProb, outAmps, inProb, inAmps,
        totalThreads, inQureg.numAmps
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Launches kernel_densmatr_mixQureg_subC upon the GPU amplitudes of outQureg and
// its own GPU communication buffer; no second qureg is passed. The launch is sized
// by outQureg.numAmpsPerNode. When neither CUDA nor cuQuantum is compiled, it
// instead calls error_gpuSimButGpuNotCompiled().
void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {

#if COMPILE_CUDA || COMPILE_CUQUANTUM

    // device pointers handed to the kernel
    auto outAmps = toCuQcomps(outQureg.gpuAmps);
    auto bufferAmps = toCuQcomps(outQureg.gpuCommBuffer);

    // scalar passed to the kernel, derived from the qureg's qubit count
    auto pow2NumQubits = powerOf2(outQureg.numQubits);

    // launch configuration
    qindex totalThreads = outQureg.numAmpsPerNode;
    qindex gridSize = getNumBlocks(totalThreads);

    kernel_densmatr_mixQureg_subC <<<gridSize, NUM_THREADS_PER_BLOCK>>> (
        outProb, outAmps, inProb, bufferAmps,
        totalThreads, outQureg.rank, pow2NumQubits, outQureg.logNumAmpsPerNode
    );

#else
    error_gpuSimButGpuNotCompiled();
#endif
}


// Instantiates the templated gpu_statevec_setQuregToWeightedSum_sub() for the signature
// given in the final argument. NOTE(review): the macro is defined elsewhere, and
// presumably emits one instantiation per compile-time-optimised NumQuregs value - confirm.
INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_QUREGS( void, gpu_statevec_setQuregToWeightedSum_sub, (Qureg, vector<qcomp>, vector<Qureg>) )


/*
 * ONE-QUBIT DEPHASING
 */


void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) {