From 63e369124370d5a748d1a198ca016717492655d6 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 25 Jul 2022 17:41:17 +0200
Subject: [PATCH 01/82] Working on the nointrin generator

---
 third_party/cppsim/nointrin/kernel.py   |  64 ++++
 third_party/cppsim/nointrin/kernel1.hpp |  53 ++++
 third_party/cppsim/nointrin/kernel2.hpp |  62 ++++
 third_party/cppsim/nointrin/kernel3.hpp |  91 ++++++
 third_party/cppsim/nointrin/kernel4.hpp | 152 ++++++++++
 third_party/cppsim/nointrin/kernel5.hpp | 373 ++++++++++++++++++++++++
 third_party/cppsim/nointrin/kernels.hpp |  40 +++
 7 files changed, 835 insertions(+)
 create mode 100644 third_party/cppsim/nointrin/kernel.py
 create mode 100644 third_party/cppsim/nointrin/kernel1.hpp
 create mode 100644 third_party/cppsim/nointrin/kernel2.hpp
 create mode 100644 third_party/cppsim/nointrin/kernel3.hpp
 create mode 100644 third_party/cppsim/nointrin/kernel4.hpp
 create mode 100644 third_party/cppsim/nointrin/kernel5.hpp
 create mode 100644 third_party/cppsim/nointrin/kernels.hpp
diff --git a/third_party/cppsim/nointrin/kernel.py b/third_party/cppsim/nointrin/kernel.py
new file mode 100644
index 00000000..b3fe8f28
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernel.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Print the C++ source of the no-intrinsics kernel for `nqubits` target qubits."""
+nqubits = 3
+dim = 1 << nqubits  # entries coupled by one kernel_core call; m is dim x dim
+pragma = "#pragma"
+newline = "\n"
+
+def offset(k):
+    """Index expression of entry k: every set bit b of k adds the stride d<b> to I."""
+    return 'I' + ''.join(' + d{}'.format(b) for b in range(nqubits) if (k >> b) & 1)
+
+def row(r):
+    """Right-nested add/mul chain evaluating the sum over c of v[c] * m[r][c]."""
+    expr = 'mul(v[{}], m[{}][{}])'.format(dim - 1, r, dim - 1)
+    for c in reversed(range(dim - 1)):
+        expr = 'add(mul(v[{}], m[{}][{}]), {})'.format(c, r, c, expr)
+    return expr
+
+# The update is in place, so all loads are emitted before the first store.
+loads = newline.join('    v[{}] = psi[{}];'.format(k, offset(k)) for k in range(dim))
+stores = newline.join('    psi[{}] = ({});'.format(offset(r), row(r)) for r in range(dim))
+
+kernel = \
+f"""
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, M const& m)
+{{
+    std::complex<double> v[{dim}];
+{loads}
+
+{stores}
+}}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, {''.join('unsigned id{}, '.format(i) for i in reversed(range(nqubits)))}M const& m, std::size_t ctrlmask)
+{{
+    std::size_t n = psi.size();
+    std::size_t d0 = static_cast<std::size_t>(1) << id0{''.join(', d{} = static_cast<std::size_t>(1) << id{}'.format(i, i) for i in range (1, nqubits))};
+    std::size_t dsorted[] = {{ d0{''.join(', d{}'.format(i) for i in range (1, nqubits))} }};
+    std::sort(dsorted, dsorted + {nqubits}, std::greater<std::size_t>());
+
+    if (ctrlmask == 0){{
+        {pragma} omp for collapse(LOOP_COLLAPSE{nqubits}) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
+{''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))}{''.join('    ' for i in range (0, nqubits + 2))}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits - 1}]; ++i{nqubits}){{
+        {''.join('    '.format(i) for i in range (1, nqubits + 2))}kernel_core(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits))}m);
+        {''.join('    '.format(i) for i in range (1, nqubits + 1))}}}
+        }}
+    }}
+    else{{
+        {pragma} omp for collapse(LOOP_COLLAPSE{nqubits}) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
+{''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))}{''.join('    ' for i in range (0, nqubits + 2))}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits - 1}]; ++i{nqubits}){{
+        {''.join('    '.format(i) for i in range (1, nqubits + 2))}if (((i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))})&ctrlmask) == ctrlmask)
+        {''.join('    '.format(i) for i in range (1, nqubits + 3))}kernel_core(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits))}m);
+        {''.join('    '.format(i) for i in range (1, nqubits + 1))}}}
+        }}
+    }}
+}}
+"""
+
+print(kernel)
+
diff --git a/third_party/cppsim/nointrin/kernel1.hpp b/third_party/cppsim/nointrin/kernel1.hpp
new file mode 100644
index 00000000..e1cd9e66
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernel1.hpp
@@ -0,0 +1,53 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m)
+{
+    // Copy both entries first: the pair psi[I], psi[I + d0] is overwritten in place.
+    std::complex<double> lo = psi[I];
+    std::complex<double> hi = psi[I + d0];
+
+    // Multiply the column vector (lo, hi) by the 2x2 matrix m.
+    psi[I] = add(mul(lo, m[0][0]), mul(hi, m[0][1]));
+    psi[I + d0] = add(mul(lo, m[1][0]), mul(hi, m[1][1]));
+}
+
+// Applies the 2x2 matrix m in place to every pair of psi entries whose indices differ only in
+// bit id0; a non-zero ctrlmask limits this to pairs whose lower index has every ctrlmask bit.
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = static_cast<std::size_t>(1) << id0;  // 1UL is only 32-bit on LLP64 targets
+    std::size_t dsorted[] = {d0 };
+    std::sort(dsorted, dsorted + 1, std::greater<std::size_t>());
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                kernel_core(psi, i0 + i1, d0, m);
+            }
+        }
+    } else {
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, d0, m);
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/nointrin/kernel2.hpp b/third_party/cppsim/nointrin/kernel2.hpp
new file mode 100644
index 00000000..879fa857
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernel2.hpp
@@ -0,0 +1,62 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m)
+{
+    // Copy the four coupled entries before any of them is overwritten in place.
+    std::complex<double> a0 = psi[I];
+    std::complex<double> a1 = psi[I + d0];
+    std::complex<double> a2 = psi[I + d1];
+    std::complex<double> a3 = psi[I + d0 + d1];
+
+    // Row r of the 4x4 matrix m yields the entry whose offset uses d<b> for each set bit b of r.
+    psi[I] = add(mul(a0, m[0][0]), add(mul(a1, m[0][1]), add(mul(a2, m[0][2]), mul(a3, m[0][3]))));
+    psi[I + d0] = add(mul(a0, m[1][0]), add(mul(a1, m[1][1]), add(mul(a2, m[1][2]), mul(a3, m[1][3]))));
+    psi[I + d1] = add(mul(a0, m[2][0]), add(mul(a1, m[2][1]), add(mul(a2, m[2][2]), mul(a3, m[2][3]))));
+    psi[I + d0 + d1] = add(mul(a0, m[3][0]), add(mul(a1, m[3][1]), add(mul(a2, m[3][2]), mul(a3, m[3][3]))));
+}
+
+// Applies the 4x4 matrix m in place to every group of psi entries whose indices differ only in
+// bits id0/id1; a non-zero ctrlmask limits this to groups whose base index has every ctrlmask bit.
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = static_cast<std::size_t>(1) << id0;  // 1UL is only 32-bit on LLP64 targets
+    std::size_t d1 = static_cast<std::size_t>(1) << id1;
+    std::size_t dsorted[] = {d0 , d1};
+    std::sort(dsorted, dsorted + 2, std::greater<std::size_t>());
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+                    kernel_core(psi, i0 + i1 + i2, d0, d1, m);
+                }
+            }
+        }
+    } else {
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+                    if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
+                        kernel_core(psi, i0 + i1 + i2, d0, d1, m);
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/nointrin/kernel3.hpp b/third_party/cppsim/nointrin/kernel3.hpp
new file mode 100644
index 00000000..c70d721d
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernel3.hpp
@@ -0,0 +1,91 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m)
+{
+    // Applies the 8x8 matrix m in place to the eight psi entries at I plus every subset sum
+    // of the strides d0, d1, d2. Inputs are read four at a time through v.
+    std::complex<double> v[4];
+    v[0] = psi[I];
+    v[1] = psi[I + d0];
+    v[2] = psi[I + d1];
+    v[3] = psi[I + d0 + d1];
+
+    std::complex<double> tmp[8];
+    // TODO: define an instruction set of load/store and add/mul, and implement a compiler for it.
+    // The idea of this form is that half of each psi expression can be computed independently,
+    // but this cannot help for distributed computation.
+    // First half: contribution of matrix columns 0-3 (the entries without d2) to every row.
+    tmp[0] = add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3]))));
+    tmp[1] = add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3]))));
+    tmp[2] = add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3]))));
+    tmp[3] = add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3]))));
+    tmp[4] = add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3]))));
+    tmp[5] = add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3]))));
+    tmp[6] = add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3]))));
+    tmp[7] = add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))));
+    // Reuse v for the four entries that include d2 (matrix columns 4-7).
+    v[0] = psi[I + d2];
+    v[1] = psi[I + d0 + d2];
+    v[2] = psi[I + d1 + d2];
+    v[3] = psi[I + d0 + d1 + d2];
+    // All eight inputs have been read by now, so the results can overwrite psi.
+    psi[I] = (add(tmp[0], add(mul(v[0], m[0][4]), add(mul(v[1], m[0][5]), add(mul(v[2], m[0][6]), mul(v[3], m[0][7]))))));
+    psi[I + d0] = (add(tmp[1], add(mul(v[0], m[1][4]), add(mul(v[1], m[1][5]), add(mul(v[2], m[1][6]), mul(v[3], m[1][7]))))));
+    psi[I + d1] = (add(tmp[2], add(mul(v[0], m[2][4]), add(mul(v[1], m[2][5]), add(mul(v[2], m[2][6]), mul(v[3], m[2][7]))))));
+    psi[I + d0 + d1] = (add(tmp[3], add(mul(v[0], m[3][4]), add(mul(v[1], m[3][5]), add(mul(v[2], m[3][6]), mul(v[3], m[3][7]))))));
+    psi[I + d2] = (add(tmp[4], add(mul(v[0], m[4][4]), add(mul(v[1], m[4][5]), add(mul(v[2], m[4][6]), mul(v[3], m[4][7]))))));
+    psi[I + d0 + d2] = (add(tmp[5], add(mul(v[0], m[5][4]), add(mul(v[1], m[5][5]), add(mul(v[2], m[5][6]), mul(v[3], m[5][7]))))));
+    psi[I + d1 + d2] = (add(tmp[6], add(mul(v[0], m[6][4]), add(mul(v[1], m[6][5]), add(mul(v[2], m[6][6]), mul(v[3], m[6][7]))))));
+    psi[I + d0 + d1 + d2] = (add(tmp[7], add(mul(v[0], m[7][4]), add(mul(v[1], m[7][5]), add(mul(v[2], m[7][6]), mul(v[3], m[7][7]))))));
+}
+
+// Applies the 8x8 matrix m in place to every group of psi entries whose indices differ only in
+// bits id0..id2; a non-zero ctrlmask limits this to groups whose base index has every ctrlmask bit.
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = static_cast<std::size_t>(1) << id0;  // 1UL is only 32-bit on LLP64 targets
+    std::size_t d1 = static_cast<std::size_t>(1) << id1;
+    std::size_t d2 = static_cast<std::size_t>(1) << id2;
+    std::size_t dsorted[] = {d0 , d1, d2};
+    std::sort(dsorted, dsorted + 3, std::greater<std::size_t>());
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+                        kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m);
+                    }
+                }
+            }
+        }
+    } else {
+        #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+                        if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
+                            kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/nointrin/kernel4.hpp b/third_party/cppsim/nointrin/kernel4.hpp
new file mode 100644
index 00000000..b12424a7
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernel4.hpp
@@ -0,0 +1,152 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m)
+{
+    // Applies the 16x16 matrix m in place to the sixteen psi entries at I plus every subset
+    // sum of the strides d0..d3. Inputs are read four at a time through v, one block of four
+    // matrix columns per pass, and the per-row sums are accumulated in tmp.
+    std::complex<double> v[4];
+    v[0] = psi[I];
+    v[1] = psi[I + d0];
+    v[2] = psi[I + d1];
+    v[3] = psi[I + d0 + d1];
+    std::complex<double> tmp[16];
+    // Pass 1: matrix columns 0-3 (entries without d2 or d3) initialise the sixteen row sums.
+    tmp[0] = add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3]))));
+    tmp[1] = add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3]))));
+    tmp[2] = add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3]))));
+    tmp[3] = add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3]))));
+    tmp[4] = add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3]))));
+    tmp[5] = add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3]))));
+    tmp[6] = add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3]))));
+    tmp[7] = add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))));
+    tmp[8] = add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3]))));
+    tmp[9] = add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3]))));
+    tmp[10] = add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3]))));
+    tmp[11] = add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3]))));
+    tmp[12] = add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3]))));
+    tmp[13] = add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3]))));
+    tmp[14] = add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3]))));
+    tmp[15] = add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3]))));
+    // Pass 2: matrix columns 4-7, the entries with d2 but not d3.
+    v[0] = psi[I + d2];
+    v[1] = psi[I + d0 + d2];
+    v[2] = psi[I + d1 + d2];
+    v[3] = psi[I + d0 + d1 + d2];
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][4]), add(mul(v[1], m[0][5]), add(mul(v[2], m[0][6]), mul(v[3], m[0][7])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][4]), add(mul(v[1], m[1][5]), add(mul(v[2], m[1][6]), mul(v[3], m[1][7])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][4]), add(mul(v[1], m[2][5]), add(mul(v[2], m[2][6]), mul(v[3], m[2][7])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][4]), add(mul(v[1], m[3][5]), add(mul(v[2], m[3][6]), mul(v[3], m[3][7])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][4]), add(mul(v[1], m[4][5]), add(mul(v[2], m[4][6]), mul(v[3], m[4][7])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][4]), add(mul(v[1], m[5][5]), add(mul(v[2], m[5][6]), mul(v[3], m[5][7])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][4]), add(mul(v[1], m[6][5]), add(mul(v[2], m[6][6]), mul(v[3], m[6][7])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][4]), add(mul(v[1], m[7][5]), add(mul(v[2], m[7][6]), mul(v[3], m[7][7])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][4]), add(mul(v[1], m[8][5]), add(mul(v[2], m[8][6]), mul(v[3], m[8][7])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][4]), add(mul(v[1], m[9][5]), add(mul(v[2], m[9][6]), mul(v[3], m[9][7])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][4]), add(mul(v[1], m[10][5]), add(mul(v[2], m[10][6]), mul(v[3], m[10][7])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][4]), add(mul(v[1], m[11][5]), add(mul(v[2], m[11][6]), mul(v[3], m[11][7])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][4]), add(mul(v[1], m[12][5]), add(mul(v[2], m[12][6]), mul(v[3], m[12][7])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][4]), add(mul(v[1], m[13][5]), add(mul(v[2], m[13][6]), mul(v[3], m[13][7])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][4]), add(mul(v[1], m[14][5]), add(mul(v[2], m[14][6]), mul(v[3], m[14][7])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][4]), add(mul(v[1], m[15][5]), add(mul(v[2], m[15][6]), mul(v[3], m[15][7])))));
+    // Pass 3: matrix columns 8-11, the entries with d3 but not d2.
+    v[0] = psi[I + d3];
+    v[1] = psi[I + d0 + d3];
+    v[2] = psi[I + d1 + d3];
+    v[3] = psi[I + d0 + d1 + d3];
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][8]), add(mul(v[1], m[0][9]), add(mul(v[2], m[0][10]), mul(v[3], m[0][11])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][8]), add(mul(v[1], m[1][9]), add(mul(v[2], m[1][10]), mul(v[3], m[1][11])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][8]), add(mul(v[1], m[2][9]), add(mul(v[2], m[2][10]), mul(v[3], m[2][11])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][8]), add(mul(v[1], m[3][9]), add(mul(v[2], m[3][10]), mul(v[3], m[3][11])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][8]), add(mul(v[1], m[4][9]), add(mul(v[2], m[4][10]), mul(v[3], m[4][11])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][8]), add(mul(v[1], m[5][9]), add(mul(v[2], m[5][10]), mul(v[3], m[5][11])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][8]), add(mul(v[1], m[6][9]), add(mul(v[2], m[6][10]), mul(v[3], m[6][11])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][8]), add(mul(v[1], m[7][9]), add(mul(v[2], m[7][10]), mul(v[3], m[7][11])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][8]), add(mul(v[1], m[8][9]), add(mul(v[2], m[8][10]), mul(v[3], m[8][11])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][8]), add(mul(v[1], m[9][9]), add(mul(v[2], m[9][10]), mul(v[3], m[9][11])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][8]), add(mul(v[1], m[10][9]), add(mul(v[2], m[10][10]), mul(v[3], m[10][11])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][8]), add(mul(v[1], m[11][9]), add(mul(v[2], m[11][10]), mul(v[3], m[11][11])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][8]), add(mul(v[1], m[12][9]), add(mul(v[2], m[12][10]), mul(v[3], m[12][11])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][8]), add(mul(v[1], m[13][9]), add(mul(v[2], m[13][10]), mul(v[3], m[13][11])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][8]), add(mul(v[1], m[14][9]), add(mul(v[2], m[14][10]), mul(v[3], m[14][11])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][8]), add(mul(v[1], m[15][9]), add(mul(v[2], m[15][10]), mul(v[3], m[15][11])))));
+    // Pass 4: matrix columns 12-15, the entries with both d2 and d3.
+    v[0] = psi[I + d2 + d3];
+    v[1] = psi[I + d0 + d2 + d3];
+    v[2] = psi[I + d1 + d2 + d3];
+    v[3] = psi[I + d0 + d1 + d2 + d3];
+    // Every input has been read by now, so the finished sums can overwrite psi.
+    psi[I] = (add(tmp[0], add(mul(v[0], m[0][12]), add(mul(v[1], m[0][13]), add(mul(v[2], m[0][14]), mul(v[3], m[0][15]))))));
+    psi[I + d0] = (add(tmp[1], add(mul(v[0], m[1][12]), add(mul(v[1], m[1][13]), add(mul(v[2], m[1][14]), mul(v[3], m[1][15]))))));
+    psi[I + d1] = (add(tmp[2], add(mul(v[0], m[2][12]), add(mul(v[1], m[2][13]), add(mul(v[2], m[2][14]), mul(v[3], m[2][15]))))));
+    psi[I + d0 + d1] = (add(tmp[3], add(mul(v[0], m[3][12]), add(mul(v[1], m[3][13]), add(mul(v[2], m[3][14]), mul(v[3], m[3][15]))))));
+    psi[I + d2] = (add(tmp[4], add(mul(v[0], m[4][12]), add(mul(v[1], m[4][13]), add(mul(v[2], m[4][14]), mul(v[3], m[4][15]))))));
+    psi[I + d0 + d2] = (add(tmp[5], add(mul(v[0], m[5][12]), add(mul(v[1], m[5][13]), add(mul(v[2], m[5][14]), mul(v[3], m[5][15]))))));
+    psi[I + d1 + d2] = (add(tmp[6], add(mul(v[0], m[6][12]), add(mul(v[1], m[6][13]), add(mul(v[2], m[6][14]), mul(v[3], m[6][15]))))));
+    psi[I + d0 + d1 + d2] = (add(tmp[7], add(mul(v[0], m[7][12]), add(mul(v[1], m[7][13]), add(mul(v[2], m[7][14]), mul(v[3], m[7][15]))))));
+    psi[I + d3] = (add(tmp[8], add(mul(v[0], m[8][12]), add(mul(v[1], m[8][13]), add(mul(v[2], m[8][14]), mul(v[3], m[8][15]))))));
+    psi[I + d0 + d3] = (add(tmp[9], add(mul(v[0], m[9][12]), add(mul(v[1], m[9][13]), add(mul(v[2], m[9][14]), mul(v[3], m[9][15]))))));
+    psi[I + d1 + d3] = (add(tmp[10], add(mul(v[0], m[10][12]), add(mul(v[1], m[10][13]), add(mul(v[2], m[10][14]), mul(v[3], m[10][15]))))));
+    psi[I + d0 + d1 + d3] = (add(tmp[11], add(mul(v[0], m[11][12]), add(mul(v[1], m[11][13]), add(mul(v[2], m[11][14]), mul(v[3], m[11][15]))))));
+    psi[I + d2 + d3] = (add(tmp[12], add(mul(v[0], m[12][12]), add(mul(v[1], m[12][13]), add(mul(v[2], m[12][14]), mul(v[3], m[12][15]))))));
+    psi[I + d0 + d2 + d3] = (add(tmp[13], add(mul(v[0], m[13][12]), add(mul(v[1], m[13][13]), add(mul(v[2], m[13][14]), mul(v[3], m[13][15]))))));
+    psi[I + d1 + d2 + d3] = (add(tmp[14], add(mul(v[0], m[14][12]), add(mul(v[1], m[14][13]), add(mul(v[2], m[14][14]), mul(v[3], m[14][15]))))));
+    psi[I + d0 + d1 + d2 + d3] = (add(tmp[15], add(mul(v[0], m[15][12]), add(mul(v[1], m[15][13]), add(mul(v[2], m[15][14]), mul(v[3], m[15][15]))))));
+
+}
+
+// Applies the 16x16 matrix m in place to every group of psi entries whose indices differ only in
+// bits id0..id3; a non-zero ctrlmask limits this to groups whose base index has every ctrlmask bit.
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = static_cast<std::size_t>(1) << id0;  // 1UL is only 32-bit on LLP64 targets
+    std::size_t d1 = static_cast<std::size_t>(1) << id1;
+    std::size_t d2 = static_cast<std::size_t>(1) << id2;
+    std::size_t d3 = static_cast<std::size_t>(1) << id3;
+    std::size_t dsorted[] = {d0 , d1, d2, d3};
+    std::sort(dsorted, dsorted + 4, std::greater<std::size_t>());
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+                            kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m);
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+                            if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/nointrin/kernel5.hpp b/third_party/cppsim/nointrin/kernel5.hpp
new file mode 100644
index 00000000..a3e47f10
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernel5.hpp
@@ -0,0 +1,373 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m)
+{
+    std::complex<double> v[4];
+    v[0] = psi[I];
+    v[1] = psi[I + d0];
+    v[2] = psi[I + d1];
+    v[3] = psi[I + d0 + d1];
+
+    std::complex<double> tmp[32];
+
+    tmp[0] = add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3]))));
+    tmp[1] = add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3]))));
+    tmp[2] = add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3]))));
+    tmp[3] = add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3]))));
+    tmp[4] = add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3]))));
+    tmp[5] = add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3]))));
+    tmp[6] = add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3]))));
+    tmp[7] = add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))));
+    tmp[8] = add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3]))));
+    tmp[9] = add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3]))));
+    tmp[10] = add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3]))));
+    tmp[11] = add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3]))));
+    tmp[12] = add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3]))));
+    tmp[13] = add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3]))));
+    tmp[14] = add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3]))));
+    tmp[15] = add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3]))));
+    tmp[16] = add(mul(v[0], m[16][0]), add(mul(v[1], m[16][1]), add(mul(v[2], m[16][2]), mul(v[3], m[16][3]))));
+    tmp[17] = add(mul(v[0], m[17][0]), add(mul(v[1], m[17][1]), add(mul(v[2], m[17][2]), mul(v[3], m[17][3]))));
+    tmp[18] = add(mul(v[0], m[18][0]), add(mul(v[1], m[18][1]), add(mul(v[2], m[18][2]), mul(v[3], m[18][3]))));
+    tmp[19] = add(mul(v[0], m[19][0]), add(mul(v[1], m[19][1]), add(mul(v[2], m[19][2]), mul(v[3], m[19][3]))));
+    tmp[20] = add(mul(v[0], m[20][0]), add(mul(v[1], m[20][1]), add(mul(v[2], m[20][2]), mul(v[3], m[20][3]))));
+    tmp[21] = add(mul(v[0], m[21][0]), add(mul(v[1], m[21][1]), add(mul(v[2], m[21][2]), mul(v[3], m[21][3]))));
+    tmp[22] = add(mul(v[0], m[22][0]), add(mul(v[1], m[22][1]), add(mul(v[2], m[22][2]), mul(v[3], m[22][3]))));
+    tmp[23] = add(mul(v[0], m[23][0]), add(mul(v[1], m[23][1]), add(mul(v[2], m[23][2]), mul(v[3], m[23][3]))));
+    tmp[24] = add(mul(v[0], m[24][0]), add(mul(v[1], m[24][1]), add(mul(v[2], m[24][2]), mul(v[3], m[24][3]))));
+    tmp[25] = add(mul(v[0], m[25][0]), add(mul(v[1], m[25][1]), add(mul(v[2], m[25][2]), mul(v[3], m[25][3]))));
+    tmp[26] = add(mul(v[0], m[26][0]), add(mul(v[1], m[26][1]), add(mul(v[2], m[26][2]), mul(v[3], m[26][3]))));
+    tmp[27] = add(mul(v[0], m[27][0]), add(mul(v[1], m[27][1]), add(mul(v[2], m[27][2]), mul(v[3], m[27][3]))));
+    tmp[28] = add(mul(v[0], m[28][0]), add(mul(v[1], m[28][1]), add(mul(v[2], m[28][2]), mul(v[3], m[28][3]))));
+    tmp[29] = add(mul(v[0], m[29][0]), add(mul(v[1], m[29][1]), add(mul(v[2], m[29][2]), mul(v[3], m[29][3]))));
+    tmp[30] = add(mul(v[0], m[30][0]), add(mul(v[1], m[30][1]), add(mul(v[2], m[30][2]), mul(v[3], m[30][3]))));
+    tmp[31] = add(mul(v[0], m[31][0]), add(mul(v[1], m[31][1]), add(mul(v[2], m[31][2]), mul(v[3], m[31][3]))));
+
+    v[0] = psi[I + d2];
+    v[1] = psi[I + d0 + d2];
+    v[2] = psi[I + d1 + d2];
+    v[3] = psi[I + d0 + d1 + d2];
+
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][4]), add(mul(v[1], m[0][5]), add(mul(v[2], m[0][6]), mul(v[3], m[0][7])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][4]), add(mul(v[1], m[1][5]), add(mul(v[2], m[1][6]), mul(v[3], m[1][7])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][4]), add(mul(v[1], m[2][5]), add(mul(v[2], m[2][6]), mul(v[3], m[2][7])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][4]), add(mul(v[1], m[3][5]), add(mul(v[2], m[3][6]), mul(v[3], m[3][7])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][4]), add(mul(v[1], m[4][5]), add(mul(v[2], m[4][6]), mul(v[3], m[4][7])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][4]), add(mul(v[1], m[5][5]), add(mul(v[2], m[5][6]), mul(v[3], m[5][7])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][4]), add(mul(v[1], m[6][5]), add(mul(v[2], m[6][6]), mul(v[3], m[6][7])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][4]), add(mul(v[1], m[7][5]), add(mul(v[2], m[7][6]), mul(v[3], m[7][7])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][4]), add(mul(v[1], m[8][5]), add(mul(v[2], m[8][6]), mul(v[3], m[8][7])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][4]), add(mul(v[1], m[9][5]), add(mul(v[2], m[9][6]), mul(v[3], m[9][7])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][4]), add(mul(v[1], m[10][5]), add(mul(v[2], m[10][6]), mul(v[3], m[10][7])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][4]), add(mul(v[1], m[11][5]), add(mul(v[2], m[11][6]), mul(v[3], m[11][7])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][4]), add(mul(v[1], m[12][5]), add(mul(v[2], m[12][6]), mul(v[3], m[12][7])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][4]), add(mul(v[1], m[13][5]), add(mul(v[2], m[13][6]), mul(v[3], m[13][7])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][4]), add(mul(v[1], m[14][5]), add(mul(v[2], m[14][6]), mul(v[3], m[14][7])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][4]), add(mul(v[1], m[15][5]), add(mul(v[2], m[15][6]), mul(v[3], m[15][7])))));
+    tmp[16] = add(tmp[16], add(mul(v[0], m[16][4]), add(mul(v[1], m[16][5]), add(mul(v[2], m[16][6]), mul(v[3], m[16][7])))));
+    tmp[17] = add(tmp[17], add(mul(v[0], m[17][4]), add(mul(v[1], m[17][5]), add(mul(v[2], m[17][6]), mul(v[3], m[17][7])))));
+    tmp[18] = add(tmp[18], add(mul(v[0], m[18][4]), add(mul(v[1], m[18][5]), add(mul(v[2], m[18][6]), mul(v[3], m[18][7])))));
+    tmp[19] = add(tmp[19], add(mul(v[0], m[19][4]), add(mul(v[1], m[19][5]), add(mul(v[2], m[19][6]), mul(v[3], m[19][7])))));
+    tmp[20] = add(tmp[20], add(mul(v[0], m[20][4]), add(mul(v[1], m[20][5]), add(mul(v[2], m[20][6]), mul(v[3], m[20][7])))));
+    tmp[21] = add(tmp[21], add(mul(v[0], m[21][4]), add(mul(v[1], m[21][5]), add(mul(v[2], m[21][6]), mul(v[3], m[21][7])))));
+    tmp[22] = add(tmp[22], add(mul(v[0], m[22][4]), add(mul(v[1], m[22][5]), add(mul(v[2], m[22][6]), mul(v[3], m[22][7])))));
+    tmp[23] = add(tmp[23], add(mul(v[0], m[23][4]), add(mul(v[1], m[23][5]), add(mul(v[2], m[23][6]), mul(v[3], m[23][7])))));
+    tmp[24] = add(tmp[24], add(mul(v[0], m[24][4]), add(mul(v[1], m[24][5]), add(mul(v[2], m[24][6]), mul(v[3], m[24][7])))));
+    tmp[25] = add(tmp[25], add(mul(v[0], m[25][4]), add(mul(v[1], m[25][5]), add(mul(v[2], m[25][6]), mul(v[3], m[25][7])))));
+    tmp[26] = add(tmp[26], add(mul(v[0], m[26][4]), add(mul(v[1], m[26][5]), add(mul(v[2], m[26][6]), mul(v[3], m[26][7])))));
+    tmp[27] = add(tmp[27], add(mul(v[0], m[27][4]), add(mul(v[1], m[27][5]), add(mul(v[2], m[27][6]), mul(v[3], m[27][7])))));
+    tmp[28] = add(tmp[28], add(mul(v[0], m[28][4]), add(mul(v[1], m[28][5]), add(mul(v[2], m[28][6]), mul(v[3], m[28][7])))));
+    tmp[29] = add(tmp[29], add(mul(v[0], m[29][4]), add(mul(v[1], m[29][5]), add(mul(v[2], m[29][6]), mul(v[3], m[29][7])))));
+    tmp[30] = add(tmp[30], add(mul(v[0], m[30][4]), add(mul(v[1], m[30][5]), add(mul(v[2], m[30][6]), mul(v[3], m[30][7])))));
+    tmp[31] = add(tmp[31], add(mul(v[0], m[31][4]), add(mul(v[1], m[31][5]), add(mul(v[2], m[31][6]), mul(v[3], m[31][7])))));
+
+    v[0] = psi[I + d3];
+    v[1] = psi[I + d0 + d3];
+    v[2] = psi[I + d1 + d3];
+    v[3] = psi[I + d0 + d1 + d3];
+
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][8]), add(mul(v[1], m[0][9]), add(mul(v[2], m[0][10]), mul(v[3], m[0][11])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][8]), add(mul(v[1], m[1][9]), add(mul(v[2], m[1][10]), mul(v[3], m[1][11])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][8]), add(mul(v[1], m[2][9]), add(mul(v[2], m[2][10]), mul(v[3], m[2][11])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][8]), add(mul(v[1], m[3][9]), add(mul(v[2], m[3][10]), mul(v[3], m[3][11])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][8]), add(mul(v[1], m[4][9]), add(mul(v[2], m[4][10]), mul(v[3], m[4][11])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][8]), add(mul(v[1], m[5][9]), add(mul(v[2], m[5][10]), mul(v[3], m[5][11])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][8]), add(mul(v[1], m[6][9]), add(mul(v[2], m[6][10]), mul(v[3], m[6][11])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][8]), add(mul(v[1], m[7][9]), add(mul(v[2], m[7][10]), mul(v[3], m[7][11])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][8]), add(mul(v[1], m[8][9]), add(mul(v[2], m[8][10]), mul(v[3], m[8][11])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][8]), add(mul(v[1], m[9][9]), add(mul(v[2], m[9][10]), mul(v[3], m[9][11])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][8]), add(mul(v[1], m[10][9]), add(mul(v[2], m[10][10]), mul(v[3], m[10][11])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][8]), add(mul(v[1], m[11][9]), add(mul(v[2], m[11][10]), mul(v[3], m[11][11])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][8]), add(mul(v[1], m[12][9]), add(mul(v[2], m[12][10]), mul(v[3], m[12][11])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][8]), add(mul(v[1], m[13][9]), add(mul(v[2], m[13][10]), mul(v[3], m[13][11])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][8]), add(mul(v[1], m[14][9]), add(mul(v[2], m[14][10]), mul(v[3], m[14][11])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][8]), add(mul(v[1], m[15][9]), add(mul(v[2], m[15][10]), mul(v[3], m[15][11])))));
+    tmp[16] = add(tmp[16], add(mul(v[0], m[16][8]), add(mul(v[1], m[16][9]), add(mul(v[2], m[16][10]), mul(v[3], m[16][11])))));
+    tmp[17] = add(tmp[17], add(mul(v[0], m[17][8]), add(mul(v[1], m[17][9]), add(mul(v[2], m[17][10]), mul(v[3], m[17][11])))));
+    tmp[18] = add(tmp[18], add(mul(v[0], m[18][8]), add(mul(v[1], m[18][9]), add(mul(v[2], m[18][10]), mul(v[3], m[18][11])))));
+    tmp[19] = add(tmp[19], add(mul(v[0], m[19][8]), add(mul(v[1], m[19][9]), add(mul(v[2], m[19][10]), mul(v[3], m[19][11])))));
+    tmp[20] = add(tmp[20], add(mul(v[0], m[20][8]), add(mul(v[1], m[20][9]), add(mul(v[2], m[20][10]), mul(v[3], m[20][11])))));
+    tmp[21] = add(tmp[21], add(mul(v[0], m[21][8]), add(mul(v[1], m[21][9]), add(mul(v[2], m[21][10]), mul(v[3], m[21][11])))));
+    tmp[22] = add(tmp[22], add(mul(v[0], m[22][8]), add(mul(v[1], m[22][9]), add(mul(v[2], m[22][10]), mul(v[3], m[22][11])))));
+    tmp[23] = add(tmp[23], add(mul(v[0], m[23][8]), add(mul(v[1], m[23][9]), add(mul(v[2], m[23][10]), mul(v[3], m[23][11])))));
+    tmp[24] = add(tmp[24], add(mul(v[0], m[24][8]), add(mul(v[1], m[24][9]), add(mul(v[2], m[24][10]), mul(v[3], m[24][11])))));
+    tmp[25] = add(tmp[25], add(mul(v[0], m[25][8]), add(mul(v[1], m[25][9]), add(mul(v[2], m[25][10]), mul(v[3], m[25][11])))));
+    tmp[26] = add(tmp[26], add(mul(v[0], m[26][8]), add(mul(v[1], m[26][9]), add(mul(v[2], m[26][10]), mul(v[3], m[26][11])))));
+    tmp[27] = add(tmp[27], add(mul(v[0], m[27][8]), add(mul(v[1], m[27][9]), add(mul(v[2], m[27][10]), mul(v[3], m[27][11])))));
+    tmp[28] = add(tmp[28], add(mul(v[0], m[28][8]), add(mul(v[1], m[28][9]), add(mul(v[2], m[28][10]), mul(v[3], m[28][11])))));
+    tmp[29] = add(tmp[29], add(mul(v[0], m[29][8]), add(mul(v[1], m[29][9]), add(mul(v[2], m[29][10]), mul(v[3], m[29][11])))));
+    tmp[30] = add(tmp[30], add(mul(v[0], m[30][8]), add(mul(v[1], m[30][9]), add(mul(v[2], m[30][10]), mul(v[3], m[30][11])))));
+    tmp[31] = add(tmp[31], add(mul(v[0], m[31][8]), add(mul(v[1], m[31][9]), add(mul(v[2], m[31][10]), mul(v[3], m[31][11])))));
+
+    v[0] = psi[I + d2 + d3];
+    v[1] = psi[I + d0 + d2 + d3];
+    v[2] = psi[I + d1 + d2 + d3];
+    v[3] = psi[I + d0 + d1 + d2 + d3];
+
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][12]), add(mul(v[1], m[0][13]), add(mul(v[2], m[0][14]), mul(v[3], m[0][15])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][12]), add(mul(v[1], m[1][13]), add(mul(v[2], m[1][14]), mul(v[3], m[1][15])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][12]), add(mul(v[1], m[2][13]), add(mul(v[2], m[2][14]), mul(v[3], m[2][15])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][12]), add(mul(v[1], m[3][13]), add(mul(v[2], m[3][14]), mul(v[3], m[3][15])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][12]), add(mul(v[1], m[4][13]), add(mul(v[2], m[4][14]), mul(v[3], m[4][15])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][12]), add(mul(v[1], m[5][13]), add(mul(v[2], m[5][14]), mul(v[3], m[5][15])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][12]), add(mul(v[1], m[6][13]), add(mul(v[2], m[6][14]), mul(v[3], m[6][15])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][12]), add(mul(v[1], m[7][13]), add(mul(v[2], m[7][14]), mul(v[3], m[7][15])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][12]), add(mul(v[1], m[8][13]), add(mul(v[2], m[8][14]), mul(v[3], m[8][15])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][12]), add(mul(v[1], m[9][13]), add(mul(v[2], m[9][14]), mul(v[3], m[9][15])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][12]), add(mul(v[1], m[10][13]), add(mul(v[2], m[10][14]), mul(v[3], m[10][15])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][12]), add(mul(v[1], m[11][13]), add(mul(v[2], m[11][14]), mul(v[3], m[11][15])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][12]), add(mul(v[1], m[12][13]), add(mul(v[2], m[12][14]), mul(v[3], m[12][15])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][12]), add(mul(v[1], m[13][13]), add(mul(v[2], m[13][14]), mul(v[3], m[13][15])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][12]), add(mul(v[1], m[14][13]), add(mul(v[2], m[14][14]), mul(v[3], m[14][15])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][12]), add(mul(v[1], m[15][13]), add(mul(v[2], m[15][14]), mul(v[3], m[15][15])))));
+    tmp[16] = add(tmp[16], add(mul(v[0], m[16][12]), add(mul(v[1], m[16][13]), add(mul(v[2], m[16][14]), mul(v[3], m[16][15])))));
+    tmp[17] = add(tmp[17], add(mul(v[0], m[17][12]), add(mul(v[1], m[17][13]), add(mul(v[2], m[17][14]), mul(v[3], m[17][15])))));
+    tmp[18] = add(tmp[18], add(mul(v[0], m[18][12]), add(mul(v[1], m[18][13]), add(mul(v[2], m[18][14]), mul(v[3], m[18][15])))));
+    tmp[19] = add(tmp[19], add(mul(v[0], m[19][12]), add(mul(v[1], m[19][13]), add(mul(v[2], m[19][14]), mul(v[3], m[19][15])))));
+    tmp[20] = add(tmp[20], add(mul(v[0], m[20][12]), add(mul(v[1], m[20][13]), add(mul(v[2], m[20][14]), mul(v[3], m[20][15])))));
+    tmp[21] = add(tmp[21], add(mul(v[0], m[21][12]), add(mul(v[1], m[21][13]), add(mul(v[2], m[21][14]), mul(v[3], m[21][15])))));
+    tmp[22] = add(tmp[22], add(mul(v[0], m[22][12]), add(mul(v[1], m[22][13]), add(mul(v[2], m[22][14]), mul(v[3], m[22][15])))));
+    tmp[23] = add(tmp[23], add(mul(v[0], m[23][12]), add(mul(v[1], m[23][13]), add(mul(v[2], m[23][14]), mul(v[3], m[23][15])))));
+    tmp[24] = add(tmp[24], add(mul(v[0], m[24][12]), add(mul(v[1], m[24][13]), add(mul(v[2], m[24][14]), mul(v[3], m[24][15])))));
+    tmp[25] = add(tmp[25], add(mul(v[0], m[25][12]), add(mul(v[1], m[25][13]), add(mul(v[2], m[25][14]), mul(v[3], m[25][15])))));
+    tmp[26] = add(tmp[26], add(mul(v[0], m[26][12]), add(mul(v[1], m[26][13]), add(mul(v[2], m[26][14]), mul(v[3], m[26][15])))));
+    tmp[27] = add(tmp[27], add(mul(v[0], m[27][12]), add(mul(v[1], m[27][13]), add(mul(v[2], m[27][14]), mul(v[3], m[27][15])))));
+    tmp[28] = add(tmp[28], add(mul(v[0], m[28][12]), add(mul(v[1], m[28][13]), add(mul(v[2], m[28][14]), mul(v[3], m[28][15])))));
+    tmp[29] = add(tmp[29], add(mul(v[0], m[29][12]), add(mul(v[1], m[29][13]), add(mul(v[2], m[29][14]), mul(v[3], m[29][15])))));
+    tmp[30] = add(tmp[30], add(mul(v[0], m[30][12]), add(mul(v[1], m[30][13]), add(mul(v[2], m[30][14]), mul(v[3], m[30][15])))));
+    tmp[31] = add(tmp[31], add(mul(v[0], m[31][12]), add(mul(v[1], m[31][13]), add(mul(v[2], m[31][14]), mul(v[3], m[31][15])))));
+
+    v[0] = psi[I + d4];
+    v[1] = psi[I + d0 + d4];
+    v[2] = psi[I + d1 + d4];
+    v[3] = psi[I + d0 + d1 + d4];
+
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][16]), add(mul(v[1], m[0][17]), add(mul(v[2], m[0][18]), mul(v[3], m[0][19])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][16]), add(mul(v[1], m[1][17]), add(mul(v[2], m[1][18]), mul(v[3], m[1][19])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][16]), add(mul(v[1], m[2][17]), add(mul(v[2], m[2][18]), mul(v[3], m[2][19])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][16]), add(mul(v[1], m[3][17]), add(mul(v[2], m[3][18]), mul(v[3], m[3][19])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][16]), add(mul(v[1], m[4][17]), add(mul(v[2], m[4][18]), mul(v[3], m[4][19])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][16]), add(mul(v[1], m[5][17]), add(mul(v[2], m[5][18]), mul(v[3], m[5][19])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][16]), add(mul(v[1], m[6][17]), add(mul(v[2], m[6][18]), mul(v[3], m[6][19])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][16]), add(mul(v[1], m[7][17]), add(mul(v[2], m[7][18]), mul(v[3], m[7][19])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][16]), add(mul(v[1], m[8][17]), add(mul(v[2], m[8][18]), mul(v[3], m[8][19])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][16]), add(mul(v[1], m[9][17]), add(mul(v[2], m[9][18]), mul(v[3], m[9][19])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][16]), add(mul(v[1], m[10][17]), add(mul(v[2], m[10][18]), mul(v[3], m[10][19])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][16]), add(mul(v[1], m[11][17]), add(mul(v[2], m[11][18]), mul(v[3], m[11][19])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][16]), add(mul(v[1], m[12][17]), add(mul(v[2], m[12][18]), mul(v[3], m[12][19])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][16]), add(mul(v[1], m[13][17]), add(mul(v[2], m[13][18]), mul(v[3], m[13][19])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][16]), add(mul(v[1], m[14][17]), add(mul(v[2], m[14][18]), mul(v[3], m[14][19])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][16]), add(mul(v[1], m[15][17]), add(mul(v[2], m[15][18]), mul(v[3], m[15][19])))));
+    tmp[16] = add(tmp[16], add(mul(v[0], m[16][16]), add(mul(v[1], m[16][17]), add(mul(v[2], m[16][18]), mul(v[3], m[16][19])))));
+    tmp[17] = add(tmp[17], add(mul(v[0], m[17][16]), add(mul(v[1], m[17][17]), add(mul(v[2], m[17][18]), mul(v[3], m[17][19])))));
+    tmp[18] = add(tmp[18], add(mul(v[0], m[18][16]), add(mul(v[1], m[18][17]), add(mul(v[2], m[18][18]), mul(v[3], m[18][19])))));
+    tmp[19] = add(tmp[19], add(mul(v[0], m[19][16]), add(mul(v[1], m[19][17]), add(mul(v[2], m[19][18]), mul(v[3], m[19][19])))));
+    tmp[20] = add(tmp[20], add(mul(v[0], m[20][16]), add(mul(v[1], m[20][17]), add(mul(v[2], m[20][18]), mul(v[3], m[20][19])))));
+    tmp[21] = add(tmp[21], add(mul(v[0], m[21][16]), add(mul(v[1], m[21][17]), add(mul(v[2], m[21][18]), mul(v[3], m[21][19])))));
+    tmp[22] = add(tmp[22], add(mul(v[0], m[22][16]), add(mul(v[1], m[22][17]), add(mul(v[2], m[22][18]), mul(v[3], m[22][19])))));
+    tmp[23] = add(tmp[23], add(mul(v[0], m[23][16]), add(mul(v[1], m[23][17]), add(mul(v[2], m[23][18]), mul(v[3], m[23][19])))));
+    tmp[24] = add(tmp[24], add(mul(v[0], m[24][16]), add(mul(v[1], m[24][17]), add(mul(v[2], m[24][18]), mul(v[3], m[24][19])))));
+    tmp[25] = add(tmp[25], add(mul(v[0], m[25][16]), add(mul(v[1], m[25][17]), add(mul(v[2], m[25][18]), mul(v[3], m[25][19])))));
+    tmp[26] = add(tmp[26], add(mul(v[0], m[26][16]), add(mul(v[1], m[26][17]), add(mul(v[2], m[26][18]), mul(v[3], m[26][19])))));
+    tmp[27] = add(tmp[27], add(mul(v[0], m[27][16]), add(mul(v[1], m[27][17]), add(mul(v[2], m[27][18]), mul(v[3], m[27][19])))));
+    tmp[28] = add(tmp[28], add(mul(v[0], m[28][16]), add(mul(v[1], m[28][17]), add(mul(v[2], m[28][18]), mul(v[3], m[28][19])))));
+    tmp[29] = add(tmp[29], add(mul(v[0], m[29][16]), add(mul(v[1], m[29][17]), add(mul(v[2], m[29][18]), mul(v[3], m[29][19])))));
+    tmp[30] = add(tmp[30], add(mul(v[0], m[30][16]), add(mul(v[1], m[30][17]), add(mul(v[2], m[30][18]), mul(v[3], m[30][19])))));
+    tmp[31] = add(tmp[31], add(mul(v[0], m[31][16]), add(mul(v[1], m[31][17]), add(mul(v[2], m[31][18]), mul(v[3], m[31][19])))));
+
+    v[0] = psi[I + d2 + d4];
+    v[1] = psi[I + d0 + d2 + d4];
+    v[2] = psi[I + d1 + d2 + d4];
+    v[3] = psi[I + d0 + d1 + d2 + d4];
+
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][20]), add(mul(v[1], m[0][21]), add(mul(v[2], m[0][22]), mul(v[3], m[0][23])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][20]), add(mul(v[1], m[1][21]), add(mul(v[2], m[1][22]), mul(v[3], m[1][23])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][20]), add(mul(v[1], m[2][21]), add(mul(v[2], m[2][22]), mul(v[3], m[2][23])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][20]), add(mul(v[1], m[3][21]), add(mul(v[2], m[3][22]), mul(v[3], m[3][23])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][20]), add(mul(v[1], m[4][21]), add(mul(v[2], m[4][22]), mul(v[3], m[4][23])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][20]), add(mul(v[1], m[5][21]), add(mul(v[2], m[5][22]), mul(v[3], m[5][23])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][20]), add(mul(v[1], m[6][21]), add(mul(v[2], m[6][22]), mul(v[3], m[6][23])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][20]), add(mul(v[1], m[7][21]), add(mul(v[2], m[7][22]), mul(v[3], m[7][23])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][20]), add(mul(v[1], m[8][21]), add(mul(v[2], m[8][22]), mul(v[3], m[8][23])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][20]), add(mul(v[1], m[9][21]), add(mul(v[2], m[9][22]), mul(v[3], m[9][23])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][20]), add(mul(v[1], m[10][21]), add(mul(v[2], m[10][22]), mul(v[3], m[10][23])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][20]), add(mul(v[1], m[11][21]), add(mul(v[2], m[11][22]), mul(v[3], m[11][23])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][20]), add(mul(v[1], m[12][21]), add(mul(v[2], m[12][22]), mul(v[3], m[12][23])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][20]), add(mul(v[1], m[13][21]), add(mul(v[2], m[13][22]), mul(v[3], m[13][23])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][20]), add(mul(v[1], m[14][21]), add(mul(v[2], m[14][22]), mul(v[3], m[14][23])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][20]), add(mul(v[1], m[15][21]), add(mul(v[2], m[15][22]), mul(v[3], m[15][23])))));
+    tmp[16] = add(tmp[16], add(mul(v[0], m[16][20]), add(mul(v[1], m[16][21]), add(mul(v[2], m[16][22]), mul(v[3], m[16][23])))));
+    tmp[17] = add(tmp[17], add(mul(v[0], m[17][20]), add(mul(v[1], m[17][21]), add(mul(v[2], m[17][22]), mul(v[3], m[17][23])))));
+    tmp[18] = add(tmp[18], add(mul(v[0], m[18][20]), add(mul(v[1], m[18][21]), add(mul(v[2], m[18][22]), mul(v[3], m[18][23])))));
+    tmp[19] = add(tmp[19], add(mul(v[0], m[19][20]), add(mul(v[1], m[19][21]), add(mul(v[2], m[19][22]), mul(v[3], m[19][23])))));
+    tmp[20] = add(tmp[20], add(mul(v[0], m[20][20]), add(mul(v[1], m[20][21]), add(mul(v[2], m[20][22]), mul(v[3], m[20][23])))));
+    tmp[21] = add(tmp[21], add(mul(v[0], m[21][20]), add(mul(v[1], m[21][21]), add(mul(v[2], m[21][22]), mul(v[3], m[21][23])))));
+    tmp[22] = add(tmp[22], add(mul(v[0], m[22][20]), add(mul(v[1], m[22][21]), add(mul(v[2], m[22][22]), mul(v[3], m[22][23])))));
+    tmp[23] = add(tmp[23], add(mul(v[0], m[23][20]), add(mul(v[1], m[23][21]), add(mul(v[2], m[23][22]), mul(v[3], m[23][23])))));
+    tmp[24] = add(tmp[24], add(mul(v[0], m[24][20]), add(mul(v[1], m[24][21]), add(mul(v[2], m[24][22]), mul(v[3], m[24][23])))));
+    tmp[25] = add(tmp[25], add(mul(v[0], m[25][20]), add(mul(v[1], m[25][21]), add(mul(v[2], m[25][22]), mul(v[3], m[25][23])))));
+    tmp[26] = add(tmp[26], add(mul(v[0], m[26][20]), add(mul(v[1], m[26][21]), add(mul(v[2], m[26][22]), mul(v[3], m[26][23])))));
+    tmp[27] = add(tmp[27], add(mul(v[0], m[27][20]), add(mul(v[1], m[27][21]), add(mul(v[2], m[27][22]), mul(v[3], m[27][23])))));
+    tmp[28] = add(tmp[28], add(mul(v[0], m[28][20]), add(mul(v[1], m[28][21]), add(mul(v[2], m[28][22]), mul(v[3], m[28][23])))));
+    tmp[29] = add(tmp[29], add(mul(v[0], m[29][20]), add(mul(v[1], m[29][21]), add(mul(v[2], m[29][22]), mul(v[3], m[29][23])))));
+    tmp[30] = add(tmp[30], add(mul(v[0], m[30][20]), add(mul(v[1], m[30][21]), add(mul(v[2], m[30][22]), mul(v[3], m[30][23])))));
+    tmp[31] = add(tmp[31], add(mul(v[0], m[31][20]), add(mul(v[1], m[31][21]), add(mul(v[2], m[31][22]), mul(v[3], m[31][23])))));
+
+    v[0] = psi[I + d3 + d4];
+    v[1] = psi[I + d0 + d3 + d4];
+    v[2] = psi[I + d1 + d3 + d4];
+    v[3] = psi[I + d0 + d1 + d3 + d4];
+
+    tmp[0] = add(tmp[0], add(mul(v[0], m[0][24]), add(mul(v[1], m[0][25]), add(mul(v[2], m[0][26]), mul(v[3], m[0][27])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[1][24]), add(mul(v[1], m[1][25]), add(mul(v[2], m[1][26]), mul(v[3], m[1][27])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[2][24]), add(mul(v[1], m[2][25]), add(mul(v[2], m[2][26]), mul(v[3], m[2][27])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[3][24]), add(mul(v[1], m[3][25]), add(mul(v[2], m[3][26]), mul(v[3], m[3][27])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[4][24]), add(mul(v[1], m[4][25]), add(mul(v[2], m[4][26]), mul(v[3], m[4][27])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[5][24]), add(mul(v[1], m[5][25]), add(mul(v[2], m[5][26]), mul(v[3], m[5][27])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[6][24]), add(mul(v[1], m[6][25]), add(mul(v[2], m[6][26]), mul(v[3], m[6][27])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[7][24]), add(mul(v[1], m[7][25]), add(mul(v[2], m[7][26]), mul(v[3], m[7][27])))));
+    tmp[8] = add(tmp[8], add(mul(v[0], m[8][24]), add(mul(v[1], m[8][25]), add(mul(v[2], m[8][26]), mul(v[3], m[8][27])))));
+    tmp[9] = add(tmp[9], add(mul(v[0], m[9][24]), add(mul(v[1], m[9][25]), add(mul(v[2], m[9][26]), mul(v[3], m[9][27])))));
+    tmp[10] = add(tmp[10], add(mul(v[0], m[10][24]), add(mul(v[1], m[10][25]), add(mul(v[2], m[10][26]), mul(v[3], m[10][27])))));
+    tmp[11] = add(tmp[11], add(mul(v[0], m[11][24]), add(mul(v[1], m[11][25]), add(mul(v[2], m[11][26]), mul(v[3], m[11][27])))));
+    tmp[12] = add(tmp[12], add(mul(v[0], m[12][24]), add(mul(v[1], m[12][25]), add(mul(v[2], m[12][26]), mul(v[3], m[12][27])))));
+    tmp[13] = add(tmp[13], add(mul(v[0], m[13][24]), add(mul(v[1], m[13][25]), add(mul(v[2], m[13][26]), mul(v[3], m[13][27])))));
+    tmp[14] = add(tmp[14], add(mul(v[0], m[14][24]), add(mul(v[1], m[14][25]), add(mul(v[2], m[14][26]), mul(v[3], m[14][27])))));
+    tmp[15] = add(tmp[15], add(mul(v[0], m[15][24]), add(mul(v[1], m[15][25]), add(mul(v[2], m[15][26]), mul(v[3], m[15][27])))));
+    tmp[16] = add(tmp[16], add(mul(v[0], m[16][24]), add(mul(v[1], m[16][25]), add(mul(v[2], m[16][26]), mul(v[3], m[16][27])))));
+    tmp[17] = add(tmp[17], add(mul(v[0], m[17][24]), add(mul(v[1], m[17][25]), add(mul(v[2], m[17][26]), mul(v[3], m[17][27])))));
+    tmp[18] = add(tmp[18], add(mul(v[0], m[18][24]), add(mul(v[1], m[18][25]), add(mul(v[2], m[18][26]), mul(v[3], m[18][27])))));
+    tmp[19] = add(tmp[19], add(mul(v[0], m[19][24]), add(mul(v[1], m[19][25]), add(mul(v[2], m[19][26]), mul(v[3], m[19][27])))));
+    tmp[20] = add(tmp[20], add(mul(v[0], m[20][24]), add(mul(v[1], m[20][25]), add(mul(v[2], m[20][26]), mul(v[3], m[20][27])))));
+    tmp[21] = add(tmp[21], add(mul(v[0], m[21][24]), add(mul(v[1], m[21][25]), add(mul(v[2], m[21][26]), mul(v[3], m[21][27])))));
+    tmp[22] = add(tmp[22], add(mul(v[0], m[22][24]), add(mul(v[1], m[22][25]), add(mul(v[2], m[22][26]), mul(v[3], m[22][27])))));
+    tmp[23] = add(tmp[23], add(mul(v[0], m[23][24]), add(mul(v[1], m[23][25]), add(mul(v[2], m[23][26]), mul(v[3], m[23][27])))));
+    tmp[24] = add(tmp[24], add(mul(v[0], m[24][24]), add(mul(v[1], m[24][25]), add(mul(v[2], m[24][26]), mul(v[3], m[24][27])))));
+    tmp[25] = add(tmp[25], add(mul(v[0], m[25][24]), add(mul(v[1], m[25][25]), add(mul(v[2], m[25][26]), mul(v[3], m[25][27])))));
+    tmp[26] = add(tmp[26], add(mul(v[0], m[26][24]), add(mul(v[1], m[26][25]), add(mul(v[2], m[26][26]), mul(v[3], m[26][27])))));
+    tmp[27] = add(tmp[27], add(mul(v[0], m[27][24]), add(mul(v[1], m[27][25]), add(mul(v[2], m[27][26]), mul(v[3], m[27][27])))));
+    tmp[28] = add(tmp[28], add(mul(v[0], m[28][24]), add(mul(v[1], m[28][25]), add(mul(v[2], m[28][26]), mul(v[3], m[28][27])))));
+    tmp[29] = add(tmp[29], add(mul(v[0], m[29][24]), add(mul(v[1], m[29][25]), add(mul(v[2], m[29][26]), mul(v[3], m[29][27])))));
+    tmp[30] = add(tmp[30], add(mul(v[0], m[30][24]), add(mul(v[1], m[30][25]), add(mul(v[2], m[30][26]), mul(v[3], m[30][27])))));
+    tmp[31] = add(tmp[31], add(mul(v[0], m[31][24]), add(mul(v[1], m[31][25]), add(mul(v[2], m[31][26]), mul(v[3], m[31][27])))));
+
+    v[0] = psi[I + d2 + d3 + d4];
+    v[1] = psi[I + d0 + d2 + d3 + d4];
+    v[2] = psi[I + d1 + d2 + d3 + d4];
+    v[3] = psi[I + d0 + d1 + d2 + d3 + d4];
+
+    psi[I] = (add(tmp[0], add(mul(v[0], m[0][28]), add(mul(v[1], m[0][29]), add(mul(v[2], m[0][30]), mul(v[3], m[0][31]))))));
+    psi[I + d0] = (add(tmp[1], add(mul(v[0], m[1][28]), add(mul(v[1], m[1][29]), add(mul(v[2], m[1][30]), mul(v[3], m[1][31]))))));
+    psi[I + d1] = (add(tmp[2], add(mul(v[0], m[2][28]), add(mul(v[1], m[2][29]), add(mul(v[2], m[2][30]), mul(v[3], m[2][31]))))));
+    psi[I + d0 + d1] = (add(tmp[3], add(mul(v[0], m[3][28]), add(mul(v[1], m[3][29]), add(mul(v[2], m[3][30]), mul(v[3], m[3][31]))))));
+    psi[I + d2] = (add(tmp[4], add(mul(v[0], m[4][28]), add(mul(v[1], m[4][29]), add(mul(v[2], m[4][30]), mul(v[3], m[4][31]))))));
+    psi[I + d0 + d2] = (add(tmp[5], add(mul(v[0], m[5][28]), add(mul(v[1], m[5][29]), add(mul(v[2], m[5][30]), mul(v[3], m[5][31]))))));
+    psi[I + d1 + d2] = (add(tmp[6], add(mul(v[0], m[6][28]), add(mul(v[1], m[6][29]), add(mul(v[2], m[6][30]), mul(v[3], m[6][31]))))));
+    psi[I + d0 + d1 + d2] = (add(tmp[7], add(mul(v[0], m[7][28]), add(mul(v[1], m[7][29]), add(mul(v[2], m[7][30]), mul(v[3], m[7][31]))))));
+    psi[I + d3] = (add(tmp[8], add(mul(v[0], m[8][28]), add(mul(v[1], m[8][29]), add(mul(v[2], m[8][30]), mul(v[3], m[8][31]))))));
+    psi[I + d0 + d3] = (add(tmp[9], add(mul(v[0], m[9][28]), add(mul(v[1], m[9][29]), add(mul(v[2], m[9][30]), mul(v[3], m[9][31]))))));
+    psi[I + d1 + d3] = (add(tmp[10], add(mul(v[0], m[10][28]), add(mul(v[1], m[10][29]), add(mul(v[2], m[10][30]), mul(v[3], m[10][31]))))));
+    psi[I + d0 + d1 + d3] = (add(tmp[11], add(mul(v[0], m[11][28]), add(mul(v[1], m[11][29]), add(mul(v[2], m[11][30]), mul(v[3], m[11][31]))))));
+    psi[I + d2 + d3] = (add(tmp[12], add(mul(v[0], m[12][28]), add(mul(v[1], m[12][29]), add(mul(v[2], m[12][30]), mul(v[3], m[12][31]))))));
+    psi[I + d0 + d2 + d3] = (add(tmp[13], add(mul(v[0], m[13][28]), add(mul(v[1], m[13][29]), add(mul(v[2], m[13][30]), mul(v[3], m[13][31]))))));
+    psi[I + d1 + d2 + d3] = (add(tmp[14], add(mul(v[0], m[14][28]), add(mul(v[1], m[14][29]), add(mul(v[2], m[14][30]), mul(v[3], m[14][31]))))));
+    psi[I + d0 + d1 + d2 + d3] = (add(tmp[15], add(mul(v[0], m[15][28]), add(mul(v[1], m[15][29]), add(mul(v[2], m[15][30]), mul(v[3], m[15][31]))))));
+    psi[I + d4] = (add(tmp[16], add(mul(v[0], m[16][28]), add(mul(v[1], m[16][29]), add(mul(v[2], m[16][30]), mul(v[3], m[16][31]))))));
+    psi[I + d0 + d4] = (add(tmp[17], add(mul(v[0], m[17][28]), add(mul(v[1], m[17][29]), add(mul(v[2], m[17][30]), mul(v[3], m[17][31]))))));
+    psi[I + d1 + d4] = (add(tmp[18], add(mul(v[0], m[18][28]), add(mul(v[1], m[18][29]), add(mul(v[2], m[18][30]), mul(v[3], m[18][31]))))));
+    psi[I + d0 + d1 + d4] = (add(tmp[19], add(mul(v[0], m[19][28]), add(mul(v[1], m[19][29]), add(mul(v[2], m[19][30]), mul(v[3], m[19][31]))))));
+    psi[I + d2 + d4] = (add(tmp[20], add(mul(v[0], m[20][28]), add(mul(v[1], m[20][29]), add(mul(v[2], m[20][30]), mul(v[3], m[20][31]))))));
+    psi[I + d0 + d2 + d4] = (add(tmp[21], add(mul(v[0], m[21][28]), add(mul(v[1], m[21][29]), add(mul(v[2], m[21][30]), mul(v[3], m[21][31]))))));
+    psi[I + d1 + d2 + d4] = (add(tmp[22], add(mul(v[0], m[22][28]), add(mul(v[1], m[22][29]), add(mul(v[2], m[22][30]), mul(v[3], m[22][31]))))));
+    psi[I + d0 + d1 + d2 + d4] = (add(tmp[23], add(mul(v[0], m[23][28]), add(mul(v[1], m[23][29]), add(mul(v[2], m[23][30]), mul(v[3], m[23][31]))))));
+    psi[I + d3 + d4] = (add(tmp[24], add(mul(v[0], m[24][28]), add(mul(v[1], m[24][29]), add(mul(v[2], m[24][30]), mul(v[3], m[24][31]))))));
+    psi[I + d0 + d3 + d4] = (add(tmp[25], add(mul(v[0], m[25][28]), add(mul(v[1], m[25][29]), add(mul(v[2], m[25][30]), mul(v[3], m[25][31]))))));
+    psi[I + d1 + d3 + d4] = (add(tmp[26], add(mul(v[0], m[26][28]), add(mul(v[1], m[26][29]), add(mul(v[2], m[26][30]), mul(v[3], m[26][31]))))));
+    psi[I + d0 + d1 + d3 + d4] = (add(tmp[27], add(mul(v[0], m[27][28]), add(mul(v[1], m[27][29]), add(mul(v[2], m[27][30]), mul(v[3], m[27][31]))))));
+    psi[I + d2 + d3 + d4] = (add(tmp[28], add(mul(v[0], m[28][28]), add(mul(v[1], m[28][29]), add(mul(v[2], m[28][30]), mul(v[3], m[28][31]))))));
+    psi[I + d0 + d2 + d3 + d4] = (add(tmp[29], add(mul(v[0], m[29][28]), add(mul(v[1], m[29][29]), add(mul(v[2], m[29][30]), mul(v[3], m[29][31]))))));
+    psi[I + d1 + d2 + d3 + d4] = (add(tmp[30], add(mul(v[0], m[30][28]), add(mul(v[1], m[30][29]), add(mul(v[2], m[30][30]), mul(v[3], m[30][31]))))));
+    psi[I + d0 + d1 + d2 + d3 + d4] = (add(tmp[31], add(mul(v[0], m[31][28]), add(mul(v[1], m[31][29]), add(mul(v[2], m[31][30]), mul(v[3], m[31][31]))))));
+
+}
+
+// Applies the gate matrix m (indexed up to m[31][31] by kernel_core) to five target bits of psi, calling kernel_core once per selected base index. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size(); // total number of entries in psi
+    std::size_t d0 = 1UL << id0; // d_k = 2^id_k: index offset contributed by target bit id_k
+    std::size_t d1 = 1UL << id1;
+    std::size_t d2 = 1UL << id2;
+    std::size_t d3 = 1UL << id3;
+    std::size_t d4 = 1UL << id4; // NOTE(review): 1UL can be narrower than std::size_t on some ABIs (e.g. LLP64); confirm bit indices stay below its width
+    std::size_t dsorted[] = {d0 , d1, d2, d3, d4};
+    std::sort(dsorted, dsorted + 5, std::greater<std::size_t>()); // descending order: dsorted[0] is the largest offset
+    // Loop k advances in steps of 2 * dsorted[k] and stays below dsorted[k-1], so the summed base index has all five target bits clear (assuming distinct ids and n a multiple of 2 * dsorted[0] — TODO confirm).
+    if (ctrlmask == 0){ // no control mask: every base index is processed
+        #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ // NOTE(review): `omp for` without `parallel` here — presumably called inside an enclosing parallel region; confirm at call sites
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m); // offsets are passed unsorted, in id0..id4 order
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{ // same traversal, but only base indices with every ctrlmask bit set are processed
+        #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask) // all control bits must be 1 in the base index
+                                    kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/nointrin/kernels.hpp b/third_party/cppsim/nointrin/kernels.hpp
new file mode 100644
index 00000000..51026b81
--- /dev/null
+++ b/third_party/cppsim/nointrin/kernels.hpp
@@ -0,0 +1,40 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+#include <complex>
+#include <functional>
+#include <algorithm>
+#include "../intrin/alignedallocator.hpp"
+
+template <class T>
+inline T add(T a, T b){ return a+b; }
+
+template <class T>
+inline T mul(T a, T b){ return a*b; }
+
+// Depth handed to OpenMP collapse() by each N-qubit kernel: LOOP_COLLAPSEN is N + 1 (e.g. the 5-qubit kernel nests six loops).
+#define LOOP_COLLAPSE1 2
+#define LOOP_COLLAPSE2 3
+#define LOOP_COLLAPSE3 4
+#define LOOP_COLLAPSE4 5
+#define LOOP_COLLAPSE5 6
+
+#include "kernel1.hpp"
+#include "kernel2.hpp"
+#include "kernel3.hpp"
+#include "kernel4.hpp"
+#include "kernel5.hpp"

From 68915d9e8d3ed73dcf4bcfa9e7f5746d6dedabc5 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 26 Jul 2022 19:29:29 +0200
Subject: [PATCH 02/82] Working on the code generation for kernel_core

---
 third_party/cppsim/nointrin/kernel.py | 43 +++++++++++++++------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/third_party/cppsim/nointrin/kernel.py b/third_party/cppsim/nointrin/kernel.py
index b3fe8f28..1768acab 100644
--- a/third_party/cppsim/nointrin/kernel.py
+++ b/third_party/cppsim/nointrin/kernel.py
@@ -1,7 +1,28 @@
 #!/usr/bin/env python3
+import itertools
 
 nqubits = 3
 
+# All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
+# 0 0
+# 1 0
+# 0 1
+# 1 1
+combs = list(itertools.product([0, 1], repeat=nqubits))
+
+# Pretty-print the indexed PSI array values.
+strcombs = []
+for j in range(0, len(combs)):
+	comb = tuple(reversed(combs[j]))
+	strcomb = 'psi[I'.format(j)
+	for i in range(0, nqubits):
+		if comb[i] != 0:
+			strcomb += " + d{}".format(i)
+	strcomb += ']';
+	strcombs.append(strcomb)
+
+# Some string constants clash with the {} syntax of print(), so we
+# substitute them as constants.
 pragma = "#pragma";
 newline = "\n";
 
@@ -10,25 +31,9 @@
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, M const& m)
 {{
-    std::array<std::complex<double>, 1U << nqubits> v;
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
-
-    nqubits = 2:
-    
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
-    v[2] = psi[I + d1];
-    v[3] = psi[I + d0 + d1];    
-
-    // All combinations of qubits, excluding dupes:
-    v[0] = 0 0
-    v[1] = 1 0
-    v[2] = 0 1
-    v[3] = 1 1
-
-    psi[I] = (add(mul(v[0], m[0][0]), mul(v[1], m[0][1])));
-    psi[I + d0] = (add(mul(v[0], m[1][0]), mul(v[1], m[1][1])));
+    std::complex<double> v[{1 << nqubits}];
+{''.join('    v[{}] = {};{}'.format(i, strcombs[i], newline) for i in range(0, len(strcombs)))}
+{''.join('    {} = {}'.format(strcombs[i], newline) for i in range(0, len(strcombs)))}
 }}
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)

From 48d245c65c8e30602e474251809b373422c26559 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 27 Jul 2022 16:31:04 +0200
Subject: [PATCH 03/82] Finalizing the code generation for kernel_core

---
 third_party/cppsim/nointrin/kernel.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/nointrin/kernel.py b/third_party/cppsim/nointrin/kernel.py
index 1768acab..83e50a40 100644
--- a/third_party/cppsim/nointrin/kernel.py
+++ b/third_party/cppsim/nointrin/kernel.py
@@ -21,6 +21,17 @@
 	strcomb += ']';
 	strcombs.append(strcomb)
 
+def rhs(n, j, i):
+	if i < n - 1:
+		return f'add(mul(v[{i}], m[{j}][{i}]), ' + rhs(n, j, i + 1)
+	else:
+		return f'mul(v[{i}], m[{j}][{i}]' + ''.join(')' for k in range(0, n))
+
+# Pretty-print the right hand sides (recursively).
+strrhs = [] 
+for j in range(0, len(strcombs)):
+	strrhs.append(rhs(len(strcombs), j, 0))
+
 # Some string constants clash with the {} syntax of print(), so we
 # substitute them as constants.
 pragma = "#pragma";
@@ -33,8 +44,7 @@
 {{
     std::complex<double> v[{1 << nqubits}];
 {''.join('    v[{}] = {};{}'.format(i, strcombs[i], newline) for i in range(0, len(strcombs)))}
-{''.join('    {} = {}'.format(strcombs[i], newline) for i in range(0, len(strcombs)))}
-}}
+{''.join('    {} = {}{}'.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))}}}
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template <class V, class M>

From 6b30b41c54b44c0b9e8efffec1f27db3bbab7776 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 28 Jul 2022 15:38:47 +0200
Subject: [PATCH 04/82] Adding build scripts, adding test program stub

---
 third_party/cppsim/CMakeLists.txt             | 35 ++++++++
 third_party/cppsim/README.md                  | 60 ++++++++++++++
 .../cppsim/{ => include}/nointrin/kernel1.hpp |  0
 .../cppsim/{ => include}/nointrin/kernel2.hpp |  0
 .../cppsim/{ => include}/nointrin/kernel3.hpp |  0
 .../cppsim/{ => include}/nointrin/kernel4.hpp |  0
 .../cppsim/{ => include}/nointrin/kernel5.hpp |  0
 .../nointrin/kernelgen.py}                    | 80 +++++++++++--------
 .../cppsim/{ => include}/nointrin/kernels.hpp |  6 +-
 third_party/cppsim/src/test/test_nointrin.cpp | 17 ++++
 10 files changed, 160 insertions(+), 38 deletions(-)
 create mode 100644 third_party/cppsim/CMakeLists.txt
 create mode 100644 third_party/cppsim/README.md
 rename third_party/cppsim/{ => include}/nointrin/kernel1.hpp (100%)
 rename third_party/cppsim/{ => include}/nointrin/kernel2.hpp (100%)
 rename third_party/cppsim/{ => include}/nointrin/kernel3.hpp (100%)
 rename third_party/cppsim/{ => include}/nointrin/kernel4.hpp (100%)
 rename third_party/cppsim/{ => include}/nointrin/kernel5.hpp (100%)
 rename third_party/cppsim/{nointrin/kernel.py => include/nointrin/kernelgen.py} (60%)
 rename third_party/cppsim/{ => include}/nointrin/kernels.hpp (89%)
 create mode 100644 third_party/cppsim/src/test/test_nointrin.cpp

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
new file mode 100644
index 00000000..521985db
--- /dev/null
+++ b/third_party/cppsim/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
+
+project(ProjectQGen)
+
+find_package(Python3 COMPONENTS Interpreter)
+
+macro(kernelgen)
+	set(oneValueArgs NQUBITS VARIANT TARGET)
+	cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+	# Usage: kernelgen(TARGET <target> NQUBITS <n> VARIANT <variant>). NOTE(review): ${options} and ${multiValueArgs} are not set in this macro, so they expand to whatever the calling scope holds (normally empty).
+	set(NQUBITS ${KERNELGEN_NQUBITS})
+	set(VARIANT ${KERNELGEN_VARIANT})
+	set(KERNELGEN "${CMAKE_CURRENT_SOURCE_DIR}/include/${VARIANT}/kernelgen.py")
+	set(KERNEL_PATH "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
+
+	# Call generator: runs kernelgen.py with <n> and the output path; DEPENDS re-runs it when the script changes.
+	add_custom_command(
+		OUTPUT ${KERNEL_PATH}
+		COMMAND ${Python3_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
+		COMMENT "Generating kernel for ${NQUBITS} qubits"
+		DEPENDS ${KERNELGEN})
+	set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
+
+	# Append the generated file to the target sources (so building the target runs the command above) and expose the binary dir so "generated/<variant>/kernel<n>.hpp" can be included.
+	target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
+	target_include_directories(${KERNELGEN_TARGET} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+endmacro()
+
+add_executable(test_nointrin "src/test/test_nointrin.cpp")
+target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
+kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
+kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
+kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
+kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
diff --git a/third_party/cppsim/README.md b/third_party/cppsim/README.md
new file mode 100644
index 00000000..a1e07cb0
--- /dev/null
+++ b/third_party/cppsim/README.md
@@ -0,0 +1,60 @@
+# ProjectQGen
+
+Generates Haener-Steiger quantum kernels in the form used by the ProjectQ simulator.
+
+## Description
+
+The original code provides handwritten kernels for up to 5 qubits in the following form:
+
+```c++
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m)
+{
+    std::complex<double> v[2];
+    v[0] = psi[I];
+    v[1] = psi[I + d0];
+
+    psi[I] = (add(mul(v[0], m[0][0]), mul(v[1], m[0][1])));
+    psi[I + d0] = (add(mul(v[0], m[1][0]), mul(v[1], m[1][1])));
+
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1UL << id0;
+    std::size_t dsorted[] = {d0 };
+    std::sort(dsorted, dsorted + 1, std::greater<std::size_t>());
+
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                kernel_core(psi, i0 + i1, d0, m);
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, d0, m);
+            }
+        }
+    }
+}
+```
+
+The proposed generator reproduces the handwritten kernels and extends support to an arbitrary number of qubits.
+
+## Testing
+
+```
+mkdir build
+cd build
+cmake .. -G Ninja
+ninja
+```
diff --git a/third_party/cppsim/nointrin/kernel1.hpp b/third_party/cppsim/include/nointrin/kernel1.hpp
similarity index 100%
rename from third_party/cppsim/nointrin/kernel1.hpp
rename to third_party/cppsim/include/nointrin/kernel1.hpp
diff --git a/third_party/cppsim/nointrin/kernel2.hpp b/third_party/cppsim/include/nointrin/kernel2.hpp
similarity index 100%
rename from third_party/cppsim/nointrin/kernel2.hpp
rename to third_party/cppsim/include/nointrin/kernel2.hpp
diff --git a/third_party/cppsim/nointrin/kernel3.hpp b/third_party/cppsim/include/nointrin/kernel3.hpp
similarity index 100%
rename from third_party/cppsim/nointrin/kernel3.hpp
rename to third_party/cppsim/include/nointrin/kernel3.hpp
diff --git a/third_party/cppsim/nointrin/kernel4.hpp b/third_party/cppsim/include/nointrin/kernel4.hpp
similarity index 100%
rename from third_party/cppsim/nointrin/kernel4.hpp
rename to third_party/cppsim/include/nointrin/kernel4.hpp
diff --git a/third_party/cppsim/nointrin/kernel5.hpp b/third_party/cppsim/include/nointrin/kernel5.hpp
similarity index 100%
rename from third_party/cppsim/nointrin/kernel5.hpp
rename to third_party/cppsim/include/nointrin/kernel5.hpp
diff --git a/third_party/cppsim/nointrin/kernel.py b/third_party/cppsim/include/nointrin/kernelgen.py
similarity index 60%
rename from third_party/cppsim/nointrin/kernel.py
rename to third_party/cppsim/include/nointrin/kernelgen.py
index 83e50a40..18dbf60e 100644
--- a/third_party/cppsim/nointrin/kernel.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -1,50 +1,59 @@
 #!/usr/bin/env python3
+import argparse
 import itertools
+import os
 
-nqubits = 3
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate Haener-Steiger quantum kernels in the form used in ProjectQ simulator')
+    parser.add_argument('nqubits', type=int, help='The number of qubits to generate the kernel for')
+    parser.add_argument('output', type=str, help='Output file name')
+    args = parser.parse_args()
+    
+    nqubits = int(args.nqubits)
+    output = args.output
 
-# All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
-# 0 0
-# 1 0
-# 0 1
-# 1 1
-combs = list(itertools.product([0, 1], repeat=nqubits))
+    # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
+    # 0 0
+    # 1 0
+    # 0 1
+    # 1 1
+    combs = list(itertools.product([0, 1], repeat=nqubits))
 
-# Pretty-print the indexed PSI array values.
-strcombs = []
-for j in range(0, len(combs)):
-	comb = tuple(reversed(combs[j]))
-	strcomb = 'psi[I'.format(j)
-	for i in range(0, nqubits):
-		if comb[i] != 0:
-			strcomb += " + d{}".format(i)
-	strcomb += ']';
-	strcombs.append(strcomb)
+    # Pretty-print the indexed PSI array values.
+    strcombs = []
+    for j in range(0, len(combs)):
+        comb = tuple(reversed(combs[j]))
+        strcomb = 'psi[I'.format(j)
+        for i in range(0, nqubits):
+            if comb[i] != 0:
+                strcomb += " + d{}".format(i)
+        strcomb += ']';
+        strcombs.append(strcomb)
 
-def rhs(n, j, i):
-	if i < n - 1:
-		return f'add(mul(v[{i}], m[{j}][{i}]), ' + rhs(n, j, i + 1)
-	else:
-		return f'mul(v[{i}], m[{j}][{i}]' + ''.join(')' for k in range(0, n))
+    def rhs(n, j, i):
+        if i < n - 1:
+            return f'add(mul(v[{i}], m[{j}][{i}]), ' + rhs(n, j, i + 1)
+        else:
+            return f'mul(v[{i}], m[{j}][{i}]' + ''.join(')' for k in range(0, n))
 
-# Pretty-print the right hand sides (recursively).
-strrhs = [] 
-for j in range(0, len(strcombs)):
-	strrhs.append(rhs(len(strcombs), j, 0))
+    # Pretty-print the right hand sides (recursively).
+    strrhs = [] 
+    for j in range(0, len(strcombs)):
+        strrhs.append(rhs(len(strcombs), j, 0))
 
-# Some string constants clash with the {} syntax of print(), so we
-# substitute them as constants.
-pragma = "#pragma";
-newline = "\n";
+    # Some string constants clash with the {} syntax of print(), so we
+    # substitute them as constants.
+    pragma = "#pragma";
+    newline = "\n";
 
-kernel = \
+    kernel = \
 f"""
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, M const& m)
 {{
     std::complex<double> v[{1 << nqubits}];
 {''.join('    v[{}] = {};{}'.format(i, strcombs[i], newline) for i in range(0, len(strcombs)))}
-{''.join('    {} = {}{}'.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))}}}
+{''.join('    {} = {};{}'.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))}}}
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template <class V, class M>
@@ -75,5 +84,10 @@ def rhs(n, j, i):
 }}
 """
 
-print(kernel)
+    try:
+        os.makedirs(os.path.dirname(output))
+    except:
+        pass
+    with open(output, "w") as o:
+        o.write(kernel)
 
diff --git a/third_party/cppsim/nointrin/kernels.hpp b/third_party/cppsim/include/nointrin/kernels.hpp
similarity index 89%
rename from third_party/cppsim/nointrin/kernels.hpp
rename to third_party/cppsim/include/nointrin/kernels.hpp
index 51026b81..f754731b 100644
--- a/third_party/cppsim/nointrin/kernels.hpp
+++ b/third_party/cppsim/include/nointrin/kernels.hpp
@@ -12,13 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <cmath>
-#include <cstdlib>
-#include <vector>
+#include <cstddef> // size_t
 #include <complex>
-#include <functional>
 #include <algorithm>
-#include "../intrin/alignedallocator.hpp"
 
 template <class T>
 inline T add(T a, T b){ return a+b; }
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
new file mode 100644
index 00000000..66e50f6b
--- /dev/null
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -0,0 +1,17 @@
+#include "nointrin/kernels.hpp"
+
+namespace generated {
+
+#include "generated/nointrin/kernel1.hpp"
+#include "generated/nointrin/kernel2.hpp"
+#include "generated/nointrin/kernel3.hpp"
+#include "generated/nointrin/kernel4.hpp"
+#include "generated/nointrin/kernel5.hpp"
+
+} // namespace generated
+
+int main(int argc, char* argv[])
+{
+	// TODO GoogleTest
+	return 0;
+}

From 7cc0b7a00cf3c211bd0dfc802f2a4607f50bbc93 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 29 Jul 2022 12:37:10 +0200
Subject: [PATCH 05/82] Working on the test stub

---
 third_party/cppsim/.gitmodules                |  3 ++
 third_party/cppsim/CMakeLists.txt             |  3 ++
 third_party/cppsim/ThirdParty/googletest      |  1 +
 third_party/cppsim/src/test/test_nointrin.cpp | 52 ++++++++++++++++++-
 4 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 third_party/cppsim/.gitmodules
 create mode 160000 third_party/cppsim/ThirdParty/googletest

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
new file mode 100644
index 00000000..ead476e8
--- /dev/null
+++ b/third_party/cppsim/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "googletest"]
+	path = ThirdParty/googletest
+	url = https://github.com/google/googletest.git
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 521985db..e0cafd9c 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -4,6 +4,8 @@ project(ProjectQGen)
 
 find_package(Python3 COMPONENTS Interpreter)
 
+add_subdirectory(ThirdParty/googletest EXCLUDE_FROM_ALL)
+
 macro(kernelgen)
 	set(oneValueArgs NQUBITS VARIANT TARGET)
 	cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -28,6 +30,7 @@ endmacro()
 
 add_executable(test_nointrin "src/test/test_nointrin.cpp")
 target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_link_libraries(test_nointrin PRIVATE gtest)
 kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
diff --git a/third_party/cppsim/ThirdParty/googletest b/third_party/cppsim/ThirdParty/googletest
new file mode 160000
index 00000000..25cc5777
--- /dev/null
+++ b/third_party/cppsim/ThirdParty/googletest
@@ -0,0 +1 @@
+Subproject commit 25cc5777a17820a6339204a3552aa1dd5e428669
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 66e50f6b..1fa16715 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -10,8 +10,56 @@ namespace generated {
 
 } // namespace generated
 
+#include <array>
+
+#include "gtest/gtest.h"
+
+template<int nqubits>
+bool compare()
+{
+	constexpr auto dim = 1UL << nqubits;
+	
+	// TODO Generate m matrix as integers.
+	std::array<std::array<int, nqubits>, nqubits> m;
+	
+	// TODO Replace std::complex with auto.
+	
+	// TODO Generate id_0...id_nq integers (unsorted).
+	
+	// TODO Generate psi matrix as integers.
+	
+	// TODO Compare kernel against generated kernel
+	return true;
+}
+
+TEST(nointrin, kernel1)
+{
+	ASSERT_TRUE(compare<1>()); // call the check: the bare name compare<1> converts to a non-null pointer and always passes
+}
+
+TEST(nointrin, kernel2)
+{
+	ASSERT_TRUE(compare<2>()); // call the check: the bare name compare<2> converts to a non-null pointer and always passes
+}
+
+TEST(nointrin, kernel3)
+{
+	ASSERT_TRUE(compare<3>()); // call the check: the bare name compare<3> converts to a non-null pointer and always passes
+}
+
+TEST(nointrin, kernel4)
+{
+	ASSERT_TRUE(compare<4>()); // call the check: the bare name compare<4> converts to a non-null pointer and always passes
+}
+
+TEST(nointrin, kernel5)
+{
+	ASSERT_TRUE(compare<5>()); // call the check: the bare name compare<5> converts to a non-null pointer and always passes
+}
+
 int main(int argc, char* argv[])
 {
-	// TODO GoogleTest
-	return 0;
+	::testing::InitGoogleTest( & argc, argv);
+	return RUN_ALL_TESTS();
 }
+

From 2b67121a4a348c64c89ce4ee6d443a1622e60fa3 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 29 Jul 2022 19:54:03 +0200
Subject: [PATCH 06/82] Getting all 5 handwritten kernels to compile within the
 test, thanks to automatic arrays type deduction for v and tmp

---
 third_party/cppsim/CMakeLists.txt             |  1 +
 .../cppsim/include/nointrin/kernel1.hpp       |  8 +-
 .../cppsim/include/nointrin/kernel2.hpp       | 12 +--
 .../cppsim/include/nointrin/kernel3.hpp       | 37 +++++----
 .../cppsim/include/nointrin/kernel4.hpp       | 51 ++++++------
 .../cppsim/include/nointrin/kernel5.hpp       | 81 ++++++++++---------
 third_party/cppsim/src/test/test_nointrin.cpp | 33 ++++++--
 7 files changed, 125 insertions(+), 98 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index e0cafd9c..a1a56cef 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -29,6 +29,7 @@ macro(kernelgen)
 endmacro()
 
 add_executable(test_nointrin "src/test/test_nointrin.cpp")
+set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_link_libraries(test_nointrin PRIVATE gtest)
 kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
diff --git a/third_party/cppsim/include/nointrin/kernel1.hpp b/third_party/cppsim/include/nointrin/kernel1.hpp
index e1cd9e66..d5fcf0d6 100644
--- a/third_party/cppsim/include/nointrin/kernel1.hpp
+++ b/third_party/cppsim/include/nointrin/kernel1.hpp
@@ -15,9 +15,11 @@
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m)
 {
-    std::complex<double> v[2];
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
+    std::array v =
+    {
+        psi[I],
+        psi[I + d0]
+    };
 
     psi[I] = (add(mul(v[0], m[0][0]), mul(v[1], m[0][1])));
     psi[I + d0] = (add(mul(v[0], m[1][0]), mul(v[1], m[1][1])));
diff --git a/third_party/cppsim/include/nointrin/kernel2.hpp b/third_party/cppsim/include/nointrin/kernel2.hpp
index 879fa857..7aecbae1 100644
--- a/third_party/cppsim/include/nointrin/kernel2.hpp
+++ b/third_party/cppsim/include/nointrin/kernel2.hpp
@@ -15,11 +15,13 @@
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m)
 {
-    std::complex<double> v[4];
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
-    v[2] = psi[I + d1];
-    v[3] = psi[I + d0 + d1];
+    std::array v =
+    {
+        psi[I],
+        psi[I + d0],
+        psi[I + d1],
+        psi[I + d0 + d1]
+    };
 
     psi[I] = (add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))));
     psi[I + d0] = (add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))));
diff --git a/third_party/cppsim/include/nointrin/kernel3.hpp b/third_party/cppsim/include/nointrin/kernel3.hpp
index c70d721d..76037c43 100644
--- a/third_party/cppsim/include/nointrin/kernel3.hpp
+++ b/third_party/cppsim/include/nointrin/kernel3.hpp
@@ -15,26 +15,25 @@
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m)
 {
-    std::complex<double> v[4];
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
-    v[2] = psi[I + d1];
-    v[3] = psi[I + d0 + d1];
+    std::array v =
+    {
+        psi[I],
+        psi[I + d0],
+        psi[I + d1],
+        psi[I + d0 + d1]
+    };
 
-    std::complex<double> tmp[8];
-
-    // Сделай систему команд load/store,add/mul, и реализуй для них компилятор
-
-    // Идея этой формы в том, что половина PSI-выражения может быть расчитана независимо,
-    // но это не может помочь для распределённых вычислений
-    tmp[0] = add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3]))));
-    tmp[1] = add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3]))));
-    tmp[2] = add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3]))));
-    tmp[3] = add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3]))));
-    tmp[4] = add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3]))));
-    tmp[5] = add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3]))));
-    tmp[6] = add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3]))));
-    tmp[7] = add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))));
+    std::array tmp =
+    {
+        add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))),
+        add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))),
+        add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3])))),
+        add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3])))),
+        add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3])))),
+        add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3])))),
+        add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3])))),
+        add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))))
+    };
 
     v[0] = psi[I + d2];
     v[1] = psi[I + d0 + d2];
diff --git a/third_party/cppsim/include/nointrin/kernel4.hpp b/third_party/cppsim/include/nointrin/kernel4.hpp
index b12424a7..263b664a 100644
--- a/third_party/cppsim/include/nointrin/kernel4.hpp
+++ b/third_party/cppsim/include/nointrin/kernel4.hpp
@@ -15,31 +15,34 @@
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m)
 {
-    std::complex<double> v[4];
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
-    v[2] = psi[I + d1];
-    v[3] = psi[I + d0 + d1];
-
-    std::complex<double> tmp[16];
-
-    tmp[0] = add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3]))));
-    tmp[1] = add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3]))));
-    tmp[2] = add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3]))));
-    tmp[3] = add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3]))));
-    tmp[4] = add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3]))));
-    tmp[5] = add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3]))));
-    tmp[6] = add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3]))));
-    tmp[7] = add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))));
-    tmp[8] = add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3]))));
-    tmp[9] = add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3]))));
-    tmp[10] = add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3]))));
-    tmp[11] = add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3]))));
-    tmp[12] = add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3]))));
-    tmp[13] = add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3]))));
-    tmp[14] = add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3]))));
-    tmp[15] = add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3]))));
+    std::array v =
+    {
+        psi[I],
+        psi[I + d0],
+        psi[I + d1],
+        psi[I + d0 + d1]
+    };
 
+    std::array tmp =
+    {
+        add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))),
+        add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))),
+        add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3])))),
+        add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3])))),
+        add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3])))),
+        add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3])))),
+        add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3])))),
+        add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3])))),
+        add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3])))),
+        add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3])))),
+        add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3])))),
+        add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3])))),
+        add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3])))),
+        add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3])))),
+        add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3])))),
+        add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3]))))
+    };
+    
     v[0] = psi[I + d2];
     v[1] = psi[I + d0 + d2];
     v[2] = psi[I + d1 + d2];
diff --git a/third_party/cppsim/include/nointrin/kernel5.hpp b/third_party/cppsim/include/nointrin/kernel5.hpp
index a3e47f10..04773b6d 100644
--- a/third_party/cppsim/include/nointrin/kernel5.hpp
+++ b/third_party/cppsim/include/nointrin/kernel5.hpp
@@ -15,46 +15,49 @@
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m)
 {
-    std::complex<double> v[4];
-    v[0] = psi[I];
-    v[1] = psi[I + d0];
-    v[2] = psi[I + d1];
-    v[3] = psi[I + d0 + d1];
+    std::array v =
+    {
+        psi[I],
+        psi[I + d0],
+        psi[I + d1],
+        psi[I + d0 + d1]
+    };
 
-    std::complex<double> tmp[32];
-
-    tmp[0] = add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3]))));
-    tmp[1] = add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3]))));
-    tmp[2] = add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3]))));
-    tmp[3] = add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3]))));
-    tmp[4] = add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3]))));
-    tmp[5] = add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3]))));
-    tmp[6] = add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3]))));
-    tmp[7] = add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3]))));
-    tmp[8] = add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3]))));
-    tmp[9] = add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3]))));
-    tmp[10] = add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3]))));
-    tmp[11] = add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3]))));
-    tmp[12] = add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3]))));
-    tmp[13] = add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3]))));
-    tmp[14] = add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3]))));
-    tmp[15] = add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3]))));
-    tmp[16] = add(mul(v[0], m[16][0]), add(mul(v[1], m[16][1]), add(mul(v[2], m[16][2]), mul(v[3], m[16][3]))));
-    tmp[17] = add(mul(v[0], m[17][0]), add(mul(v[1], m[17][1]), add(mul(v[2], m[17][2]), mul(v[3], m[17][3]))));
-    tmp[18] = add(mul(v[0], m[18][0]), add(mul(v[1], m[18][1]), add(mul(v[2], m[18][2]), mul(v[3], m[18][3]))));
-    tmp[19] = add(mul(v[0], m[19][0]), add(mul(v[1], m[19][1]), add(mul(v[2], m[19][2]), mul(v[3], m[19][3]))));
-    tmp[20] = add(mul(v[0], m[20][0]), add(mul(v[1], m[20][1]), add(mul(v[2], m[20][2]), mul(v[3], m[20][3]))));
-    tmp[21] = add(mul(v[0], m[21][0]), add(mul(v[1], m[21][1]), add(mul(v[2], m[21][2]), mul(v[3], m[21][3]))));
-    tmp[22] = add(mul(v[0], m[22][0]), add(mul(v[1], m[22][1]), add(mul(v[2], m[22][2]), mul(v[3], m[22][3]))));
-    tmp[23] = add(mul(v[0], m[23][0]), add(mul(v[1], m[23][1]), add(mul(v[2], m[23][2]), mul(v[3], m[23][3]))));
-    tmp[24] = add(mul(v[0], m[24][0]), add(mul(v[1], m[24][1]), add(mul(v[2], m[24][2]), mul(v[3], m[24][3]))));
-    tmp[25] = add(mul(v[0], m[25][0]), add(mul(v[1], m[25][1]), add(mul(v[2], m[25][2]), mul(v[3], m[25][3]))));
-    tmp[26] = add(mul(v[0], m[26][0]), add(mul(v[1], m[26][1]), add(mul(v[2], m[26][2]), mul(v[3], m[26][3]))));
-    tmp[27] = add(mul(v[0], m[27][0]), add(mul(v[1], m[27][1]), add(mul(v[2], m[27][2]), mul(v[3], m[27][3]))));
-    tmp[28] = add(mul(v[0], m[28][0]), add(mul(v[1], m[28][1]), add(mul(v[2], m[28][2]), mul(v[3], m[28][3]))));
-    tmp[29] = add(mul(v[0], m[29][0]), add(mul(v[1], m[29][1]), add(mul(v[2], m[29][2]), mul(v[3], m[29][3]))));
-    tmp[30] = add(mul(v[0], m[30][0]), add(mul(v[1], m[30][1]), add(mul(v[2], m[30][2]), mul(v[3], m[30][3]))));
-    tmp[31] = add(mul(v[0], m[31][0]), add(mul(v[1], m[31][1]), add(mul(v[2], m[31][2]), mul(v[3], m[31][3]))));
+    std::array tmp =
+    {
+        add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))),
+        add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))),
+        add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3])))),
+        add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3])))),
+        add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3])))),
+        add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3])))),
+        add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3])))),
+        add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3])))),
+        add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3])))),
+        add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3])))),
+        add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3])))),
+        add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3])))),
+        add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3])))),
+        add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3])))),
+        add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3])))),
+        add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3])))),
+        add(mul(v[0], m[16][0]), add(mul(v[1], m[16][1]), add(mul(v[2], m[16][2]), mul(v[3], m[16][3])))),
+        add(mul(v[0], m[17][0]), add(mul(v[1], m[17][1]), add(mul(v[2], m[17][2]), mul(v[3], m[17][3])))),
+        add(mul(v[0], m[18][0]), add(mul(v[1], m[18][1]), add(mul(v[2], m[18][2]), mul(v[3], m[18][3])))),
+        add(mul(v[0], m[19][0]), add(mul(v[1], m[19][1]), add(mul(v[2], m[19][2]), mul(v[3], m[19][3])))),
+        add(mul(v[0], m[20][0]), add(mul(v[1], m[20][1]), add(mul(v[2], m[20][2]), mul(v[3], m[20][3])))),
+        add(mul(v[0], m[21][0]), add(mul(v[1], m[21][1]), add(mul(v[2], m[21][2]), mul(v[3], m[21][3])))),
+        add(mul(v[0], m[22][0]), add(mul(v[1], m[22][1]), add(mul(v[2], m[22][2]), mul(v[3], m[22][3])))),
+        add(mul(v[0], m[23][0]), add(mul(v[1], m[23][1]), add(mul(v[2], m[23][2]), mul(v[3], m[23][3])))),
+        add(mul(v[0], m[24][0]), add(mul(v[1], m[24][1]), add(mul(v[2], m[24][2]), mul(v[3], m[24][3])))),
+        add(mul(v[0], m[25][0]), add(mul(v[1], m[25][1]), add(mul(v[2], m[25][2]), mul(v[3], m[25][3])))),
+        add(mul(v[0], m[26][0]), add(mul(v[1], m[26][1]), add(mul(v[2], m[26][2]), mul(v[3], m[26][3])))),
+        add(mul(v[0], m[27][0]), add(mul(v[1], m[27][1]), add(mul(v[2], m[27][2]), mul(v[3], m[27][3])))),
+        add(mul(v[0], m[28][0]), add(mul(v[1], m[28][1]), add(mul(v[2], m[28][2]), mul(v[3], m[28][3])))),
+        add(mul(v[0], m[29][0]), add(mul(v[1], m[29][1]), add(mul(v[2], m[29][2]), mul(v[3], m[29][3])))),
+        add(mul(v[0], m[30][0]), add(mul(v[1], m[30][1]), add(mul(v[2], m[30][2]), mul(v[3], m[30][3])))),
+        add(mul(v[0], m[31][0]), add(mul(v[1], m[31][1]), add(mul(v[2], m[31][2]), mul(v[3], m[31][3]))))
+    };
 
     v[0] = psi[I + d2];
     v[1] = psi[I + d0 + d2];
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 1fa16715..33603491 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -11,6 +11,7 @@ namespace generated {
 } // namespace generated
 
 #include <array>
+#include <random>
 
 #include "gtest/gtest.h"
 
@@ -19,42 +20,58 @@ bool compare()
 {
 	constexpr auto dim = 1UL << nqubits;
 	
+	std::default_random_engine dre;
+	std::uniform_int_distribution<int> uid(0, 1000);
+
 	// TODO Generate m matrix as integers.
 	std::array<std::array<int, nqubits>, nqubits> m;
-	
-	// TODO Replace std::complex with auto.
+	for (int j = 0; j < m.size(); j++)
+		for (int i = 0; i < m.size(); i++)
+			m[j][i] = uid(dre);
 	
 	// TODO Generate id_0...id_nq integers (unsorted).
-	
+	unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4;
+
 	// TODO Generate psi matrix as integers.
+	std::array<int, nqubits> psi;
+	for (int i = 0; i < m.size(); i++)
+		psi[i] = uid(dre);
+
+	// Generate control mask.	
+	std::size_t ctrlmask = uid(dre); 
 	
 	// TODO Compare kernel against generated kernel
+	kernel(psi, id0, m, ctrlmask);
+	kernel(psi, id1, id0, m, ctrlmask);
+	kernel(psi, id2, id1, id0, m, ctrlmask);
+	kernel(psi, id3, id2, id1, id0, m, ctrlmask);
+	kernel(psi, id4, id3, id2, id1, id0, m, ctrlmask);
 	return true;
 }
 
 TEST(nointrin, kernel1)
 {
-	ASSERT_TRUE(compare<1>);
+	ASSERT_TRUE(compare<1>());
 }
 
 TEST(nointrin, kernel2)
 {
-	ASSERT_TRUE(compare<2>);
+	ASSERT_TRUE(compare<2>());
 }
 
 TEST(nointrin, kernel3)
 {
-	ASSERT_TRUE(compare<3>);
+	ASSERT_TRUE(compare<3>());
 }
 
 TEST(nointrin, kernel4)
 {
-	ASSERT_TRUE(compare<4>);
+	ASSERT_TRUE(compare<4>());
 }
 
 TEST(nointrin, kernel5)
 {
-	ASSERT_TRUE(compare<5>);
+	ASSERT_TRUE(compare<5>());
 }
 
 int main(int argc, char* argv[])

From c1928b8ee9e3f77c7c23898939007d77dc7b87e2 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 8 Aug 2022 17:52:39 +0200
Subject: [PATCH 07/82] Finalizing the test to ensure hand-written and
 generated kernels give equal results

---
 .../cppsim/include/nointrin/kernelgen.py      |   8 +-
 third_party/cppsim/src/test/test_nointrin.cpp | 111 +++++++++++++-----
 2 files changed, 88 insertions(+), 31 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 18dbf60e..f376e1e9 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -51,13 +51,15 @@ def rhs(n, j, i):
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, M const& m)
 {{
-    std::complex<double> v[{1 << nqubits}];
-{''.join('    v[{}] = {};{}'.format(i, strcombs[i], newline) for i in range(0, len(strcombs)))}
+    std::array v =
+    {{
+{''.join('        {},{}'.format(strcombs[i], newline) for i in range(0, len(strcombs)))}    }};
+
 {''.join('    {} = {};{}'.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))}}}
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template <class V, class M>
-void kernel(V &psi, {''.join('unsigned id{}, '.format(i) for i in range (0, nqubits))}M const& m, std::size_t ctrlmask)
+void kernel(V &psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits))}M const& m, std::size_t ctrlmask)
 {{
     std::size_t n = psi.size();
     std::size_t d0 = 1UL << id0{''.join(', d{} = 1UL << id{}'.format(i, i) for i in range (1, nqubits))};
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 33603491..8d3b2da8 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -11,72 +11,127 @@ namespace generated {
 } // namespace generated
 
 #include <array>
+#include <iostream>
 #include <random>
 
 #include "gtest/gtest.h"
 
-template<int nqubits>
-bool compare()
+template<int nqubits, typename Kernels, typename V>
+bool compare(Kernels kernels, V& psi1)
 {
 	constexpr auto dim = 1UL << nqubits;
-	
+
 	std::default_random_engine dre;
 	std::uniform_int_distribution<int> uid(0, 1000);
 
-	// TODO Generate m matrix as integers.
+	// Generate m matrix as integers.
 	std::array<std::array<int, nqubits>, nqubits> m;
 	for (int j = 0; j < m.size(); j++)
 		for (int i = 0; i < m.size(); i++)
 			m[j][i] = uid(dre);
-	
-	// TODO Generate id_0...id_nq integers (unsorted).
-	unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4;
 
-	// TODO Generate psi matrix as integers.
-	std::array<int, nqubits> psi;
-	for (int i = 0; i < m.size(); i++)
-		psi[i] = uid(dre);
-
-	// Generate control mask.	
-	std::size_t ctrlmask = uid(dre); 
-	
-	// TODO Compare kernel against generated kernel
-	kernel(psi, id0, m, ctrlmask);
-	kernel(psi, id1, id0, m, ctrlmask);
-	kernel(psi, id2, id1, id0, m, ctrlmask);
-	kernel(psi, id3, id2, id1, id0, m, ctrlmask);
-	kernel(psi, id4, id3, id2, id1, id0, m, ctrlmask);
-	return true;
+	// Generate psi matrix as integers.
+	for (int i = 0; i < psi1.size(); i++)
+		psi1[i] = uid(dre);
+	auto psi2 = psi1;
+
+	// Generate control mask.
+	std::size_t ctrlmask = 0; // uid(dre);
+
+	// Compare kernel against generated kernel.
+	kernels(psi1, psi2, m, ctrlmask);
+	auto diff = std::mismatch(psi1.begin(), psi1.end(), psi2.begin());
+	if (diff.first == psi1.end())
+		return true;
+
+	std::cout << "Mismatch at " << std::distance(psi1.begin(), diff.first) <<
+		" : " << *(diff.first) << " != " << *(diff.second) << std::endl;
+	return false;
 }
 
 TEST(nointrin, kernel1)
 {
-	ASSERT_TRUE(compare<1>());
+	unsigned id0 = 0;
+	size_t n = 1;
+	n += 1UL << id0;
+	std::vector<int> psi(n);
+	ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	{
+		kernel(psi1, id0, m, ctrlmask);
+		generated::kernel(psi2, id0, m, ctrlmask);
+	},
+	psi));
 }
 
 TEST(nointrin, kernel2)
 {
-	ASSERT_TRUE(compare<2>());
+	unsigned id0 = 0, id1 = 1;
+	size_t n = 1;
+	n += 1UL << id0;
+	n += 1UL << id1;
+	std::vector<int> psi(n);
+	ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	{
+		kernel(psi1, id1, id0, m, ctrlmask);
+		generated::kernel(psi2, id1, id0, m, ctrlmask);
+	},
+	psi));
 }
 
 TEST(nointrin, kernel3)
 {
-	ASSERT_TRUE(compare<3>());
+	unsigned id0 = 0, id1 = 1, id2 = 2;
+	size_t n = 1;
+	n += 1UL << id0;
+	n += 1UL << id1;
+	n += 1UL << id2;
+	std::vector<int> psi(n);
+	ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	{
+		kernel(psi1, id2, id1, id0, m, ctrlmask);
+		generated::kernel(psi2, id2, id1, id0, m, ctrlmask);
+	},
+	psi));
 }
 
 TEST(nointrin, kernel4)
 {
-	ASSERT_TRUE(compare<4>());
+	unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3;
+	size_t n = 1;
+	n += 1UL << id0;
+	n += 1UL << id1;
+	n += 1UL << id2;
+	n += 1UL << id3;
+	std::vector<int> psi(n);
+	ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	{
+		kernel(psi1, id3, id2, id1, id0, m, ctrlmask);
+		generated::kernel(psi2, id3, id2, id1, id0, m, ctrlmask);
+	},
+	psi));
 }
 
 TEST(nointrin, kernel5)
 {
-	ASSERT_TRUE(compare<5>());
+	unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4;
+	size_t n = 1;
+	n += 1UL << id0;
+	n += 1UL << id1;
+	n += 1UL << id2;
+	n += 1UL << id3;
+	n += 1UL << id4;
+	std::vector<int> psi(n);
+	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	{
+		kernel(psi1, id4, id3, id2, id1, id0, m, ctrlmask);
+		generated::kernel(psi2, id4, id3, id2, id1, id0, m, ctrlmask);
+	},
+	psi));
 }
 
 int main(int argc, char* argv[])
 {
-	::testing::InitGoogleTest( & argc, argv);
+	::testing::InitGoogleTest(&argc, argv);
 	return RUN_ALL_TESTS();
 }
 

From 907733af16ca5a658095c16b427c442d830a5323 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 9 Aug 2022 11:50:27 +0200
Subject: [PATCH 08/82] Adding class for creating temporary files

---
 third_party/cppsim/src/benchmark/tempfile.cpp | 44 +++++++++++++++++++
 third_party/cppsim/src/benchmark/tempfile.h   | 23 ++++++++++
 third_party/cppsim/src/test/test_nointrin.cpp |  2 +
 3 files changed, 69 insertions(+)
 create mode 100644 third_party/cppsim/src/benchmark/tempfile.cpp
 create mode 100644 third_party/cppsim/src/benchmark/tempfile.h

diff --git a/third_party/cppsim/src/benchmark/tempfile.cpp b/third_party/cppsim/src/benchmark/tempfile.cpp
new file mode 100644
index 00000000..6d34166e
--- /dev/null
+++ b/third_party/cppsim/src/benchmark/tempfile.cpp
@@ -0,0 +1,44 @@
+#include "tempfile.h"
+
+#include <errno.h>
+#include <filesystem>
+#include <vector>
+#include <unistd.h>
+
+namespace fs = std::filesystem;
+
+const std::string& TempFile::string(std::error_code& ec_) const
+{
+	ec_ = ec;
+	return filename;
+}
+
+TempFile::TempFile(const std::string& mask_)
+{
+	auto dir = fs::temp_directory_path(ec);
+	if (ec) return; 
+
+	std::string mask = (dir / mask_).string();
+
+	std::vector<char> vfilename(mask.c_str(), mask.c_str() + mask.size() + 1);
+	int fd = mkstemp(&vfilename[0]);
+	if (fd == -1)
+	{
+		ec = std::error_code(errno, std::generic_category());
+		return;
+	}
+
+	close(fd);
+	filename = (char*)&vfilename[0];
+}
+
+TempFile::~TempFile()
+{
+	bool keepCache = false;
+	const char* keepCacheValue = getenv("KEEP_CACHE");
+	if (keepCacheValue)
+		keepCache = atoi(keepCacheValue);
+	if (!keepCache)
+		unlink(filename.c_str());
+}
+
diff --git a/third_party/cppsim/src/benchmark/tempfile.h b/third_party/cppsim/src/benchmark/tempfile.h
new file mode 100644
index 00000000..14e4f7d7
--- /dev/null
+++ b/third_party/cppsim/src/benchmark/tempfile.h
@@ -0,0 +1,23 @@
+#ifndef TEMP_FILE_H
+#define TEMP_FILE_H
+
+#include <string>
+#include <system_error>
+
+class TempFile
+{
+	std::error_code ec;
+
+	std::string filename;
+
+public :
+
+	const std::string& string(std::error_code& ec) const;
+
+	TempFile(const std::string& mask);
+
+	~TempFile();
+};
+
+#endif // TEMP_FILE_H
+
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 8d3b2da8..60ef2b22 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -1,3 +1,5 @@
+// Ensure hand-written and generated kernels give equal results.
+
 #include "nointrin/kernels.hpp"
 
 namespace generated {

From a61040d487d6bce6592787ad89f2b2fc36000620 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 9 Aug 2022 16:35:49 +0200
Subject: [PATCH 09/82] Embedding kernel generation script as a resource

---
 third_party/cppsim/.gitmodules          |  3 +++
 third_party/cppsim/CMakeLists.txt       | 10 +++++++++-
 third_party/cppsim/ThirdParty/res_embed |  1 +
 3 files changed, 13 insertions(+), 1 deletion(-)
 create mode 160000 third_party/cppsim/ThirdParty/res_embed

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
index ead476e8..1af0c51d 100644
--- a/third_party/cppsim/.gitmodules
+++ b/third_party/cppsim/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "googletest"]
 	path = ThirdParty/googletest
 	url = https://github.com/google/googletest.git
+[submodule "ThirdParty/res_embed"]
+	path = ThirdParty/res_embed
+	url = https://github.com/dmikushin/res_embed.git
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index a1a56cef..4857bb00 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -1,11 +1,18 @@
 cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
 
-project(ProjectQGen)
+project(projectqgen)
 
 find_package(Python3 COMPONENTS Interpreter)
 
 add_subdirectory(ThirdParty/googletest EXCLUDE_FROM_ALL)
 
+add_subdirectory(ThirdParty/res_embed EXCLUDE_FROM_ALL)
+
+include(ResEmbed)
+
+add_library(${PROJECT_NAME} SHARED)
+res_embed(TARGET ${PROJECT_NAME} NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py")
+
 macro(kernelgen)
 	set(oneValueArgs NQUBITS VARIANT TARGET)
 	cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -37,3 +44,4 @@ kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
+
diff --git a/third_party/cppsim/ThirdParty/res_embed b/third_party/cppsim/ThirdParty/res_embed
new file mode 160000
index 00000000..ece62797
--- /dev/null
+++ b/third_party/cppsim/ThirdParty/res_embed
@@ -0,0 +1 @@
+Subproject commit ece6279711c553d6a8d67814ac468549e9dcf31c

From dea77367a7df86b7de941111897169386d0447f9 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 10 Aug 2022 11:41:46 +0200
Subject: [PATCH 10/82] Correcting the wrong order of statements: argument
 parser must come earlier than the use of the parsed arguments

---
 third_party/cppsim/ThirdParty/res_embed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/ThirdParty/res_embed b/third_party/cppsim/ThirdParty/res_embed
index ece62797..acae2559 160000
--- a/third_party/cppsim/ThirdParty/res_embed
+++ b/third_party/cppsim/ThirdParty/res_embed
@@ -1 +1 @@
-Subproject commit ece6279711c553d6a8d67814ac468549e9dcf31c
+Subproject commit acae2559153462208b449fedb1b186522126e3bf

From 3d7087e214ff02cc318e9abe79c0b990c03e16b2 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 10 Aug 2022 14:53:31 +0200
Subject: [PATCH 11/82] Adding the rest of the original _cppsim source code,
 adding the Python native module compilation

---
 third_party/cppsim/.gitmodules                |   3 +
 third_party/cppsim/CMakeLists.txt             |  13 +-
 third_party/cppsim/ThirdParty/pybind11        |   1 +
 third_party/cppsim/include/fusion.hpp         | 167 +++++
 .../include/intrin/alignedallocator.hpp       | 119 ++++
 third_party/cppsim/include/intrin/cintrin.hpp | 124 ++++
 third_party/cppsim/include/intrin/kernel1.hpp |  62 ++
 third_party/cppsim/include/intrin/kernel2.hpp |  71 +++
 third_party/cppsim/include/intrin/kernel3.hpp |  90 +++
 third_party/cppsim/include/intrin/kernel4.hpp | 131 ++++
 third_party/cppsim/include/intrin/kernel5.hpp | 256 ++++++++
 third_party/cppsim/include/intrin/kernels.hpp |  34 +
 third_party/cppsim/include/simulator.hpp      | 580 ++++++++++++++++++
 third_party/cppsim/src/_cppsim.cpp            |  67 ++
 14 files changed, 1713 insertions(+), 5 deletions(-)
 create mode 160000 third_party/cppsim/ThirdParty/pybind11
 create mode 100644 third_party/cppsim/include/fusion.hpp
 create mode 100644 third_party/cppsim/include/intrin/alignedallocator.hpp
 create mode 100644 third_party/cppsim/include/intrin/cintrin.hpp
 create mode 100644 third_party/cppsim/include/intrin/kernel1.hpp
 create mode 100644 third_party/cppsim/include/intrin/kernel2.hpp
 create mode 100644 third_party/cppsim/include/intrin/kernel3.hpp
 create mode 100644 third_party/cppsim/include/intrin/kernel4.hpp
 create mode 100644 third_party/cppsim/include/intrin/kernel5.hpp
 create mode 100644 third_party/cppsim/include/intrin/kernels.hpp
 create mode 100644 third_party/cppsim/include/simulator.hpp
 create mode 100644 third_party/cppsim/src/_cppsim.cpp

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
index 1af0c51d..11cd2be0 100644
--- a/third_party/cppsim/.gitmodules
+++ b/third_party/cppsim/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "ThirdParty/res_embed"]
 	path = ThirdParty/res_embed
 	url = https://github.com/dmikushin/res_embed.git
+[submodule "ThirdParty/pybind11"]
+	path = ThirdParty/pybind11
+	url = https://github.com/pybind/pybind11.git
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 4857bb00..fbc32bcb 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -1,17 +1,20 @@
 cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
 
-project(projectqgen)
-
-find_package(Python3 COMPONENTS Interpreter)
+project(_cppsim)
 
 add_subdirectory(ThirdParty/googletest EXCLUDE_FROM_ALL)
 
 add_subdirectory(ThirdParty/res_embed EXCLUDE_FROM_ALL)
 
+add_subdirectory(ThirdParty/pybind11 EXCLUDE_FROM_ALL)
+
+find_package(Python3 COMPONENTS Interpreter)
+
 include(ResEmbed)
 
-add_library(${PROJECT_NAME} SHARED)
-res_embed(TARGET ${PROJECT_NAME} NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py")
+pybind11_add_module(${PROJECT_NAME} SHARED "src/${PROJECT_NAME}.cpp")
+target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+res_embed(TARGET ${PROJECT_NAME} NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
 
 macro(kernelgen)
 	set(oneValueArgs NQUBITS VARIANT TARGET)
diff --git a/third_party/cppsim/ThirdParty/pybind11 b/third_party/cppsim/ThirdParty/pybind11
new file mode 160000
index 00000000..68e6fdaa
--- /dev/null
+++ b/third_party/cppsim/ThirdParty/pybind11
@@ -0,0 +1 @@
+Subproject commit 68e6fdaa90fc93979e6d5d1e9f788f464593e8f2
diff --git a/third_party/cppsim/include/fusion.hpp b/third_party/cppsim/include/fusion.hpp
new file mode 100644
index 00000000..6539ea91
--- /dev/null
+++ b/third_party/cppsim/include/fusion.hpp
@@ -0,0 +1,167 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATE_QUEUE_HPP_
+#define GATE_QUEUE_HPP_
+
+#include <set>
+#include <vector>
+#include <complex>
+#include <algorithm>
+#include <iostream>
+#include "intrin/alignedallocator.hpp"
+
+class Item{ // One queued gate: a dense matrix plus the list of qubit indices it acts on.
+public:
+    using Index = unsigned;
+    using IndexVector = std::vector<Index>;
+    using Complex = std::complex<double>;
+    using Matrix = std::vector<std::vector<Complex, aligned_allocator<Complex, 64>>>;
+    Item(Matrix mat, IndexVector idx) : mat_(std::move(mat)), idx_(std::move(idx)) {} // by-value sink parameters: moved, not deep-copied a second time
+    Matrix& get_matrix() { return mat_; } // mutable access: Fusion grows the matrix in place when controls are added
+    IndexVector& get_indices() { return idx_; } // mutable access: Fusion appends control qubits here
+private:
+    Matrix mat_;
+    IndexVector idx_;
+};
+
+class Fusion{ // Queue of gates that perform_fusion() multiplies into one dense matrix.
+public:
+    using Index = unsigned;
+    using IndexSet = std::set<Index>;
+    using IndexVector = std::vector<Index>;
+    using Complex = std::complex<double>;
+    using Matrix = std::vector<std::vector<Complex, aligned_allocator<Complex, 64>>>;
+    using ItemVector = std::vector<Item>;
+    // Number of distinct qubits the fused matrix acts on (size of set_).
+    unsigned num_qubits() const {
+        return static_cast<unsigned>(set_.size());
+    }
+    // Number of gates currently queued.
+    std::size_t size() const {
+        return items_.size();
+    }
+    // Queues `matrix` acting on `index_list`; `ctrl_list` is reconciled with the controls shared by the whole queue.
+    void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){
+        for (auto idx : index_list)
+            set_.emplace(idx);
+        // May enlarge matrix/index_list (and the already queued items) with control qubits.
+        handle_controls(matrix, index_list, ctrl_list);
+        // Both arguments are by-value copies owned by this call, so hand them to the queue without copying again.
+        items_.emplace_back(std::move(matrix), std::move(index_list));
+    }
+    // Writes the product of all queued gates to fused_matrix and appends the qubits / shared controls to index_list / ctrl_list (pass both empty: the lookup below needs index_list sorted).
+    void perform_fusion(Matrix& fused_matrix, IndexVector& index_list, IndexVector& ctrl_list){
+        for (auto idx : set_)
+            index_list.push_back(idx);
+        // Start from the 2^N x 2^N identity.
+        std::size_t N = num_qubits();
+        fused_matrix = Matrix(1UL<<N, std::vector<Complex, aligned_allocator<Complex, 64>>(1UL<<N));
+        auto &M = fused_matrix;
+
+        for (std::size_t i = 0; i < (1UL<<N); ++i)
+            M[i][i] = 1.;
+        std::vector<Complex> oldcol(1UL<<N); // scratch column, allocated once and fully rewritten for every column
+        for (auto& item : items_){
+            auto const& idx = item.get_indices();
+            IndexVector idx2mat(idx.size());
+            for (std::size_t i = 0; i < idx.size(); ++i)
+                idx2mat[i] = ((std::equal_range(index_list.begin(), index_list.end(), idx[i])).first - index_list.begin());
+
+            for (std::size_t k = 0; k < (1UL<<N); ++k){ // loop over big matrix columns
+                // Snapshot column k before touching it: every new entry M[i][k]
+                // combines several old entries of that same column, so the
+                // update cannot be done in place.
+                for (std::size_t i = 0; i < (1UL<<N); ++i)
+                    oldcol[i] = M[i][k];
+
+                for (std::size_t i = 0; i < (1UL<<N); ++i){
+                    std::size_t local_i = 0; // row of the item matrix: bits of i at the item's qubit positions
+                    for (std::size_t l = 0; l < idx.size(); ++l)
+                        local_i |= ((i >> idx2mat[l])&1UL)<<l;
+
+                    Complex res = 0.;
+                    for (std::size_t j = 0; j < (1UL<<idx.size()); ++j){
+                        std::size_t locidx = i; // i with the item's qubit bits replaced by the bits of j
+                        for (std::size_t l = 0; l < idx.size(); ++l)
+                            if (((j >> l)&1UL) != ((i >> idx2mat[l])&1UL))
+                                locidx ^= (1UL << idx2mat[l]);
+                        res += oldcol[locidx] * item.get_matrix()[local_i][j];
+                    }
+                    M[i][k] = res;
+                }
+            }
+        }
+        ctrl_list.reserve(ctrl_set_.size());
+        for (auto ctrl : ctrl_set_)
+            ctrl_list.push_back(ctrl);
+    }
+
+private:
+    void add_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& new_ctrls){
+        indexList.reserve(indexList.size()+new_ctrls.size());
+        indexList.insert(indexList.end(), new_ctrls.begin(), new_ctrls.end());
+        // Turn `matrix` into its controlled version: each appended control doubles the dimension.
+        std::size_t F = (1UL << new_ctrls.size());
+        Matrix newmatrix(F*matrix.size(), std::vector<Complex, aligned_allocator<Complex,64>>(F*matrix.size(), 0.));
+
+        std::size_t Offset = newmatrix.size()-matrix.size();
+        // Identity on the first Offset rows; the original matrix fills the bottom-right block.
+        for (std::size_t i = 0; i < Offset; ++i)
+            newmatrix[i][i] = 1.;
+        for (std::size_t i = 0; i < matrix.size(); ++i){
+            for (std::size_t j = 0; j < matrix.size(); ++j)
+                newmatrix[Offset+i][Offset+j] = matrix[i][j];
+        }
+        matrix = std::move(newmatrix);
+    }
+    // Keeps ctrl_set_ equal to the controls common to every queued gate, folding all other controls into the matrices.
+    void handle_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& ctrlList){
+        auto unhandled_ctrl = ctrl_set_; // will contain all ctrls that are not part of the new command
+        // --> need to be removed from the global mask and the controls incorporated into the old
+        // commands (the ones already in the list).
+
+        for (auto ctrlIdx : ctrlList){
+            if (ctrl_set_.count(ctrlIdx) == 0){ // need to either add it to the list or to the command
+                if (items_.size() > 0){ // add it to the command
+                    add_controls(matrix, indexList, {ctrlIdx});
+                    set_.insert(ctrlIdx);
+                }
+                else // add it to the list
+                    ctrl_set_.emplace(ctrlIdx);
+            }
+            else
+                unhandled_ctrl.erase(ctrlIdx);
+        }
+        // remove global controls which are no longer global (because the current command didn't
+        // have it)
+        if (unhandled_ctrl.size() > 0){
+            IndexVector new_ctrls;
+            new_ctrls.reserve(unhandled_ctrl.size());
+            for (auto idx : unhandled_ctrl){
+                new_ctrls.push_back(idx);
+                ctrl_set_.erase(idx);
+                set_.insert(idx);
+            }
+            for (auto &item : items_)
+                add_controls(item.get_matrix(), item.get_indices(), new_ctrls);
+        }
+    }
+
+    IndexSet set_;
+    ItemVector items_;
+    IndexSet ctrl_set_;
+};
+
+#endif
diff --git a/third_party/cppsim/include/intrin/alignedallocator.hpp b/third_party/cppsim/include/intrin/alignedallocator.hpp
new file mode 100644
index 00000000..7719f2d0
--- /dev/null
+++ b/third_party/cppsim/include/intrin/alignedallocator.hpp
@@ -0,0 +1,119 @@
+// Copyright (C) 2012 Andreas Hehn  <hehn@phys.ethz.ch>.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#include <cstdlib>
+#endif
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#if __cplusplus < 201103L
+#define noexcept
+#endif
+
+
+template <typename T, unsigned int Alignment>
+class aligned_allocator // Stateless allocator whose storage is aligned to `Alignment` bytes.
+{
+ public:
+    typedef T* pointer;
+    typedef T const* const_pointer;
+    typedef T& reference;
+    typedef T const& const_reference;
+    typedef T value_type;
+    typedef std::size_t size_type;
+    typedef std::ptrdiff_t difference_type;
+    // Same alignment, different element type.
+    template <typename U>
+    struct rebind
+    {
+        typedef aligned_allocator<U, Alignment> other;
+    };
+    // The allocator has no data members, so construction and copying do nothing.
+    aligned_allocator() noexcept {}
+    aligned_allocator(aligned_allocator const&) noexcept {}
+    template <typename U>
+    aligned_allocator(aligned_allocator<U, Alignment> const&) noexcept
+    {
+    }
+    // Returns uninitialized storage for n objects of type T; throws std::bad_alloc on failure.
+    pointer allocate(size_type n)
+    {
+        void* raw = 0; // a real void*, so posix_memalign never writes through a reinterpret_cast'ed T**
+        // Refuse element counts whose byte size would wrap around size_type.
+        if (n > max_size()) throw std::bad_alloc();
+#ifdef _WIN32
+        raw = _aligned_malloc(n * sizeof(T), Alignment);
+        if (raw == 0) throw std::bad_alloc();
+#else
+        if (posix_memalign(&raw, Alignment, n * sizeof(T)))
+            throw std::bad_alloc();
+#endif
+        return static_cast<pointer>(raw);
+    }
+    // Releases storage obtained from allocate(); the element count is not needed.
+    void deallocate(pointer p, size_type) noexcept
+    {
+#ifdef _WIN32
+        _aligned_free(p);
+#else
+        std::free(p);
+#endif
+    }
+    // Largest n for which n * sizeof(T) still fits in size_type.
+    size_type max_size() const noexcept
+    {
+        // Computed directly: std::allocator<T>::max_size() is deprecated in C++17 and removed in C++20.
+        return static_cast<size_type>(-1) / sizeof(T);
+    }
+    // In-place construction; pre-C++11 builds only get the copy-construct form.
+#if __cplusplus >= 201103L
+    template <typename C, class... Args>
+    void construct(C* c, Args&&... args)
+    {
+        new ((void*)c) C(std::forward<Args>(args)...);
+    }
+#else
+    void construct(pointer p, const_reference t) { new ((void*)p) T(t); }
+#endif
+    // Runs the destructor only; the storage itself is released by deallocate().
+    template <typename C>
+    void destroy(C* c)
+    {
+        c->~C();
+    }
+    // Instances of the same specialization always compare equal; other specializations compare unequal.
+    bool operator==(aligned_allocator const&) const noexcept { return true; }
+    bool operator!=(aligned_allocator const&) const noexcept { return false; }
+    template <typename U, unsigned int UAlignment>
+    bool operator==(aligned_allocator<U, UAlignment> const&) const noexcept
+    {
+        return false;
+    }
+
+    template <typename U, unsigned int UAlignment>
+    bool operator!=(aligned_allocator<U, UAlignment> const&) const noexcept
+    {
+        return true;
+    }
+};
+
+#if __cplusplus < 201103L
+#undef noexcept
+#endif
diff --git a/third_party/cppsim/include/intrin/cintrin.hpp b/third_party/cppsim/include/intrin/cintrin.hpp
new file mode 100644
index 00000000..4319ada2
--- /dev/null
+++ b/third_party/cppsim/include/intrin/cintrin.hpp
@@ -0,0 +1,124 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CINTRIN_HPP_
+#define CINTRIN_HPP_
+
+#include <immintrin.h>
+#include <complex>
+
+#ifndef _mm256_set_m128d // fallback, defined only when no macro of this name exists: builds a 256-bit value from a high and a low 128-bit half
+#define _mm256_set_m128d(hi,lo) _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), (hi), 0x1)
+#endif
+#ifndef _mm256_storeu2_m128d // fallback: unaligned store of the low half to loaddr and of the high half to hiaddr
+#define _mm256_storeu2_m128d(hiaddr,loaddr,a) do { __m256d _a = (a); _mm_storeu_pd((loaddr), _mm256_castpd256_pd128(_a)); _mm_storeu_pd((hiaddr), _mm256_extractf128_pd(_a, 0x1)); } while (0)
+#endif
+#ifndef _mm256_loadu2_m128d // fallback: two unaligned 16-byte loads combined into one 256-bit value
+#define _mm256_loadu2_m128d(hiaddr,loaddr) _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr))
+#endif
+
+template <class T>
+class cintrin;
+
+template <>
+class cintrin<double>{
+public:
+    using calc_t = double;
+    using ret_t = cintrin<calc_t>;
+    // Wrapper around one __m256d that operator[] reads as two complex<double>
+    // values laid out as (re0, im0, re1, im1). The default constructor leaves v_ unset.
+    cintrin() {}
+    // Aligned load of 4 doubles: _mm256_load_pd requires p to be 32-byte aligned.
+    template <class U>
+    cintrin(U const *p){
+        v_ = _mm256_load_pd((calc_t const*)p);
+    }
+    // Two independent unaligned 16-byte loads: p1 fills the low half, p2 the high half.
+    template <class U>
+    cintrin(U const *p1, U const *p2){
+        v_ = _mm256_loadu2_m128d((calc_t const*)p2, (calc_t const*)p1);
+    }
+    // Loads 2 doubles from p (16-byte aligned) into both halves; `broadcast` is unused and only selects this overload.
+    template <class U>
+    cintrin(U const *p, bool broadcast){
+        auto tmp = _mm_load_pd((calc_t const*)p);
+        v_ = _mm256_broadcast_pd(&tmp);
+    }
+    // Fills all four doubles with s1.
+    explicit cintrin(calc_t const& s1){
+        v_ = _mm256_set1_pd(s1);
+    }
+    // Implicit wrap of a raw register.
+    cintrin(__m256d const& v) : v_(v) {  }
+    // Returns complex element i (0 or 1). Uses an unaligned store because a plain stack array is not guaranteed to be 32-byte aligned.
+    std::complex<calc_t> operator[](unsigned i) const{
+        calc_t v[4];
+        _mm256_storeu_pd(v, v_);
+        return {v[i*2], v[i*2+1]};
+    }
+    // Aligned store of all 4 doubles: _mm256_store_pd requires p to be 32-byte aligned.
+    template <class U>
+    void store(U *p) const{
+        _mm256_store_pd((calc_t *)p, v_);
+    }
+    // Unaligned split store: low half to p1, high half to p2.
+    template <class U>
+    void store(U *p1, U *p2) const{
+        _mm256_storeu2_m128d((calc_t *)p2, (calc_t *)p1, v_);
+    }
+    __m256d v_;
+};
+
+inline cintrin<double> mul(cintrin<double> const& c1, cintrin<double> const& c2, cintrin<double> const& c2tm){ // complex product per 128-bit lane, given c2 and its precomputed companion c2tm
+    const __m256d straight = _mm256_mul_pd(c1.v_, c2.v_);  // element-wise c1 * c2
+    const __m256d crossed = _mm256_mul_pd(c1.v_, c2tm.v_); // element-wise c1 * c2tm
+    return cintrin<double>(_mm256_hsub_pd(straight, crossed)); // differences of adjacent pairs, alternating between the two products
+}
+inline cintrin<double> operator*(cintrin<double> const& c1, cintrin<double> const& c2){ // lane-wise complex product c1 * c2
+    const __m256d sign = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    const __m256d swapped = _mm256_permute_pd(c2.v_, 5);  // swap the two doubles inside each 128-bit lane
+    const __m256d twisted = _mm256_mul_pd(swapped, sign); // negate the second double of each lane
+    return mul(c1, c2, twisted);
+}
+inline cintrin<double> operator+(cintrin<double> const& c1, cintrin<double> const& c2){ // element-wise sum of the four doubles
+    return _mm256_add_pd(c1.v_, c2.v_); // wrapped through the implicit cintrin(__m256d const&) constructor
+}
+inline cintrin<double> operator*(cintrin<double> const& c1, double const& d){ // scales all four doubles by the real factor d
+    const __m256d factor = _mm256_set1_pd(d);
+    return cintrin<double>(_mm256_mul_pd(c1.v_, factor));
+}
+inline cintrin<double> operator*(double const& d, cintrin<double> const& c1){ // scalar on the left
+    return operator*(c1, d); // same result as the (vector, scalar) overload
+}
+
+
+
+inline __m256d mul(__m256d const& c1, __m256d const& c2, __m256d const& c2tm){ // raw-register form of the three-operand complex product
+    const __m256d straight = _mm256_mul_pd(c1, c2);  // element-wise c1 * c2
+    const __m256d crossed = _mm256_mul_pd(c1, c2tm); // element-wise c1 * c2tm
+    return _mm256_hsub_pd(straight, crossed); // differences of adjacent pairs, alternating between the two products
+}
+inline __m256d add(__m256d const& c1, __m256d const& c2){ // element-wise sum of two raw registers
+    return _mm256_add_pd(c1, c2);
+}
+template <class U>
+inline __m256d load2(U *p){ // reads two doubles at p and returns them in both 128-bit halves
+    const __m128d pair = _mm_load_pd(reinterpret_cast<double const*>(p)); // aligned load: p must be 16-byte aligned
+    return _mm256_broadcast_pd(&pair);
+}
+template <class U>
+inline __m256d load(U const*p1, U const*p2){ // two doubles from p1 into the low half, two from p2 into the high half
+    return _mm256_loadu2_m128d(reinterpret_cast<double const*>(p2), reinterpret_cast<double const*>(p1));
+}
+#endif
diff --git a/third_party/cppsim/include/intrin/kernel1.hpp b/third_party/cppsim/include/intrin/kernel1.hpp
new file mode 100644
index 00000000..793a116f
--- /dev/null
+++ b/third_party/cppsim/include/intrin/kernel1.hpp
@@ -0,0 +1,62 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m, M const& mt)
+{
+    __m256d v[2];
+    // In-place update of the pair psi[I], psi[I + d0]; each input is fetched with load2().
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    // Sum of mul(v[c], m[c], mt[c]) over both inputs; the low half is written to psi[I], the high half to psi[I + d0].
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1])));
+    // Both inputs were read before the store, so overwriting psi is safe.
+}
+
+// Applies the 2x2 matrix m to qubit id0 of psi. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = std::size_t{1} << id0; // size_t-wide shift: 1UL is only 32 bits wide on LLP64 targets
+    // mm[c] packs column c of m: m[0][c] in the low half, m[1][c] in the high half.
+    __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])};
+    __m256d mmt[2];
+    // mmt[c] is mm[c] with the two doubles of each lane swapped and the second one negated; kernel_core passes it to mul().
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 2; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+    // Only one stride, so there is nothing to sort.
+    std::size_t dsorted[] = {d0};
+    // Visit every index whose id0 bit is clear. `omp for` is a bare worksharing construct: it only splits work inside an enclosing parallel region.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                kernel_core(psi, i0 + i1, d0, mm, mmt);
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, d0, mm, mmt);
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/include/intrin/kernel2.hpp b/third_party/cppsim/include/intrin/kernel2.hpp
new file mode 100644
index 00000000..e1a2c9a9
--- /dev/null
+++ b/third_party/cppsim/include/intrin/kernel2.hpp
@@ -0,0 +1,71 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m, M const& mt)
+{
+    __m256d v[4];
+    // In-place update of the four amplitudes at I + {0, d0} + {0, d1}; each input is fetched with load2().
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    v[2] = load2(&psi[I + d1]);
+    v[3] = load2(&psi[I + d0 + d1]);
+    // Each store writes two outputs: the low half to the index without d0, the high half to the index with d0. m[0..3] feed the first pair, m[4..7] the second.
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3])))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7])))));
+    // All four inputs were read before the first store, so overwriting psi is safe.
+}
+
+// Applies the 4x4 matrix m to qubits id1, id0 of psi. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = std::size_t{1} << id0; // size_t-wide shift: 1UL is only 32 bits wide on LLP64 targets
+    std::size_t d1 = std::size_t{1} << id1;
+    // mm[4*r + c] packs column c of m for the row pair (2r, 2r+1): row 2r in the low half, row 2r+1 in the high half.
+    __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3])};
+    __m256d mmt[8];
+    // mmt[i] is mm[i] with the two doubles of each lane swapped and the second one negated; kernel_core passes it to mul().
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 8; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+    // The loop nest needs the strides from largest to smallest, whatever the argument order.
+    std::size_t dsorted[] = {d0 , d1};
+    std::sort(dsorted, dsorted + 2, std::greater<std::size_t>());
+    // Visit every index whose target bits are all clear. `omp for` is a bare worksharing construct: it only splits work inside an enclosing parallel region.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+                    kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+                    if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
+                        kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/include/intrin/kernel3.hpp b/third_party/cppsim/include/intrin/kernel3.hpp
new file mode 100644
index 00000000..2aac0f8a
--- /dev/null
+++ b/third_party/cppsim/include/intrin/kernel3.hpp
@@ -0,0 +1,90 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m, M const& mt)
+{
+    __m256d v[4];
+    // In-place update of the eight amplitudes at I + {0, d0} + {0, d1} + {0, d2}. First batch of inputs: the four without d2.
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    v[2] = load2(&psi[I + d1]);
+    v[3] = load2(&psi[I + d0 + d1]);
+    // tmp[r] accumulates one output pair: the low half ends up at the index without d0, the high half at the index with d0.
+    __m256d tmp[4];
+    // Partial sums over the first four inputs, using m[4*r + c].
+    tmp[0] = add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3]))));
+    tmp[1] = add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7]))));
+    tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11]))));
+    tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15]))));
+    // Second batch of inputs: the four with d2, combined through m[16 + 4*r + c].
+    v[0] = load2(&psi[I + d2]);
+    v[1] = load2(&psi[I + d0 + d2]);
+    v[2] = load2(&psi[I + d1 + d2]);
+    v[3] = load2(&psi[I + d0 + d1 + d2]);
+    // All eight inputs have been read by now, so the stores may overwrite psi.
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))))));
+
+}
+
+// Applies the 8x8 matrix m to qubits id2, id1, id0 of psi. Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+template <class V, class M>
+void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = std::size_t{1} << id0; // size_t-wide shift: 1UL is only 32 bits wide on LLP64 targets
+    std::size_t d1 = std::size_t{1} << id1;
+    std::size_t d2 = std::size_t{1} << id2;
+    // mm[4*r + c] packs column c (0..3) for the row pair (2r, 2r+1); mm[16 + 4*r + c] does the same for column 4 + c. Row 2r sits in the low half.
+    __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3]), load(&m[4][0], &m[5][0]), load(&m[4][1], &m[5][1]), load(&m[4][2], &m[5][2]), load(&m[4][3], &m[5][3]), load(&m[6][0], &m[7][0]), load(&m[6][1], &m[7][1]), load(&m[6][2], &m[7][2]), load(&m[6][3], &m[7][3]), load(&m[0][4], &m[1][4]), load(&m[0][5], &m[1][5]), load(&m[0][6], &m[1][6]), load(&m[0][7], &m[1][7]), load(&m[2][4], &m[3][4]), load(&m[2][5], &m[3][5]), load(&m[2][6], &m[3][6]), load(&m[2][7], &m[3][7]), load(&m[4][4], &m[5][4]), load(&m[4][5], &m[5][5]), load(&m[4][6], &m[5][6]), load(&m[4][7], &m[5][7]), load(&m[6][4], &m[7][4]), load(&m[6][5], &m[7][5]), load(&m[6][6], &m[7][6]), load(&m[6][7], &m[7][7])};
+    __m256d mmt[32];
+    // mmt[i] is mm[i] with the two doubles of each lane swapped and the second one negated; kernel_core passes it to mul().
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 32; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+    // The loop nest needs the strides from largest to smallest, whatever the argument order.
+    std::size_t dsorted[] = {d0 , d1, d2};
+    std::sort(dsorted, dsorted + 3, std::greater<std::size_t>());
+    // Visit every index whose target bits are all clear. `omp for` is a bare worksharing construct: it only splits work inside an enclosing parallel region.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+                        kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+                        if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
+                            kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/include/intrin/kernel4.hpp b/third_party/cppsim/include/intrin/kernel4.hpp
new file mode 100644
index 00000000..5523a556
--- /dev/null
+++ b/third_party/cppsim/include/intrin/kernel4.hpp
@@ -0,0 +1,131 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m, M const& mt)
+{
+    __m256d v[4];
+    // In-place update of the sixteen amplitudes at I + {0, d0} + {0, d1} + {0, d2} + {0, d3}. Batch 1 of inputs: neither d2 nor d3.
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    v[2] = load2(&psi[I + d1]);
+    v[3] = load2(&psi[I + d0 + d1]);
+    // tmp[r] accumulates one output pair: the low half ends up at the index without d0, the high half at the index with d0.
+    __m256d tmp[8];
+    // Partial sums over batch 1, using m[4*r + c].
+    tmp[0] = add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3]))));
+    tmp[1] = add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7]))));
+    tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11]))));
+    tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15]))));
+    tmp[4] = add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19]))));
+    tmp[5] = add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23]))));
+    tmp[6] = add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))));
+    tmp[7] = add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))));
+    // Batch 2 of inputs: with d2, without d3.
+    v[0] = load2(&psi[I + d2]);
+    v[1] = load2(&psi[I + d0 + d2]);
+    v[2] = load2(&psi[I + d1 + d2]);
+    v[3] = load2(&psi[I + d0 + d1 + d2]);
+    // Add the batch 2 contributions, using m[32 + 4*r + c].
+    tmp[0] = add(tmp[0], add(mul(v[0], m[32], mt[32]), add(mul(v[1], m[33], mt[33]), add(mul(v[2], m[34], mt[34]), mul(v[3], m[35], mt[35])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[36], mt[36]), add(mul(v[1], m[37], mt[37]), add(mul(v[2], m[38], mt[38]), mul(v[3], m[39], mt[39])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[40], mt[40]), add(mul(v[1], m[41], mt[41]), add(mul(v[2], m[42], mt[42]), mul(v[3], m[43], mt[43])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[44], mt[44]), add(mul(v[1], m[45], mt[45]), add(mul(v[2], m[46], mt[46]), mul(v[3], m[47], mt[47])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[48], mt[48]), add(mul(v[1], m[49], mt[49]), add(mul(v[2], m[50], mt[50]), mul(v[3], m[51], mt[51])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[52], mt[52]), add(mul(v[1], m[53], mt[53]), add(mul(v[2], m[54], mt[54]), mul(v[3], m[55], mt[55])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63])))));
+    // Batch 3 of inputs: with d3, without d2.
+    v[0] = load2(&psi[I + d3]);
+    v[1] = load2(&psi[I + d0 + d3]);
+    v[2] = load2(&psi[I + d1 + d3]);
+    v[3] = load2(&psi[I + d0 + d1 + d3]);
+    // Add the batch 3 contributions, using m[64 + 4*r + c].
+    tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67])))));
+    tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71])))));
+    tmp[2] = add(tmp[2], add(mul(v[0], m[72], mt[72]), add(mul(v[1], m[73], mt[73]), add(mul(v[2], m[74], mt[74]), mul(v[3], m[75], mt[75])))));
+    tmp[3] = add(tmp[3], add(mul(v[0], m[76], mt[76]), add(mul(v[1], m[77], mt[77]), add(mul(v[2], m[78], mt[78]), mul(v[3], m[79], mt[79])))));
+    tmp[4] = add(tmp[4], add(mul(v[0], m[80], mt[80]), add(mul(v[1], m[81], mt[81]), add(mul(v[2], m[82], mt[82]), mul(v[3], m[83], mt[83])))));
+    tmp[5] = add(tmp[5], add(mul(v[0], m[84], mt[84]), add(mul(v[1], m[85], mt[85]), add(mul(v[2], m[86], mt[86]), mul(v[3], m[87], mt[87])))));
+    tmp[6] = add(tmp[6], add(mul(v[0], m[88], mt[88]), add(mul(v[1], m[89], mt[89]), add(mul(v[2], m[90], mt[90]), mul(v[3], m[91], mt[91])))));
+    tmp[7] = add(tmp[7], add(mul(v[0], m[92], mt[92]), add(mul(v[1], m[93], mt[93]), add(mul(v[2], m[94], mt[94]), mul(v[3], m[95], mt[95])))));
+    // Batch 4 of inputs: with both d2 and d3.
+    v[0] = load2(&psi[I + d2 + d3]);
+    v[1] = load2(&psi[I + d0 + d2 + d3]);
+    v[2] = load2(&psi[I + d1 + d2 + d3]);
+    v[3] = load2(&psi[I + d0 + d1 + d2 + d3]);
+    // All sixteen inputs have been read by now, so the stores may overwrite psi; the batch 4 terms use m[96 + 4*r + c].
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))));
+
+}
+
+// Applies the 16x16 matrix m to the state vector psi on the four bit
+// positions id3..id0 (the 4-qubit case of the generated kernels).
+//
+// Bit indices id[.] are given from high to low (e.g. control first for CNOT).
+//
+// psi      - state vector; must provide size() and operator[].
+// id3..id0 - bit positions the matrix acts on; each becomes a stride 1 << id.
+// m        - matrix, read as m[row][col] for row, col in 0..15.
+// ctrlmask - kernel_core is only invoked for base indices that have every bit
+//            of ctrlmask set; 0 means "no control bits".
+//
+// NOTE(review): the "#pragma omp for" constructs below are orphaned, so with
+// OpenMP enabled this presumably has to be called from inside a parallel
+// region - confirm against the callers.
+template <class V, class M>
+void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+
+    // Shift a std::size_t one rather than 1UL: unsigned long is only 32 bits
+    // wide on LLP64 targets (e.g. 64-bit Windows), where shifting it by a bit
+    // index >= 32 is undefined behaviour.
+    std::size_t d0 = std::size_t{1} << id0;
+    std::size_t d1 = std::size_t{1} << id1;
+    std::size_t d2 = std::size_t{1} << id2;
+    std::size_t d3 = std::size_t{1} << id3;
+
+    constexpr unsigned kRowPairs = 8;   // 16 rows, two rows per packed entry
+    constexpr unsigned kColBlocks = 4;  // 16 columns, four columns per block
+    constexpr unsigned kEntries = kColBlocks * kRowPairs * 4;  // 128
+
+    // Packed matrix in the order kernel_core indexes it: grouped by block of
+    // four columns, then by row pair, then by column inside the block. Each
+    // entry combines the same column of two consecutive rows via load().
+    __m256d mm[kEntries];
+    for (unsigned cb = 0; cb < kColBlocks; ++cb){
+        for (unsigned rp = 0; rp < kRowPairs; ++rp){
+            for (unsigned k = 0; k < 4; ++k){
+                mm[(cb * kRowPairs + rp) * 4 + k] = load(&m[2 * rp][4 * cb + k], &m[2 * rp + 1][4 * cb + k]);
+            }
+        }
+    }
+
+    // Companion table: every entry of mm with the two doubles of each 128-bit
+    // lane swapped and the second one negated. kernel_core hands it to mul()
+    // together with mm.
+    __m256d mmt[kEntries];
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < kEntries; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+
+    // The loop nest walks the strides from largest to smallest, whatever order
+    // the bit indices were passed in.
+    std::size_t dsorted[] = {d0, d1, d2, d3};
+    std::sort(dsorted, dsorted + 4, std::greater<std::size_t>());
+
+    // The two branches are identical except for the ctrlmask test; they are
+    // kept separate so the uncontrolled path has no branch in the inner loop.
+    // The nests stay in canonical form so that collapse(...) remains valid.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+                            kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+                            // Skip base indices that do not have all control bits set.
+                            if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/include/intrin/kernel5.hpp b/third_party/cppsim/include/intrin/kernel5.hpp
new file mode 100644
index 00000000..9cf781fa
--- /dev/null
+++ b/third_party/cppsim/include/intrin/kernel5.hpp
@@ -0,0 +1,256 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Core of the 5-qubit kernel: replaces the 32 entries psi[I + s], where s
+// ranges over all subset sums of {d0, d1, d2, d3, d4}, with a matrix-vector
+// product, in place.
+//
+// psi    - state vector, indexed with operator[].
+// I      - base index of the group of 32 entries.
+// d0..d4 - index strides of the five bit positions.
+// m, mt  - packed matrix tables with entries 0..511, addressed here as
+//          64 * stage + 4 * rowPair + k; both are passed to mul().
+//
+// The work is split into 8 stages. Each stage reads four inputs and adds its
+// contribution to 16 accumulators, each of which ends up as two output
+// entries (psi[out] and psi[out + d0]). The four-term sum is associated from
+// the right and added to the accumulator with the accumulator as the first
+// operand, matching the fully unrolled form this replaces. All loop bounds are
+// compile-time constants.
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m, M const& mt)
+{
+    constexpr unsigned kStages = 8;     // one per subset of {d2, d3, d4}
+    constexpr unsigned kRowPairs = 16;  // each accumulator covers two outputs
+    constexpr unsigned kLast = kStages - 1;
+
+    __m256d in[4];
+    __m256d acc[kRowPairs];
+
+    for (unsigned stage = 0; stage < kStages; ++stage){
+        // Bits 0, 1 and 2 of the stage number select d2, d3 and d4.
+        const std::size_t src = I + ((stage & 1u) ? d2 : 0) + ((stage & 2u) ? d3 : 0) + ((stage & 4u) ? d4 : 0);
+
+        // All four inputs of a stage are read before anything is written, so
+        // the stores of the last stage cannot clobber a pending input.
+        in[0] = load2(&psi[src]);
+        in[1] = load2(&psi[src + d0]);
+        in[2] = load2(&psi[src + d1]);
+        in[3] = load2(&psi[src + d0 + d1]);
+
+        for (unsigned rp = 0; rp < kRowPairs; ++rp){
+            const unsigned e = 64 * stage + 4 * rp;
+
+            // in[0]*m[e] + (in[1]*m[e+1] + (in[2]*m[e+2] + in[3]*m[e+3]))
+            __m256d sum = add(mul(in[0], m[e], mt[e]), add(mul(in[1], m[e + 1], mt[e + 1]), add(mul(in[2], m[e + 2], mt[e + 2]), mul(in[3], m[e + 3], mt[e + 3]))));
+
+            // The first stage initialises the accumulator, later ones add to it.
+            if (stage != 0){
+                sum = add(acc[rp], sum);
+            }
+
+            if (stage != kLast){
+                acc[rp] = sum;
+                continue;
+            }
+
+            // Last stage: write the finished pair straight back. Bits 0..3 of
+            // the row-pair number select d1, d2, d3 and d4; the first pointer
+            // argument is the entry at out + d0, the second the entry at out.
+            const std::size_t out = I + ((rp & 1u) ? d1 : 0) + ((rp & 2u) ? d2 : 0) + ((rp & 4u) ? d3 : 0) + ((rp & 8u) ? d4 : 0);
+            _mm256_storeu2_m128d(reinterpret_cast<double*>(&psi[out + d0]), reinterpret_cast<double*>(&psi[out]), sum);
+        }
+    }
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class M>
+void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    std::size_t n = psi.size();
+    std::size_t d0 = 1UL << id0;
+    std::size_t d1 = 1UL << id1;
+    std::size_t d2 = 1UL << id2;
+    std::size_t d3 = 1UL << id3;
+    std::size_t d4 = 1UL << id4;
+
+    __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3]), load(&m[4][0], &m[5][0]), load(&m[4][1], &m[5][1]), load(&m[4][2], &m[5][2]), load(&m[4][3], &m[5][3]), load(&m[6][0], &m[7][0]), load(&m[6][1], &m[7][1]), load(&m[6][2], &m[7][2]), load(&m[6][3], &m[7][3]), load(&m[8][0], &m[9][0]), load(&m[8][1], &m[9][1]), load(&m[8][2], &m[9][2]), load(&m[8][3], &m[9][3]), load(&m[10][0], &m[11][0]), load(&m[10][1], &m[11][1]), load(&m[10][2], &m[11][2]), load(&m[10][3], &m[11][3]), load(&m[12][0], &m[13][0]), load(&m[12][1], &m[13][1]), load(&m[12][2], &m[13][2]), load(&m[12][3], &m[13][3]), load(&m[14][0], &m[15][0]), load(&m[14][1], &m[15][1]), load(&m[14][2], &m[15][2]), load(&m[14][3], &m[15][3]), load(&m[16][0], &m[17][0]), load(&m[16][1], &m[17][1]), load(&m[16][2], &m[17][2]), load(&m[16][3], &m[17][3]), load(&m[18][0], &m[19][0]), load(&m[18][1], &m[19][1]), load(&m[18][2], &m[19][2]), load(&m[18][3], &m[19][3]), load(&m[20][0], &m[21][0]), load(&m[20][1], &m[21][1]), load(&m[20][2], &m[21][2]), load(&m[20][3], &m[21][3]), load(&m[22][0], &m[23][0]), load(&m[22][1], &m[23][1]), load(&m[22][2], &m[23][2]), load(&m[22][3], &m[23][3]), load(&m[24][0], &m[25][0]), load(&m[24][1], &m[25][1]), load(&m[24][2], &m[25][2]), load(&m[24][3], &m[25][3]), load(&m[26][0], &m[27][0]), load(&m[26][1], &m[27][1]), load(&m[26][2], &m[27][2]), load(&m[26][3], &m[27][3]), load(&m[28][0], &m[29][0]), load(&m[28][1], &m[29][1]), load(&m[28][2], &m[29][2]), load(&m[28][3], &m[29][3]), load(&m[30][0], &m[31][0]), load(&m[30][1], &m[31][1]), load(&m[30][2], &m[31][2]), load(&m[30][3], &m[31][3]), load(&m[0][4], &m[1][4]), load(&m[0][5], &m[1][5]), load(&m[0][6], &m[1][6]), load(&m[0][7], &m[1][7]), load(&m[2][4], &m[3][4]), load(&m[2][5], &m[3][5]), load(&m[2][6], &m[3][6]), load(&m[2][7], &m[3][7]), load(&m[4][4], 
&m[5][4]), load(&m[4][5], &m[5][5]), load(&m[4][6], &m[5][6]), load(&m[4][7], &m[5][7]), load(&m[6][4], &m[7][4]), load(&m[6][5], &m[7][5]), load(&m[6][6], &m[7][6]), load(&m[6][7], &m[7][7]), load(&m[8][4], &m[9][4]), load(&m[8][5], &m[9][5]), load(&m[8][6], &m[9][6]), load(&m[8][7], &m[9][7]), load(&m[10][4], &m[11][4]), load(&m[10][5], &m[11][5]), load(&m[10][6], &m[11][6]), load(&m[10][7], &m[11][7]), load(&m[12][4], &m[13][4]), load(&m[12][5], &m[13][5]), load(&m[12][6], &m[13][6]), load(&m[12][7], &m[13][7]), load(&m[14][4], &m[15][4]), load(&m[14][5], &m[15][5]), load(&m[14][6], &m[15][6]), load(&m[14][7], &m[15][7]), load(&m[16][4], &m[17][4]), load(&m[16][5], &m[17][5]), load(&m[16][6], &m[17][6]), load(&m[16][7], &m[17][7]), load(&m[18][4], &m[19][4]), load(&m[18][5], &m[19][5]), load(&m[18][6], &m[19][6]), load(&m[18][7], &m[19][7]), load(&m[20][4], &m[21][4]), load(&m[20][5], &m[21][5]), load(&m[20][6], &m[21][6]), load(&m[20][7], &m[21][7]), load(&m[22][4], &m[23][4]), load(&m[22][5], &m[23][5]), load(&m[22][6], &m[23][6]), load(&m[22][7], &m[23][7]), load(&m[24][4], &m[25][4]), load(&m[24][5], &m[25][5]), load(&m[24][6], &m[25][6]), load(&m[24][7], &m[25][7]), load(&m[26][4], &m[27][4]), load(&m[26][5], &m[27][5]), load(&m[26][6], &m[27][6]), load(&m[26][7], &m[27][7]), load(&m[28][4], &m[29][4]), load(&m[28][5], &m[29][5]), load(&m[28][6], &m[29][6]), load(&m[28][7], &m[29][7]), load(&m[30][4], &m[31][4]), load(&m[30][5], &m[31][5]), load(&m[30][6], &m[31][6]), load(&m[30][7], &m[31][7]), load(&m[0][8], &m[1][8]), load(&m[0][9], &m[1][9]), load(&m[0][10], &m[1][10]), load(&m[0][11], &m[1][11]), load(&m[2][8], &m[3][8]), load(&m[2][9], &m[3][9]), load(&m[2][10], &m[3][10]), load(&m[2][11], &m[3][11]), load(&m[4][8], &m[5][8]), load(&m[4][9], &m[5][9]), load(&m[4][10], &m[5][10]), load(&m[4][11], &m[5][11]), load(&m[6][8], &m[7][8]), load(&m[6][9], &m[7][9]), load(&m[6][10], &m[7][10]), load(&m[6][11], &m[7][11]), load(&m[8][8], &m[9][8]), 
load(&m[8][9], &m[9][9]), load(&m[8][10], &m[9][10]), load(&m[8][11], &m[9][11]), load(&m[10][8], &m[11][8]), load(&m[10][9], &m[11][9]), load(&m[10][10], &m[11][10]), load(&m[10][11], &m[11][11]), load(&m[12][8], &m[13][8]), load(&m[12][9], &m[13][9]), load(&m[12][10], &m[13][10]), load(&m[12][11], &m[13][11]), load(&m[14][8], &m[15][8]), load(&m[14][9], &m[15][9]), load(&m[14][10], &m[15][10]), load(&m[14][11], &m[15][11]), load(&m[16][8], &m[17][8]), load(&m[16][9], &m[17][9]), load(&m[16][10], &m[17][10]), load(&m[16][11], &m[17][11]), load(&m[18][8], &m[19][8]), load(&m[18][9], &m[19][9]), load(&m[18][10], &m[19][10]), load(&m[18][11], &m[19][11]), load(&m[20][8], &m[21][8]), load(&m[20][9], &m[21][9]), load(&m[20][10], &m[21][10]), load(&m[20][11], &m[21][11]), load(&m[22][8], &m[23][8]), load(&m[22][9], &m[23][9]), load(&m[22][10], &m[23][10]), load(&m[22][11], &m[23][11]), load(&m[24][8], &m[25][8]), load(&m[24][9], &m[25][9]), load(&m[24][10], &m[25][10]), load(&m[24][11], &m[25][11]), load(&m[26][8], &m[27][8]), load(&m[26][9], &m[27][9]), load(&m[26][10], &m[27][10]), load(&m[26][11], &m[27][11]), load(&m[28][8], &m[29][8]), load(&m[28][9], &m[29][9]), load(&m[28][10], &m[29][10]), load(&m[28][11], &m[29][11]), load(&m[30][8], &m[31][8]), load(&m[30][9], &m[31][9]), load(&m[30][10], &m[31][10]), load(&m[30][11], &m[31][11]), load(&m[0][12], &m[1][12]), load(&m[0][13], &m[1][13]), load(&m[0][14], &m[1][14]), load(&m[0][15], &m[1][15]), load(&m[2][12], &m[3][12]), load(&m[2][13], &m[3][13]), load(&m[2][14], &m[3][14]), load(&m[2][15], &m[3][15]), load(&m[4][12], &m[5][12]), load(&m[4][13], &m[5][13]), load(&m[4][14], &m[5][14]), load(&m[4][15], &m[5][15]), load(&m[6][12], &m[7][12]), load(&m[6][13], &m[7][13]), load(&m[6][14], &m[7][14]), load(&m[6][15], &m[7][15]), load(&m[8][12], &m[9][12]), load(&m[8][13], &m[9][13]), load(&m[8][14], &m[9][14]), load(&m[8][15], &m[9][15]), load(&m[10][12], &m[11][12]), load(&m[10][13], &m[11][13]), load(&m[10][14], 
&m[11][14]), load(&m[10][15], &m[11][15]), load(&m[12][12], &m[13][12]), load(&m[12][13], &m[13][13]), load(&m[12][14], &m[13][14]), load(&m[12][15], &m[13][15]), load(&m[14][12], &m[15][12]), load(&m[14][13], &m[15][13]), load(&m[14][14], &m[15][14]), load(&m[14][15], &m[15][15]), load(&m[16][12], &m[17][12]), load(&m[16][13], &m[17][13]), load(&m[16][14], &m[17][14]), load(&m[16][15], &m[17][15]), load(&m[18][12], &m[19][12]), load(&m[18][13], &m[19][13]), load(&m[18][14], &m[19][14]), load(&m[18][15], &m[19][15]), load(&m[20][12], &m[21][12]), load(&m[20][13], &m[21][13]), load(&m[20][14], &m[21][14]), load(&m[20][15], &m[21][15]), load(&m[22][12], &m[23][12]), load(&m[22][13], &m[23][13]), load(&m[22][14], &m[23][14]), load(&m[22][15], &m[23][15]), load(&m[24][12], &m[25][12]), load(&m[24][13], &m[25][13]), load(&m[24][14], &m[25][14]), load(&m[24][15], &m[25][15]), load(&m[26][12], &m[27][12]), load(&m[26][13], &m[27][13]), load(&m[26][14], &m[27][14]), load(&m[26][15], &m[27][15]), load(&m[28][12], &m[29][12]), load(&m[28][13], &m[29][13]), load(&m[28][14], &m[29][14]), load(&m[28][15], &m[29][15]), load(&m[30][12], &m[31][12]), load(&m[30][13], &m[31][13]), load(&m[30][14], &m[31][14]), load(&m[30][15], &m[31][15]), load(&m[0][16], &m[1][16]), load(&m[0][17], &m[1][17]), load(&m[0][18], &m[1][18]), load(&m[0][19], &m[1][19]), load(&m[2][16], &m[3][16]), load(&m[2][17], &m[3][17]), load(&m[2][18], &m[3][18]), load(&m[2][19], &m[3][19]), load(&m[4][16], &m[5][16]), load(&m[4][17], &m[5][17]), load(&m[4][18], &m[5][18]), load(&m[4][19], &m[5][19]), load(&m[6][16], &m[7][16]), load(&m[6][17], &m[7][17]), load(&m[6][18], &m[7][18]), load(&m[6][19], &m[7][19]), load(&m[8][16], &m[9][16]), load(&m[8][17], &m[9][17]), load(&m[8][18], &m[9][18]), load(&m[8][19], &m[9][19]), load(&m[10][16], &m[11][16]), load(&m[10][17], &m[11][17]), load(&m[10][18], &m[11][18]), load(&m[10][19], &m[11][19]), load(&m[12][16], &m[13][16]), load(&m[12][17], &m[13][17]), load(&m[12][18], 
&m[13][18]), load(&m[12][19], &m[13][19]), load(&m[14][16], &m[15][16]), load(&m[14][17], &m[15][17]), load(&m[14][18], &m[15][18]), load(&m[14][19], &m[15][19]), load(&m[16][16], &m[17][16]), load(&m[16][17], &m[17][17]), load(&m[16][18], &m[17][18]), load(&m[16][19], &m[17][19]), load(&m[18][16], &m[19][16]), load(&m[18][17], &m[19][17]), load(&m[18][18], &m[19][18]), load(&m[18][19], &m[19][19]), load(&m[20][16], &m[21][16]), load(&m[20][17], &m[21][17]), load(&m[20][18], &m[21][18]), load(&m[20][19], &m[21][19]), load(&m[22][16], &m[23][16]), load(&m[22][17], &m[23][17]), load(&m[22][18], &m[23][18]), load(&m[22][19], &m[23][19]), load(&m[24][16], &m[25][16]), load(&m[24][17], &m[25][17]), load(&m[24][18], &m[25][18]), load(&m[24][19], &m[25][19]), load(&m[26][16], &m[27][16]), load(&m[26][17], &m[27][17]), load(&m[26][18], &m[27][18]), load(&m[26][19], &m[27][19]), load(&m[28][16], &m[29][16]), load(&m[28][17], &m[29][17]), load(&m[28][18], &m[29][18]), load(&m[28][19], &m[29][19]), load(&m[30][16], &m[31][16]), load(&m[30][17], &m[31][17]), load(&m[30][18], &m[31][18]), load(&m[30][19], &m[31][19]), load(&m[0][20], &m[1][20]), load(&m[0][21], &m[1][21]), load(&m[0][22], &m[1][22]), load(&m[0][23], &m[1][23]), load(&m[2][20], &m[3][20]), load(&m[2][21], &m[3][21]), load(&m[2][22], &m[3][22]), load(&m[2][23], &m[3][23]), load(&m[4][20], &m[5][20]), load(&m[4][21], &m[5][21]), load(&m[4][22], &m[5][22]), load(&m[4][23], &m[5][23]), load(&m[6][20], &m[7][20]), load(&m[6][21], &m[7][21]), load(&m[6][22], &m[7][22]), load(&m[6][23], &m[7][23]), load(&m[8][20], &m[9][20]), load(&m[8][21], &m[9][21]), load(&m[8][22], &m[9][22]), load(&m[8][23], &m[9][23]), load(&m[10][20], &m[11][20]), load(&m[10][21], &m[11][21]), load(&m[10][22], &m[11][22]), load(&m[10][23], &m[11][23]), load(&m[12][20], &m[13][20]), load(&m[12][21], &m[13][21]), load(&m[12][22], &m[13][22]), load(&m[12][23], &m[13][23]), load(&m[14][20], &m[15][20]), load(&m[14][21], &m[15][21]), load(&m[14][22], 
&m[15][22]), load(&m[14][23], &m[15][23]), load(&m[16][20], &m[17][20]), load(&m[16][21], &m[17][21]), load(&m[16][22], &m[17][22]), load(&m[16][23], &m[17][23]), load(&m[18][20], &m[19][20]), load(&m[18][21], &m[19][21]), load(&m[18][22], &m[19][22]), load(&m[18][23], &m[19][23]), load(&m[20][20], &m[21][20]), load(&m[20][21], &m[21][21]), load(&m[20][22], &m[21][22]), load(&m[20][23], &m[21][23]), load(&m[22][20], &m[23][20]), load(&m[22][21], &m[23][21]), load(&m[22][22], &m[23][22]), load(&m[22][23], &m[23][23]), load(&m[24][20], &m[25][20]), load(&m[24][21], &m[25][21]), load(&m[24][22], &m[25][22]), load(&m[24][23], &m[25][23]), load(&m[26][20], &m[27][20]), load(&m[26][21], &m[27][21]), load(&m[26][22], &m[27][22]), load(&m[26][23], &m[27][23]), load(&m[28][20], &m[29][20]), load(&m[28][21], &m[29][21]), load(&m[28][22], &m[29][22]), load(&m[28][23], &m[29][23]), load(&m[30][20], &m[31][20]), load(&m[30][21], &m[31][21]), load(&m[30][22], &m[31][22]), load(&m[30][23], &m[31][23]), load(&m[0][24], &m[1][24]), load(&m[0][25], &m[1][25]), load(&m[0][26], &m[1][26]), load(&m[0][27], &m[1][27]), load(&m[2][24], &m[3][24]), load(&m[2][25], &m[3][25]), load(&m[2][26], &m[3][26]), load(&m[2][27], &m[3][27]), load(&m[4][24], &m[5][24]), load(&m[4][25], &m[5][25]), load(&m[4][26], &m[5][26]), load(&m[4][27], &m[5][27]), load(&m[6][24], &m[7][24]), load(&m[6][25], &m[7][25]), load(&m[6][26], &m[7][26]), load(&m[6][27], &m[7][27]), load(&m[8][24], &m[9][24]), load(&m[8][25], &m[9][25]), load(&m[8][26], &m[9][26]), load(&m[8][27], &m[9][27]), load(&m[10][24], &m[11][24]), load(&m[10][25], &m[11][25]), load(&m[10][26], &m[11][26]), load(&m[10][27], &m[11][27]), load(&m[12][24], &m[13][24]), load(&m[12][25], &m[13][25]), load(&m[12][26], &m[13][26]), load(&m[12][27], &m[13][27]), load(&m[14][24], &m[15][24]), load(&m[14][25], &m[15][25]), load(&m[14][26], &m[15][26]), load(&m[14][27], &m[15][27]), load(&m[16][24], &m[17][24]), load(&m[16][25], &m[17][25]), load(&m[16][26], 
&m[17][26]), load(&m[16][27], &m[17][27]), load(&m[18][24], &m[19][24]), load(&m[18][25], &m[19][25]), load(&m[18][26], &m[19][26]), load(&m[18][27], &m[19][27]), load(&m[20][24], &m[21][24]), load(&m[20][25], &m[21][25]), load(&m[20][26], &m[21][26]), load(&m[20][27], &m[21][27]), load(&m[22][24], &m[23][24]), load(&m[22][25], &m[23][25]), load(&m[22][26], &m[23][26]), load(&m[22][27], &m[23][27]), load(&m[24][24], &m[25][24]), load(&m[24][25], &m[25][25]), load(&m[24][26], &m[25][26]), load(&m[24][27], &m[25][27]), load(&m[26][24], &m[27][24]), load(&m[26][25], &m[27][25]), load(&m[26][26], &m[27][26]), load(&m[26][27], &m[27][27]), load(&m[28][24], &m[29][24]), load(&m[28][25], &m[29][25]), load(&m[28][26], &m[29][26]), load(&m[28][27], &m[29][27]), load(&m[30][24], &m[31][24]), load(&m[30][25], &m[31][25]), load(&m[30][26], &m[31][26]), load(&m[30][27], &m[31][27]), load(&m[0][28], &m[1][28]), load(&m[0][29], &m[1][29]), load(&m[0][30], &m[1][30]), load(&m[0][31], &m[1][31]), load(&m[2][28], &m[3][28]), load(&m[2][29], &m[3][29]), load(&m[2][30], &m[3][30]), load(&m[2][31], &m[3][31]), load(&m[4][28], &m[5][28]), load(&m[4][29], &m[5][29]), load(&m[4][30], &m[5][30]), load(&m[4][31], &m[5][31]), load(&m[6][28], &m[7][28]), load(&m[6][29], &m[7][29]), load(&m[6][30], &m[7][30]), load(&m[6][31], &m[7][31]), load(&m[8][28], &m[9][28]), load(&m[8][29], &m[9][29]), load(&m[8][30], &m[9][30]), load(&m[8][31], &m[9][31]), load(&m[10][28], &m[11][28]), load(&m[10][29], &m[11][29]), load(&m[10][30], &m[11][30]), load(&m[10][31], &m[11][31]), load(&m[12][28], &m[13][28]), load(&m[12][29], &m[13][29]), load(&m[12][30], &m[13][30]), load(&m[12][31], &m[13][31]), load(&m[14][28], &m[15][28]), load(&m[14][29], &m[15][29]), load(&m[14][30], &m[15][30]), load(&m[14][31], &m[15][31]), load(&m[16][28], &m[17][28]), load(&m[16][29], &m[17][29]), load(&m[16][30], &m[17][30]), load(&m[16][31], &m[17][31]), load(&m[18][28], &m[19][28]), load(&m[18][29], &m[19][29]), load(&m[18][30], 
&m[19][30]), load(&m[18][31], &m[19][31]), load(&m[20][28], &m[21][28]), load(&m[20][29], &m[21][29]), load(&m[20][30], &m[21][30]), load(&m[20][31], &m[21][31]), load(&m[22][28], &m[23][28]), load(&m[22][29], &m[23][29]), load(&m[22][30], &m[23][30]), load(&m[22][31], &m[23][31]), load(&m[24][28], &m[25][28]), load(&m[24][29], &m[25][29]), load(&m[24][30], &m[25][30]), load(&m[24][31], &m[25][31]), load(&m[26][28], &m[27][28]), load(&m[26][29], &m[27][29]), load(&m[26][30], &m[27][30]), load(&m[26][31], &m[27][31]), load(&m[28][28], &m[29][28]), load(&m[28][29], &m[29][29]), load(&m[28][30], &m[29][30]), load(&m[28][31], &m[29][31]), load(&m[30][28], &m[31][28]), load(&m[30][29], &m[31][29]), load(&m[30][30], &m[31][30]), load(&m[30][31], &m[31][31])};
+    __m256d mmt[512];
+
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 512; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+
+    std::size_t dsorted[] = {d0 , d1, d2, d3, d4};
+    std::sort(dsorted, dsorted + 5, std::greater<std::size_t>());
+
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
+                                    kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/third_party/cppsim/include/intrin/kernels.hpp b/third_party/cppsim/include/intrin/kernels.hpp
new file mode 100644
index 00000000..f592142d
--- /dev/null
+++ b/third_party/cppsim/include/intrin/kernels.hpp
@@ -0,0 +1,34 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+#include <complex>
+#include <functional>
+#include <algorithm>
+#include "cintrin.hpp"
+#include "alignedallocator.hpp"
+
+#define LOOP_COLLAPSE1 2
+#define LOOP_COLLAPSE2 3
+#define LOOP_COLLAPSE3 4
+#define LOOP_COLLAPSE4 5
+#define LOOP_COLLAPSE5 6
+
+#include "kernel1.hpp"
+#include "kernel2.hpp"
+#include "kernel3.hpp"
+#include "kernel4.hpp"
+#include "kernel5.hpp"
diff --git a/third_party/cppsim/include/simulator.hpp b/third_party/cppsim/include/simulator.hpp
new file mode 100644
index 00000000..1a84723f
--- /dev/null
+++ b/third_party/cppsim/include/simulator.hpp
@@ -0,0 +1,580 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_HPP_
+#define SIMULATOR_HPP_
+
+#include <vector>
+#include <complex>
+
+#if defined(NOINTRIN) || !defined(INTRIN)
+#include "nointrin/kernels.hpp"
+#else
+#include "intrin/kernels.hpp"
+#endif
+
+#include "intrin/alignedallocator.hpp"
+#include "fusion.hpp"
+#include <map>
+#include <cassert>
+#include <algorithm>
+#include <tuple>
+#include <random>
+#include <functional>
+
+
+class Simulator{
+public:
+    using calc_type = double;
+    using complex_type = std::complex<calc_type>;
+    using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
+    using Map = std::map<unsigned, unsigned>;
+    using RndEngine = std::mt19937;
+    using Term = std::vector<std::pair<unsigned, char>>;
+    using TermsDict = std::vector<std::pair<Term, calc_type>>;
+    using ComplexTermsDict = std::vector<std::pair<Term, complex_type>>;
+
+    Simulator(unsigned seed = 1)
+        : N_(0), vec_(1,0.), fusion_qubits_min_(4), fusion_qubits_max_(5), rnd_eng_(seed) {
+        vec_[0] = 1.; // zero-qubit register: its single amplitude is 1
+        // rng_ yields uniform doubles from [0, 1), drawn from the seeded engine (bound by reference)
+        rng_ = std::bind(std::uniform_real_distribution<double>(0., 1.), std::ref(rnd_eng_));
+    }
+
+    void allocate_qubit(unsigned id){ // register a new qubit `id` and double the state vector; throws std::runtime_error if id is already mapped
+        if (map_.count(id) == 0){
+            map_[id] = N_++; // the new qubit takes the next free bit position
+            StateVector newvec; // avoid large memory allocations
+            if( tmpBuff1_.capacity() >= (1UL << N_) )
+              std::swap(newvec, tmpBuff1_); // reuse the spare buffer when it is already large enough
+            newvec.resize(1UL << N_);
+#pragma omp parallel for schedule(static)
+            for (std::size_t i = 0; i < newvec.size(); ++i)
+                newvec[i] = (i < vec_.size())?vec_[i]:0.; // old amplitudes keep their index; indices beyond the old size are zero
+            std::swap(vec_, newvec);
+            // recycle large memory
+            std::swap(tmpBuff1_, newvec); // the previous state storage becomes the spare buffer
+            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+              std::swap(tmpBuff1_, tmpBuff2_); // keep the larger of the two spares in tmpBuff1_
+        }
+        else
+            throw(std::runtime_error(
+                "AllocateQubit: ID already exists. Qubit IDs should be unique."));
+    }
+
+    bool get_classical_value(unsigned id, calc_type tol = 1.e-12){
+        run();
+        const std::size_t stride = (1UL << map_[id]);
+        // Walk index pairs (k, k+stride), which differ only in this qubit's bit; the first
+        // amplitude whose squared magnitude exceeds tol decides the returned value.
+        for (std::size_t base = 0; base < vec_.size(); base += 2*stride){
+            for (std::size_t k = base; k < base + stride; ++k){
+                if (std::norm(vec_[k]) > tol)
+                    return false;
+                if (std::norm(vec_[k+stride]) > tol)
+                    return true;
+            }
+        }
+        assert(false); // only reachable when no amplitude exceeds tol
+        return false; // keeps compilers quiet about a missing return
+    }
+
+    bool is_classical(unsigned id, calc_type tol = 1.e-12){
+        run();
+        const std::size_t stride = (1UL << map_[id]);
+        // Flags: does any amplitude with this qubit's bit clear / set exceed tol in squared magnitude?
+        short bit_clear = 0, bit_set = 0;
+        #pragma omp parallel for schedule(static) reduction(|:bit_clear,bit_set)
+        for (std::size_t base = 0; base < vec_.size(); base += 2*stride){
+            for (std::size_t off = 0; off < stride; ++off){
+                if (std::norm(vec_[base+off]) > tol)
+                    bit_clear |= 1;
+                if (std::norm(vec_[base+off+stride]) > tol)
+                    bit_set |= 1;
+            }
+        }
+        return (bit_clear != 0) != (bit_set != 0); // true when exactly one of the two flags is set
+    }
+
+    void collapse_vector(unsigned id, bool value = false, bool shrink = false){ // project qubit `id` onto `value`; with shrink the qubit is removed. No rescaling is done here.
+        run();
+        unsigned pos = map_[id];
+        std::size_t delta = (1UL << pos); // index distance between the bit-0 and bit-1 partner amplitudes
+
+        if (!shrink){
+            #pragma omp parallel for schedule(static)
+            for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
+                for (std::size_t j = 0; j < delta; ++j)
+                    vec_[i+j+static_cast<std::size_t>(!value)*delta] = 0.; // clear the amplitude whose bit for this qubit is the opposite of value
+            }
+        }
+        else{
+            StateVector newvec; // avoid costly memory reallocations
+            if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
+              std::swap(tmpBuff1_, newvec);
+            newvec.resize((1UL << (N_-1)));
+            #pragma omp parallel for schedule(static) if(0)
+            for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
+                std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
+                            delta, &newvec[i/2]); // keep only the run of delta entries where the bit equals value, packed at i/2
+            std::swap(vec_, newvec);
+            std::swap(tmpBuff1_, newvec); // old storage becomes the spare buffer
+            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+              std::swap(tmpBuff1_, tmpBuff2_); // keep the larger spare in tmpBuff1_
+
+            for (auto& p : map_){
+                if (p.second > pos)
+                    p.second--; // qubits stored above the removed position move down by one
+            }
+            map_.erase(id);
+            N_--;
+        }
+    }
+
+    void measure_qubits(std::vector<unsigned> const& ids, std::vector<bool> &res){ // samples an outcome for `ids`, writes it to res, collapses and renormalizes vec_
+        run();
+
+        std::vector<unsigned> positions(ids.size());
+        for (unsigned i = 0; i < ids.size(); ++i)
+            positions[i] = map_[ids[i]];
+
+        calc_type P = 0.;
+        calc_type rnd = rng_();
+        // pick entry at random with probability |entry|^2: advance until the cumulative probability exceeds rnd.
+        // '<=' (rather than '<') makes the loop run at least once even when rnd == 0, so `pick--` below cannot wrap around.
+        std::size_t pick = 0;
+        while (P <= rnd && pick < vec_.size())
+            P += std::norm(vec_[pick++]);
+
+        pick--; // the loop leaves pick one past the selected entry
+        // determine result vector (boolean values for each qubit)
+        // and create mask to detect bad entries (i.e., entries that don't agree with measurement)
+        res = std::vector<bool>(ids.size());
+        std::size_t mask = 0;
+        std::size_t val = 0;
+        for (unsigned i = 0; i < ids.size(); ++i){
+            bool r = ((pick >> positions[i]) & 1) == 1;
+            res[i] = r;
+            mask |= (1UL << positions[i]);
+            val |= (static_cast<std::size_t>(r&1) << positions[i]);
+        }
+        // set bad entries to 0
+        calc_type N = 0.; // accumulates the probability of the measured outcome
+        #pragma omp parallel for reduction(+:N) schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i){
+            if ((i & mask) != val)
+                vec_[i] = 0.;
+            else
+                N += std::norm(vec_[i]);
+        }
+        // re-normalize
+        N = 1./std::sqrt(N);
+        #pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i)
+            vec_[i] *= N;
+    }
+
+    std::vector<bool> measure_qubits_return(std::vector<unsigned> const& ids){
+        std::vector<bool> outcomes; // by-value wrapper: measure_qubits fills this with one entry per id
+        measure_qubits(ids, outcomes);
+        return outcomes;
+    }
+
+    void deallocate_qubit(unsigned id){
+        run();
+        assert(map_.count(id) == 1); // id must refer to an allocated qubit
+        const bool definite = is_classical(id);
+        if (!definite)
+            throw(std::runtime_error("Error: Qubit has not been measured / uncomputed! There is most likely a bug in your code."));
+        // Project onto the qubit's definite value and drop it from the state vector (shrink = true).
+        collapse_vector(id, get_classical_value(id), true);
+    }
+
+    template <class M>
+    void apply_controlled_gate(M const& m, const std::vector<unsigned>& ids,
+                               const std::vector<unsigned>& ctrl){
+        // Try the gate on a copy of the pending fusion group before committing to it.
+        auto trial = fused_gates_;
+        trial.insert(m, ids, ctrl);
+        const bool too_big = trial.num_qubits() > fusion_qubits_max_;
+        if (!too_big && trial.num_qubits() >= fusion_qubits_min_){
+            fused_gates_ = trial; // group size within [min, max]: adopt it and execute now
+            run();
+        }
+        else if (too_big
+                 || (trial.num_qubits() - ids.size()) > fused_gates_.num_qubits()){
+            run(); // process what is pending first, then start a new group with this gate
+            fused_gates_.insert(m, ids, ctrl);
+        }
+        else
+            fused_gates_ = trial;
+    }
+
+    template <class F, class QuReg> // applies the classical function f to the integer values held in quregs, on basis states whose control bits are all set
+    void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
+                      bool parallelize = false){ // parallelize is only referenced by the commented-out pragma below
+        run();
+        auto ctrlmask = get_control_mask(ctrl);
+        // translate the qubit ids in quregs (a by-value copy) into bit positions via map_
+        for (unsigned i = 0; i < quregs.size(); ++i)
+            for (unsigned j = 0; j < quregs[i].size(); ++j)
+                quregs[i][j] = map_[quregs[i][j]];
+
+        StateVector newvec; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(newvec, tmpBuff1_);
+        newvec.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); i++)
+          newvec[i] = 0;
+
+//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
+        {
+          std::vector<int> res(quregs.size()); // one integer value per register
+          //#pragma omp for schedule(static)
+          for (std::size_t i = 0; i < vec_.size(); ++i){
+              if ((ctrlmask&i) == ctrlmask){
+                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+                      res[qr_i] = 0;
+                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
+                          res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i; // bit qb_i of the register value comes from index bit quregs[qr_i][qb_i]
+                  }
+                  f(res); // f receives the register values by reference; the updated res is read back below
+                  auto new_i = i;
+                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
+                          if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
+                              new_i ^= (1UL << quregs[qr_i][qb_i]); // flip index bits that disagree with the new register value
+                      }
+                  }
+                  newvec[new_i] += vec_[i]; // amplitude is accumulated at the transformed basis index
+              }
+              else
+                  newvec[i] += vec_[i]; // control bits not all set: amplitude stays at the same index
+          }
+        }
+        std::swap(vec_, newvec);
+        std::swap(tmpBuff1_, newvec); // old storage becomes the spare buffer
+    }
+
+    // Native shortcut (no Python callback): adds the constant a to every register value.
+    template<class QuReg>
+    inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a](std::vector<int> &vals){ for (int& v : vals) v += a; }, quregs, ctrl, true);
+    }
+
+    // Native shortcut (no Python callback): maps every register value v to (v + a) % N.
+    template<class QuReg>
+    inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a,N](std::vector<int> &vals){ for (int& v : vals) v = (v + a) % N; }, quregs, ctrl, true);
+    }
+
+    // faster version without calling python: maps every register value x to (x * a) % N; the product is formed in long long so it cannot overflow int
+    template<class QuReg>
+    inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+    {
+      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = static_cast<int>((static_cast<long long>(x) * a) % N);}, quregs, ctrl, true);
+    }
+
+    calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){ // returns sum over td of coefficient * Re(<saved state | vec_ after apply_term>); vec_ is restored
+        run();
+        calc_type expectation = 0.;
+
+        StateVector current_state; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(tmpBuff1_, current_state);
+        current_state.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i)
+          current_state[i] = vec_[i]; // snapshot of the state before any term is applied
+
+        for (auto const& term : td){
+            auto const& coefficient = term.second;
+            apply_term(term.first, ids, {}); // defined outside this view; presumably applies the term to vec_ in place with no controls
+            calc_type delta = 0.;
+            #pragma omp parallel for reduction(+:delta) schedule(static)
+            for (std::size_t i = 0; i < vec_.size(); ++i){
+                auto const a1 = std::real(current_state[i]);
+                auto const b1 = -std::imag(current_state[i]); // negated imaginary part, i.e. the complex conjugate of the snapshot
+                auto const a2 = std::real(vec_[i]);
+                auto const b2 = std::imag(vec_[i]);
+                delta += a1 * a2 - b1 * b2; // real part of conj(current_state[i]) * vec_[i]
+                // reset vec_
+                vec_[i] = current_state[i];
+            }
+            expectation += coefficient * delta;
+        }
+        std::swap(current_state, tmpBuff1_); // hand the snapshot storage back as the spare buffer
+        return expectation;
+    }
+
+    void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){ // replaces vec_ by the sum over td of coefficient * (vec_ after apply_term)
+        run();
+        StateVector new_state, current_state; // avoid costly memory reallocations
+        if( tmpBuff1_.capacity() >= vec_.size() )
+          std::swap(tmpBuff1_, new_state);
+        if( tmpBuff2_.capacity() >= vec_.size() )
+          std::swap(tmpBuff2_, current_state);
+        new_state.resize(vec_.size());
+        current_state.resize(vec_.size());
+#pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i){
+          new_state[i] = 0; // accumulator for the result
+          current_state[i] = vec_[i]; // snapshot used to restore vec_ after each term
+        }
+        for (auto const& term : td){
+            auto const& coefficient = term.second;
+            apply_term(term.first, ids, {}); // defined outside this view; presumably applies the term to vec_ in place with no controls
+            #pragma omp parallel for schedule(static)
+            for (std::size_t i = 0; i < vec_.size(); ++i){
+                new_state[i] += coefficient * vec_[i];
+                vec_[i] = current_state[i]; // restore the input state for the next term
+            }
+        }
+        std::swap(vec_, new_state);
+        std::swap(tmpBuff1_, new_state); // both work buffers are handed back as spares
+        std::swap(tmpBuff2_, current_state);
+    }
+
+    calc_type get_probability(std::vector<bool> const& bit_string,
+                              std::vector<unsigned> const& ids){
+        run();
+        if (!check_ids(ids))
+            throw(std::runtime_error("get_probability(): Unknown qubit id. Please make sure you have called eng.flush()."));
+        std::size_t selected = 0, expected = 0; // bits of the queried qubits / their requested values
+        for (unsigned k = 0; k < ids.size(); ++k){
+            const unsigned long bit = 1UL << map_[ids[k]];
+            selected |= bit;
+            if (bit_string[k]) expected |= bit;
+        }
+        calc_type total = 0.; // summed squared magnitude of all matching basis states
+        #pragma omp parallel for reduction(+:total) schedule(static)
+        for (std::size_t idx = 0; idx < vec_.size(); ++idx)
+            if ((idx & selected) == expected) total += std::norm(vec_[idx]);
+        return total;
+    }
+
+    complex_type const& get_amplitude(std::vector<bool> const& bit_string,
+                                      std::vector<unsigned> const& ids){
+        run();
+        std::size_t covered = 0; // one bit per qubit position named in ids
+        std::size_t index = 0;   // basis-state index described by bit_string
+        for (unsigned k = 0; k < ids.size() && map_.count(ids[k]) != 0; ++k){
+            const unsigned long bit = 1UL << map_[ids[k]];
+            covered |= bit;
+            if (bit_string[k])
+                index |= bit;
+        }
+        if (covered + 1 != vec_.size())
+            throw(std::runtime_error("The second argument to get_amplitude() must be a permutation of all allocated qubits. Please make sure you have called eng.flush()."));
+        return vec_[index];
+    }
+
+    void emulate_time_evolution(TermsDict const& tdict, calc_type const& time, // evolves vec_ under the operator tdict for `time`, in s sub-steps of a truncated power series
+                                std::vector<unsigned> const& ids,
+                                std::vector<unsigned> const& ctrl){
+        run();
+        complex_type I(0., 1.);
+        calc_type tr = 0., op_nrm = 0.;
+        TermsDict td;
+        for (unsigned i = 0; i < tdict.size(); ++i){
+            if (tdict[i].first.size() == 0)
+                tr += tdict[i].second; // empty terms are summed into tr and applied via `correction` below
+            else{
+                td.push_back(tdict[i]);
+                op_nrm += std::abs(tdict[i].second); // sum of |coefficient| over the non-empty terms
+            }
+        }
+        unsigned s = std::abs(time) * op_nrm + 1.; // number of sub-steps
+        complex_type correction = std::exp(-time * I * tr / (double)s); // per-sub-step factor from the empty terms
+        auto output_state = vec_;
+        auto ctrlmask = get_control_mask(ctrl);
+        for (unsigned i = 0; i < s; ++i){
+            calc_type nrm_change = 1.;
+            for (unsigned k = 0; nrm_change > 1.e-12; ++k){ // add series terms until the last one has norm <= 1e-12
+                auto coeff = (-time * I) / double(s * (k + 1));
+                auto current_state = vec_; // vec_ holds the previous series term at this point
+                auto update = StateVector(vec_.size(), 0.);
+                for (auto const& tup : td){
+                    apply_term(tup.first, ids, {}); // defined outside this view; presumably applies the term to vec_ in place
+                    #pragma omp parallel for schedule(static)
+                    for (std::size_t j = 0; j < vec_.size(); ++j){
+                        update[j] += vec_[j] * tup.second;
+                        vec_[j] = current_state[j]; // restore for the next term
+                    }
+                }
+                nrm_change = 0.;
+                #pragma omp parallel for reduction(+:nrm_change) schedule(static)
+                for (std::size_t j = 0; j < vec_.size(); ++j){
+                    update[j] *= coeff;
+                    vec_[j] = update[j]; // the new series term feeds the next iteration
+                    if ((j & ctrlmask) == ctrlmask){
+                        output_state[j] += update[j]; // only control-satisfied entries are accumulated
+                        nrm_change += std::norm(update[j]);
+                    }
+                }
+                nrm_change = std::sqrt(nrm_change);
+            }
+            #pragma omp parallel for schedule(static)
+            for (std::size_t j = 0; j < vec_.size(); ++j){
+                if ((j & ctrlmask) == ctrlmask)
+                    output_state[j] *= correction;
+                vec_[j] = output_state[j]; // the result of this sub-step is the input of the next
+            }
+        }
+    }
+
+    void set_wavefunction(StateVector const& wavefunction, std::vector<unsigned> const& ordering){ // overwrite the state; ordering[i] becomes bit position i
+        run();
+        // make sure there are 2^n amplitudes for n qubits; thrown (not assert-ed) so the copy below stays in bounds when NDEBUG is defined
+        if (wavefunction.size() != (1UL << ordering.size()))
+            throw(std::length_error("set_wavefunction(): wavefunction must contain 2^n amplitudes for n qubits"));
+        // check that all qubits have been allocated previously
+        if (map_.size() != ordering.size() || !check_ids(ordering))
+            throw(std::runtime_error("set_wavefunction(): Invalid mapping provided. Please make sure all qubits have been allocated previously (call eng.flush())."));
+        // set mapping and wavefunction
+        for (unsigned i = 0; i < ordering.size(); ++i)
+            map_[ordering[i]] = i;
+        #pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < wavefunction.size(); ++i)
+            vec_[i] = wavefunction[i];
+    }
+
+    void collapse_wavefunction(std::vector<unsigned> const& ids, std::vector<bool> const& values){
+        run();
+        if (ids.size() != values.size())
+            throw(std::length_error("collapse_wavefunction(): ids and values size mismatch"));
+        if (!check_ids(ids))
+            throw(std::runtime_error("collapse_wavefunction(): Unknown qubit id(s) provided. Try calling eng.flush() before invoking this function."));
+        std::size_t mask = 0, val = 0;
+        for (unsigned i = 0; i < ids.size(); ++i){
+            mask |= (1UL << map_[ids[i]]);
+            val |= ((values[i]?1UL:0UL) << map_[ids[i]]);
+        }
+        // set bad entries to 0 and compute probability of outcome to renormalize
+        calc_type N = 0.;
+        #pragma omp parallel for reduction(+:N) schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i){
+            if ((i & mask) == val)
+                N += std::norm(vec_[i]);
+        }
+        if (N < 1.e-12)
+            throw(std::runtime_error("collapse_wavefunction(): Invalid collapse! Probability is ~0."));
+        // re-normalize (if possible)
+        N = 1./std::sqrt(N);
+        #pragma omp parallel for schedule(static)
+        for (std::size_t i = 0; i < vec_.size(); ++i){
+            if ((i & mask) != val)
+                vec_[i] = 0.;
+            else
+                vec_[i] *= N;
+        }
+    }
+
+    void run(){
+        if (fused_gates_.size() < 1)
+            return;
+
+        Fusion::Matrix m;
+        Fusion::IndexVector ids, ctrls;
+
+        fused_gates_.perform_fusion(m, ids, ctrls);
+
+        for (auto& id : ids)
+            id = map_[id];
+
+        auto ctrlmask = get_control_mask(ctrls);
+
+        switch (ids.size()){
+            case 1:
+                #pragma omp parallel
+                kernel(vec_, ids[0], m, ctrlmask);
+                break;
+            case 2:
+                #pragma omp parallel
+                kernel(vec_, ids[1], ids[0], m, ctrlmask);
+                break;
+            case 3:
+                #pragma omp parallel
+                kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask);
+                break;
+            case 4:
+                #pragma omp parallel
+                kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
+                break;
+            case 5:
+                #pragma omp parallel
+                kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
+                break;
+            default:
+                throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
+        }
+
+        fused_gates_ = Fusion();
+    }
+
+    std::tuple<Map, StateVector&> cheat(){
+        run();
+        return make_tuple(map_, std::ref(vec_));
+    }
+
+    ~Simulator(){
+    }
+
+private:
+    void apply_term(Term const& term, std::vector<unsigned> const& ids,
+                    std::vector<unsigned> const& ctrl){
+        complex_type I(0., 1.);
+        Fusion::Matrix X = {{0., 1.}, {1., 0.}};
+        Fusion::Matrix Y = {{0., -I}, {I, 0.}};
+        Fusion::Matrix Z = {{1., 0.}, {0., -1.}};
+        std::vector<Fusion::Matrix> gates = {X, Y, Z};
+        for (auto const& local_op : term){
+            unsigned id = ids[local_op.first];
+            apply_controlled_gate(gates[local_op.second - 'X'], {id}, ctrl);
+        }
+        run();
+    }
+    std::size_t get_control_mask(std::vector<unsigned> const& ctrls){
+        std::size_t ctrlmask = 0;
+        for (auto c : ctrls)
+            ctrlmask |= (1UL << map_[c]);
+        return ctrlmask;
+    }
+
+    bool check_ids(std::vector<unsigned> const& ids){
+        for (auto id : ids)
+            if (!map_.count(id))
+                return false;
+        return true;
+    }
+
+    unsigned N_; // #qubits
+    StateVector vec_;
+    Map map_;
+    Fusion fused_gates_;
+    unsigned fusion_qubits_min_, fusion_qubits_max_;
+    RndEngine rnd_eng_;
+    std::function<double()> rng_;
+
+    // large array buffers to avoid costly reallocations
+    static StateVector tmpBuff1_, tmpBuff2_;
+};
+
+Simulator::StateVector Simulator::tmpBuff1_;
+Simulator::StateVector Simulator::tmpBuff2_;
+
+#endif
diff --git a/third_party/cppsim/src/_cppsim.cpp b/third_party/cppsim/src/_cppsim.cpp
new file mode 100644
index 00000000..a8122155
--- /dev/null
+++ b/third_party/cppsim/src/_cppsim.cpp
@@ -0,0 +1,67 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/complex.h>
+#include <pybind11/stl.h>
+#include <pybind11/pytypes.h>
+#include <vector>
+#include <complex>
+#include <iostream>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#include "simulator.hpp"
+
+namespace py = pybind11;
+
+using c_type = std::complex<double>;
+using ArrayType = std::vector<c_type, aligned_allocator<c_type,64>>;
+using MatrixType = std::vector<ArrayType>;
+using QuRegs = std::vector<std::vector<unsigned>>;
+
+template <class QR>
+void emulate_math_wrapper(Simulator &sim, py::function const& pyfunc, QR const& qr, std::vector<unsigned> const& ctrls){
+    auto f = [&](std::vector<int>& x) {
+        pybind11::gil_scoped_acquire acquire;
+        x = pyfunc(x).cast<std::vector<int>>();
+    };
+    pybind11::gil_scoped_release release;
+    sim.emulate_math(f, qr, ctrls);
+}
+
+PYBIND11_MODULE(_cppsim, m)
+{
+    py::class_<Simulator>(m, "Simulator")
+        .def(py::init<unsigned>())
+        .def("allocate_qubit", &Simulator::allocate_qubit)
+        .def("deallocate_qubit", &Simulator::deallocate_qubit)
+        .def("get_classical_value", &Simulator::get_classical_value)
+        .def("is_classical", &Simulator::is_classical)
+        .def("measure_qubits", &Simulator::measure_qubits_return)
+        .def("apply_controlled_gate", &Simulator::apply_controlled_gate<MatrixType>)
+        .def("emulate_math", &emulate_math_wrapper<QuRegs>)
+        .def("emulate_math_addConstant", &Simulator::emulate_math_addConstant<QuRegs>)
+        .def("emulate_math_addConstantModN", &Simulator::emulate_math_addConstantModN<QuRegs>)
+        .def("emulate_math_multiplyByConstantModN", &Simulator::emulate_math_multiplyByConstantModN<QuRegs>)
+        .def("get_expectation_value", &Simulator::get_expectation_value)
+        .def("apply_qubit_operator", &Simulator::apply_qubit_operator)
+        .def("emulate_time_evolution", &Simulator::emulate_time_evolution)
+        .def("get_probability", &Simulator::get_probability)
+        .def("get_amplitude", &Simulator::get_amplitude)
+        .def("set_wavefunction", &Simulator::set_wavefunction)
+        .def("collapse_wavefunction", &Simulator::collapse_wavefunction)
+        .def("run", &Simulator::run)
+        .def("cheat", &Simulator::cheat)
+        ;
+}

From 707f6fc5926130488ccda2647f6bca5ea184d5c0 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 10 Aug 2022 17:54:20 +0200
Subject: [PATCH 12/82] Adding support for both plain and keyword link
 libraries specification, based on the KEYWORD option

---
 third_party/cppsim/ThirdParty/res_embed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/ThirdParty/res_embed b/third_party/cppsim/ThirdParty/res_embed
index acae2559..fec344c6 160000
--- a/third_party/cppsim/ThirdParty/res_embed
+++ b/third_party/cppsim/ThirdParty/res_embed
@@ -1 +1 @@
-Subproject commit acae2559153462208b449fedb1b186522126e3bf
+Subproject commit fec344c60b7ed5b4303c98e670c4bea8eead7d2e

From 2bfe527226d549dbaf4e8e640eb824d7e48db081 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 11 Aug 2022 14:08:50 +0200
Subject: [PATCH 13/82] Working on the embedded kernel generation and a
 benchmark executable

---
 third_party/cppsim/CMakeLists.txt             | 14 ++++--
 third_party/cppsim/include/kernelgen.hpp      | 30 +++++++++++
 third_party/cppsim/include/simulator.hpp      |  6 ++-
 .../cppsim/src/benchmark/benchmark.cpp        | 50 +++++++++++++++++++
 third_party/cppsim/src/kernelgen.cpp          | 40 +++++++++++++++
 third_party/cppsim/src/test/test_nointrin.cpp |  2 -
 6 files changed, 136 insertions(+), 6 deletions(-)
 create mode 100644 third_party/cppsim/include/kernelgen.hpp
 create mode 100644 third_party/cppsim/src/benchmark/benchmark.cpp
 create mode 100644 third_party/cppsim/src/kernelgen.cpp

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index fbc32bcb..8f402e77 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -8,13 +8,18 @@ add_subdirectory(ThirdParty/res_embed EXCLUDE_FROM_ALL)
 
 add_subdirectory(ThirdParty/pybind11 EXCLUDE_FROM_ALL)
 
-find_package(Python3 COMPONENTS Interpreter)
+find_package(Python3 COMPONENTS Interpreter Development)
 
 include(ResEmbed)
 
+add_library(kernelgen STATIC "src/kernelgen.cpp")
+set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+res_embed(TARGET kernelgen NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
+target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python)
+
 pybind11_add_module(${PROJECT_NAME} SHARED "src/${PROJECT_NAME}.cpp")
-target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-res_embed(TARGET ${PROJECT_NAME} NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
+target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
 
 macro(kernelgen)
 	set(oneValueArgs NQUBITS VARIANT TARGET)
@@ -48,3 +53,6 @@ kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
 
+add_executable(benchmark "src/benchmark/benchmark.cpp")
+target_link_libraries(benchmark PRIVATE gtest kernelgen)
+
diff --git a/third_party/cppsim/include/kernelgen.hpp b/third_party/cppsim/include/kernelgen.hpp
new file mode 100644
index 00000000..42848f87
--- /dev/null
+++ b/third_party/cppsim/include/kernelgen.hpp
@@ -0,0 +1,30 @@
+#ifndef KERNELGEN_HPP
+#define KERNELGEN_HPP
+
+#include <string>
+
+class KernelGen
+{
+	std::string nointrin;
+
+public :
+
+	std::string generate(int nqubits);
+
+	KernelGen();
+};
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template <class V, class Id, class M>
+void kernelgen(V &psi, Id ids, M const& m, std::size_t ctrlmask)
+{
+	static KernelGen g;
+
+	// Generate the kernel source code.
+	auto source = g.generate(ids.size());
+	
+	// TODO Compile the source code using external compiler.
+}
+
+#endif // KERNELGEN_HPP
+
diff --git a/third_party/cppsim/include/simulator.hpp b/third_party/cppsim/include/simulator.hpp
index 1a84723f..b50afb61 100644
--- a/third_party/cppsim/include/simulator.hpp
+++ b/third_party/cppsim/include/simulator.hpp
@@ -26,6 +26,8 @@
 
 #include "intrin/alignedallocator.hpp"
 #include "fusion.hpp"
+#include "kernelgen.hpp"
+
 #include <map>
 #include <cassert>
 #include <algorithm>
@@ -520,7 +522,9 @@ class Simulator{
                 kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
                 break;
             default:
-                throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
+                // Use embedded generator to generate larger gates in runtime
+                kernelgen(vec_, ids, m, ctrlmask);
+		break;
         }
 
         fused_gates_ = Fusion();
diff --git a/third_party/cppsim/src/benchmark/benchmark.cpp b/third_party/cppsim/src/benchmark/benchmark.cpp
new file mode 100644
index 00000000..4766101f
--- /dev/null
+++ b/third_party/cppsim/src/benchmark/benchmark.cpp
@@ -0,0 +1,50 @@
+#include "kernelgen.hpp"
+
+#include "gtest/gtest.h"
+
+#include <array>
+#include <random>
+#include <vector>
+
+// Fills integer test inputs for an nqubits-wide gate and hands them to kernelgen(); returns true.
+template<int nqubits>
+bool benchmark()
+{
+	std::array<unsigned, nqubits> ids;
+	size_t n = 1;
+	for (int i = 0; i < nqubits; i++)
+	{
+		ids[i] = i;
+		n += 1UL << i;
+	}
+
+	std::default_random_engine dre;
+	std::uniform_int_distribution<int> uid(0, 1000);
+
+	// Generate m matrix as integers.
+	std::array<std::array<int, nqubits>, nqubits> m;
+	for (int j = 0; j < m.size(); j++)
+		for (int i = 0; i < m.size(); i++)
+			m[j][i] = uid(dre);
+
+	// Generate psi matrix as integers.
+	std::vector<int> psi(n);
+	for (int i = 0; i < psi.size(); i++)
+		psi[i] = uid(dre);
+	// Generate control mask.
+	std::size_t ctrlmask = 0; // uid(dre);
+	kernelgen(psi, ids, m, ctrlmask);
+	return true; // a bool function must return a value: flowing off the end is undefined behavior
+}
+
+TEST(nointrin, kernel6)
+{
+	benchmark<6>();
+}
+
+int main(int argc, char* argv[])
+{
+	::testing::InitGoogleTest(&argc, argv);
+	return RUN_ALL_TESTS();
+}
+
diff --git a/third_party/cppsim/src/kernelgen.cpp b/third_party/cppsim/src/kernelgen.cpp
new file mode 100644
index 00000000..842a24bd
--- /dev/null
+++ b/third_party/cppsim/src/kernelgen.cpp
@@ -0,0 +1,40 @@
+#include "kernelgen.hpp"
+#include "res_embed.h"
+
+#include <pybind11/eval.h>
+
+namespace py = pybind11;
+
+std::string KernelGen::generate(int nqubits)
+{
+	// TODO Use embedded Python interpreter to run the script
+	// and get the resulting string of source code.
+	// We intentionally keep the generator in Python, in order
+	// to let the people to customize it more easily.
+	py::object scope = py::module_::import("__main__").attr("__dict__");
+	py::eval(nointrin, scope);
+}
+
+namespace res {
+
+namespace embed {
+
+namespace init {
+
+void nointrin();
+
+} // namespace init
+
+} // namespace embed
+
+} // namespace res
+
+KernelGen::KernelGen()
+{
+	// Extract the Python script.
+	res::embed::init::nointrin();
+	size_t size = 0;
+	auto source = res::embed::get("nointrin", &size);
+	nointrin = std::string(source, size);
+}
+
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 60ef2b22..4c4013c5 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -21,8 +21,6 @@ namespace generated {
 template<int nqubits, typename Kernels, typename V>
 bool compare(Kernels kernels, V& psi1)
 {
-	constexpr auto dim = 1UL << nqubits;
-
 	std::default_random_engine dre;
 	std::uniform_int_distribution<int> uid(0, 1000);
 

From 3e37b9693cfb9a824a6fdbc01298968da533d5d5 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 11 Aug 2022 17:10:29 +0200
Subject: [PATCH 14/82] Updating the README

---
 third_party/cppsim/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/third_party/cppsim/README.md b/third_party/cppsim/README.md
index a1e07cb0..a74b4c6d 100644
--- a/third_party/cppsim/README.md
+++ b/third_party/cppsim/README.md
@@ -57,4 +57,7 @@ mkdir build
 cd build
 cmake .. -G Ninja
 ninja
+./test_nointrin
+./benchmark
 ```
+

From f5996f9af1a9975e16190f187f0eaf2a83a5eac8 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 12 Aug 2022 11:16:02 +0200
Subject: [PATCH 15/82] Clarifying the README description

---
 third_party/cppsim/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/README.md b/third_party/cppsim/README.md
index a74b4c6d..a91d4908 100644
--- a/third_party/cppsim/README.md
+++ b/third_party/cppsim/README.md
@@ -1,6 +1,7 @@
-# ProjectQGen
+# _cppsim
 
-Generate Haener-Steiger quantum kernels in the form used in ProjectQ simulator.
+This is a standalone simulator backend for the ProjectQ framework, extended to
+generate Haener-Steiger quantum kernels for arbitrary number of qubits.
 
 ## Description
 

From 6c352c534df32723a17e8e8cc6ac34e94648325c Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 12 Aug 2022 17:26:24 +0200
Subject: [PATCH 16/82] Use embedded Python interpreter to run the script and
 get the resulting string of source code. We intentionally keep the generator
 in Python, in order to let people customize it more easily.

---
 .../cppsim/include/nointrin/kernelgen.py      | 23 +++++++++++--------
 third_party/cppsim/src/kernelgen.cpp          | 21 ++++++++++++++---
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index f376e1e9..bd730297 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -3,15 +3,7 @@
 import itertools
 import os
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Generate Haener-Steiger quantum kernels in the form used in ProjectQ simulator')
-    parser.add_argument('nqubits', type=int, help='The number of qubits to generate the kernel for')
-    parser.add_argument('output', type=str, help='Output file name')
-    args = parser.parse_args()
-    
-    nqubits = int(args.nqubits)
-    output = args.output
-
+def kernelgen(nqubits):
     # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
     # 0 0
     # 1 0
@@ -86,10 +78,21 @@ def rhs(n, j, i):
 }}
 """
 
+    return kernel
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate Haener-Steiger quantum kernels in the form used in ProjectQ simulator')
+    parser.add_argument('nqubits', type=int, help='The number of qubits to generate the kernel for')
+    parser.add_argument('output', type=str, help='Output file name')
+    args = parser.parse_args()
+    
+    nqubits = int(args.nqubits)
+    output = args.output
+
     try:
         os.makedirs(os.path.dirname(output))
     except:
         pass
     with open(output, "w") as o:
-        o.write(kernel)
+        o.write(kernelgen(nqubits))
 
diff --git a/third_party/cppsim/src/kernelgen.cpp b/third_party/cppsim/src/kernelgen.cpp
index 842a24bd..6c57251b 100644
--- a/third_party/cppsim/src/kernelgen.cpp
+++ b/third_party/cppsim/src/kernelgen.cpp
@@ -1,18 +1,33 @@
 #include "kernelgen.hpp"
 #include "res_embed.h"
 
+#include <iostream>
+#include <pybind11/embed.h>
 #include <pybind11/eval.h>
 
 namespace py = pybind11;
 
 std::string KernelGen::generate(int nqubits)
 {
-	// TODO Use embedded Python interpreter to run the script
+	// Use embedded Python interpreter to run the script
 	// and get the resulting string of source code.
 	// We intentionally keep the generator in Python, in order
 	// to let the people to customize it more easily.
-	py::object scope = py::module_::import("__main__").attr("__dict__");
-	py::eval(nointrin, scope);
+	py::scoped_interpreter guard {};
+	try
+	{
+		py::dict globals = py::globals();
+		// Assign the __name__, otherwise it is set to "__main__" by default.
+		globals["__name__"] = "kernelgen";
+		py::eval<py::eval_statements>(nointrin, globals, globals);
+		auto source = globals["kernelgen"](nqubits).cast<std::string>();
+		return source;
+	}
+	catch (pybind11::error_already_set e)
+	{
+		std::cerr << "Unable to invoke the Python script: " << e.what() << std::endl;
+		exit(-1);
+	}
 }
 
 namespace res {

From befaca6ca7ed4ca3aa80a53f8eaecab94e6534e4 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 15 Aug 2022 14:27:49 +0200
Subject: [PATCH 17/82] Adding intermediate files as dependencies, in order to
 perform proper rebuild after resource modification

---
 third_party/cppsim/ThirdParty/res_embed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/ThirdParty/res_embed b/third_party/cppsim/ThirdParty/res_embed
index fec344c6..112588c8 160000
--- a/third_party/cppsim/ThirdParty/res_embed
+++ b/third_party/cppsim/ThirdParty/res_embed
@@ -1 +1 @@
-Subproject commit fec344c60b7ed5b4303c98e670c4bea8eead7d2e
+Subproject commit 112588c82a713f6c37ec78b7921fb26448b826c9

From b6f98b1d2f6d4bf9ebd57b3d1f67fbb1d38fdb58 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 15 Aug 2022 19:12:56 +0200
Subject: [PATCH 18/82] Working on the runtime compilation support

---
 third_party/cppsim/.gitmodules                |   3 +
 third_party/cppsim/CMakeLists.txt             |   4 +-
 third_party/cppsim/ThirdParty/digestpp        |   1 +
 third_party/cppsim/include/kernelgen.hpp      |  20 +-
 .../{src/benchmark => include}/tempfile.h     |   0
 third_party/cppsim/src/compiler.cpp           | 228 ++++++++++++++++++
 .../cppsim/src/{benchmark => }/tempfile.cpp   |   0
 7 files changed, 253 insertions(+), 3 deletions(-)
 create mode 160000 third_party/cppsim/ThirdParty/digestpp
 rename third_party/cppsim/{src/benchmark => include}/tempfile.h (100%)
 create mode 100644 third_party/cppsim/src/compiler.cpp
 rename third_party/cppsim/src/{benchmark => }/tempfile.cpp (100%)

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
index 11cd2be0..dcb9d6d6 100644
--- a/third_party/cppsim/.gitmodules
+++ b/third_party/cppsim/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "ThirdParty/pybind11"]
 	path = ThirdParty/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "build/ThirdParty/digestpp"]
+	path = ThirdParty/digestpp
+	url = https://github.com/kerukuro/digestpp.git
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 8f402e77..e08d9b1f 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -12,9 +12,11 @@ find_package(Python3 COMPONENTS Interpreter Development)
 
 include(ResEmbed)
 
-add_library(kernelgen STATIC "src/kernelgen.cpp")
+add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp" "src/tempfile.cpp")
+set_target_properties(kernelgen PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/digestpp)
 res_embed(TARGET kernelgen NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
 target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python)
 
diff --git a/third_party/cppsim/ThirdParty/digestpp b/third_party/cppsim/ThirdParty/digestpp
new file mode 160000
index 00000000..4ec41066
--- /dev/null
+++ b/third_party/cppsim/ThirdParty/digestpp
@@ -0,0 +1 @@
+Subproject commit 4ec4106677e652a90716ad929d657a622089ef16
diff --git a/third_party/cppsim/include/kernelgen.hpp b/third_party/cppsim/include/kernelgen.hpp
index 42848f87..7c2dec79 100644
--- a/third_party/cppsim/include/kernelgen.hpp
+++ b/third_party/cppsim/include/kernelgen.hpp
@@ -1,6 +1,9 @@
 #ifndef KERNELGEN_HPP
 #define KERNELGEN_HPP
 
+#include "compiler.h"
+
+#include <iostream>
 #include <string>
 
 class KernelGen
@@ -20,10 +23,23 @@ void kernelgen(V &psi, Id ids, M const& m, std::size_t ctrlmask)
 {
 	static KernelGen g;
 
+	const auto nqubits = ids.size();
+
 	// Generate the kernel source code.
-	auto source = g.generate(ids.size());
+	auto source = g.generate(nqubits);
+	
+	// Compile the source code using external compiler.
+	std::string errmsg;
+	void* handle = get_compiler().codegen(nqubits, source, errmsg);
+	if (!handle)
+	{
+		std::cerr << "Kernel generation has failed, aborting:" << std::endl;
+		std::cerr << errmsg;
+		exit(-1);
+	}
 	
-	// TODO Compile the source code using external compiler.
+	// TODO Call the generated kernel.
+	// typedef (*kernel_t)(std::complex<double>* &psi, ???
 }
 
 #endif // KERNELGEN_HPP
diff --git a/third_party/cppsim/src/benchmark/tempfile.h b/third_party/cppsim/include/tempfile.h
similarity index 100%
rename from third_party/cppsim/src/benchmark/tempfile.h
rename to third_party/cppsim/include/tempfile.h
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
new file mode 100644
index 00000000..c54b7c8d
--- /dev/null
+++ b/third_party/cppsim/src/compiler.cpp
@@ -0,0 +1,228 @@
+#include "compiler.h"
+#include "tempfile.h"
+#include "digestpp.hpp"
+
+#include <cstdlib>
+#include <dlfcn.h>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <streambuf>
+
+namespace fs = std::filesystem;
+
+Compiler::Compiler() { }
+
+// Cache key of a generated kernel: SHA-512 digest of the qubit count and the kernel source text.
+class Signature
+{
+	int nqubits;
+	std::string source;
+	std::string hash_;
+
+public :
+
+	// Hex-encoded digest computed once by the constructor.
+	const std::string& hash() const { return hash_; }
+
+	// nqubits_: number of qubits the kernel is generated for; source_: generated kernel source.
+	Signature(
+		int nqubits_,
+		const std::string& source_) :
+
+	nqubits(nqubits_),
+	source(source_)
+	{
+		// Absorb the logical content (count, separator, text). The raw object bytes must not be
+		// hashed: they cover std::string internals, so equal sources need not give equal digests.
+		hash_ = digestpp::sha512().absorb(std::to_string(nqubits) + ":").absorb(source).hexdigest();
+	}
+};
+
+static std::map<std::string, void*> database;
+
+void* Compiler::codegen(
+	int nqubits,
+	const std::string& source,
+	std::string& errmsg)
+{
+	std::error_code ec;
+
+	// 0) Check whether the kernel has been already compiled for the
+	// requested dimensions.
+	auto hash = Signature(nqubits, source).hash();
+	auto existing = database.find(hash);
+	if (existing != database.end())
+		return existing->second;
+
+	// 1) Create source file.
+	const char* filenameTemplate = "kernelgenXXXXXX";
+	std::string filename = TempFile(filenameTemplate).string(ec);
+	if (ec) return nullptr;
+	{
+		std::stringstream ss;
+
+		// Add the content of engine include file.
+		ss << "#include <algorithm>";
+		ss << std::endl;
+		ss << "#include <array>";
+		ss << std::endl;
+		ss << "#include <complex>";
+		ss << std::endl;
+		ss << "#include <cstdlib>";
+		ss << std::endl;
+		ss << std::endl;
+
+		// Add source code.
+		ss << source;
+		
+		// Add template specializations.
+		for (auto type : std::array {
+			std::make_pair("std::complex<double>", "double"),
+			std::make_pair("int", "int")
+		})
+		{
+			// Add specialization.
+			ss << "template<> void kernel<";
+			ss << type.first;
+			ss << "*, std::array<std::array<int, ";
+			ss << nqubits;
+			ss << ">, ";
+			ss << nqubits;
+			ss << "> >(";
+			ss << type.first;
+			ss << "* &psi, " << std::endl;
+			for (int i = 0; i < nqubits; i++)
+			{
+				ss << "unsigned id";
+				ss << nqubits - i - 1;
+				ss << ", ";
+			}
+			ss << "std::array<std::array<int, ";
+			ss << nqubits;
+			ss << ">, ";
+			ss << nqubits;
+			ss << "> const& m, std::size_t ctrlmask);";
+			ss << std::endl << std::endl;
+			
+			// Adding entrypoint.
+			ss << "extern \"C\" void kernel_";
+			ss << type.second;
+			ss << "(";
+			ss << type.second;
+			ss << "* psi, std::array<unsigned, ";
+			ss << nqubits;
+			ss << "> ids, std::array<std::array<int, ";
+			ss << nqubits;
+			ss << ">, ";
+			ss << nqubits;
+			ss << "> const& m, std::size_t ctrlmask)";
+			ss << std::endl;
+			ss << "{";
+			ss << std::endl;
+			ss << "\tkernel(reinterpret_cast<";
+			ss << type.first;
+			ss << "*&>(psi), ";
+			for (int i = 0; i < nqubits; i++)
+			{
+				ss << "ids[";
+				ss << nqubits - i - 1;
+				ss << "], ";
+			}
+			ss << "m, ctrlmask);";
+			ss << std::endl;
+			ss << "}";
+			ss << std::endl;
+		}
+			
+		const std::string& source = ss.str();
+#if 1
+		std::cout << source << std::endl;
+#endif
+		std::ofstream file(filename);
+		file << source;
+	}
+
+	// 2) Compile source file into a shared library
+	std::string binname = TempFile(filenameTemplate).string(ec);
+	if (ec) return nullptr;
+	{
+		std::string errlog = TempFile(filenameTemplate).string(ec);
+		if (ec) return nullptr;
+
+		std::stringstream ss;
+#ifdef __APPLE__
+		ss << "g++-11";
+#else
+		ss << "g++";
+#endif
+#if 1
+		ss << " -g -O0 -std=c++17 -x c++ ";
+#else
+		ss << " -g -O3 -ffast-math -fopenmp -std=c++17 -x c++ ";
+#endif
+		ss << filename;
+		ss << " -fPIC -shared -o";
+		ss << binname;
+		ss << " >";
+		ss << errlog;
+		
+		const std::string command = ss.str();
+		system(command.c_str());
+
+		// If the output log file is not empty, read its contents,
+		// and put into errmsg.
+		if (fs::exists(errlog))
+		{
+			std::ifstream file(errlog);
+			errmsg = std::string((std::istreambuf_iterator<char>(file)),
+				std::istreambuf_iterator<char>());
+		}
+	}
+
+	// 3) Load shared library and bind its entry point
+	void* handle = nullptr;
+	{
+		// If the output file does not exist, return NULL.
+		if (!fs::exists(binname))
+			return nullptr;
+
+		// If the shared library could not be loaded, return NULL.
+		void* lib = dlopen(binname.c_str(), RTLD_NOW);
+		if (!lib)
+		{
+			std::stringstream ss;
+			ss << "Could not open \"" << binname <<
+				"\" as a shared library: \"" << dlerror() << "\"" << std::endl;
+			errmsg += ss.str();
+			return nullptr;
+		}
+
+		// TODO If the symbol does not exist, return NULL.
+		handle = dlsym(lib, "kernel_double");
+		if (!handle)
+		{
+			std::stringstream ss;
+			ss << "Could not bind symbol \"model_solve\" in shared library \"" <<
+				binname << "\": \"" << dlerror() << "\"" << std::endl;
+			errmsg += ss.str();
+			return nullptr;
+		}
+	}
+
+	// 4) Cache the compiled eskew kernel in our internal database,
+	// so that we could use it again without recompilation, should the same
+	// dimensions be requested.
+	database[hash] = handle;
+
+	return handle;
+}
+
+Compiler& get_compiler()
+{
+	static Compiler compiler;
+	return compiler;
+}
+
diff --git a/third_party/cppsim/src/benchmark/tempfile.cpp b/third_party/cppsim/src/tempfile.cpp
similarity index 100%
rename from third_party/cppsim/src/benchmark/tempfile.cpp
rename to third_party/cppsim/src/tempfile.cpp

From eaf50140004222856ebe16d67a8e1bcc42c1cc82 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 16 Aug 2022 18:56:46 +0200
Subject: [PATCH 19/82] Getting runtime compilation to succeed

---
 .../cppsim/include/nointrin/kernelgen.py      |  2 +-
 third_party/cppsim/src/compiler.cpp           | 38 ++++++-------------
 2 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index bd730297..7f514b3b 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -53,8 +53,8 @@ def rhs(n, j, i):
 template <class V, class M>
 void kernel(V &psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits))}M const& m, std::size_t ctrlmask)
 {{
-    std::size_t n = psi.size();
     std::size_t d0 = 1UL << id0{''.join(', d{} = 1UL << id{}'.format(i, i) for i in range (1, nqubits))};
+    std::size_t n = 1{''.join(' + d{}'.format(i) for i in range (0, nqubits))};
     std::size_t dsorted[] = {{ d0{''.join(', d{}'.format(i) for i in range (1, nqubits))} }};
     std::sort(dsorted, dsorted + {nqubits}, std::greater<std::size_t>());
 
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index c54b7c8d..0d82a5c7 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -73,48 +73,34 @@ void* Compiler::codegen(
 		ss << std::endl;
 		ss << "#include <cstdlib>";
 		ss << std::endl;
+		ss << "template <class T>";
+		ss << std::endl;
+		ss << "inline T add(T a, T b){ return a + b; }";
+		ss << std::endl;
+		ss << "template <class T>";
+		ss << "inline T mul(T a, T b){ return a * b; }";
+		ss << std::endl;
 		ss << std::endl;
 
 		// Add source code.
 		ss << source;
 		
-		// Add template specializations.
+		// Adding entrypoints.
 		for (auto type : std::array {
 			std::make_pair("std::complex<double>", "double"),
 			std::make_pair("int", "int")
 		})
 		{
-			// Add specialization.
-			ss << "template<> void kernel<";
-			ss << type.first;
-			ss << "*, std::array<std::array<int, ";
-			ss << nqubits;
-			ss << ">, ";
-			ss << nqubits;
-			ss << "> >(";
-			ss << type.first;
-			ss << "* &psi, " << std::endl;
-			for (int i = 0; i < nqubits; i++)
-			{
-				ss << "unsigned id";
-				ss << nqubits - i - 1;
-				ss << ", ";
-			}
-			ss << "std::array<std::array<int, ";
-			ss << nqubits;
-			ss << ">, ";
-			ss << nqubits;
-			ss << "> const& m, std::size_t ctrlmask);";
-			ss << std::endl << std::endl;
-			
-			// Adding entrypoint.
+			ss << std::endl;
 			ss << "extern \"C\" void kernel_";
 			ss << type.second;
 			ss << "(";
 			ss << type.second;
 			ss << "* psi, std::array<unsigned, ";
 			ss << nqubits;
-			ss << "> ids, std::array<std::array<int, ";
+			ss << "> ids, std::array<std::array<";
+			ss << type.first;
+			ss << ", ";
 			ss << nqubits;
 			ss << ">, ";
 			ss << nqubits;

From 908b26d4e0ed3d9f54292159627a3b4efff6106e Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 17 Aug 2022 21:12:26 +0200
Subject: [PATCH 20/82] Breaking the API compatibility, in order to pass
 generic pointers down to the kernels

---
 .../cppsim/include/nointrin/kernelgen.py       | 18 ++++++++++++------
 third_party/cppsim/src/compiler.cpp            | 16 +++++++---------
 third_party/cppsim/src/test/test_nointrin.cpp  | 10 +++++-----
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 7f514b3b..01a66f77 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -24,9 +24,9 @@ def kernelgen(nqubits):
 
     def rhs(n, j, i):
         if i < n - 1:
-            return f'add(mul(v[{i}], m[{j}][{i}]), ' + rhs(n, j, i + 1)
+            return f'add(mul(v[{i}], M({j}, {i})), ' + rhs(n, j, i + 1)
         else:
-            return f'mul(v[{i}], m[{j}][{i}]' + ''.join(')' for k in range(0, n))
+            return f'mul(v[{i}], M({j}, {i})' + ''.join(')' for k in range(0, n))
 
     # Pretty-print the right hand sides (recursively).
     strrhs = [] 
@@ -35,13 +35,17 @@ def rhs(n, j, i):
 
     # Some string constants clash with the {} syntax of print(), so we
     # substitute them as constants.
+    define = "#define"
+    undef = "#undef"
     pragma = "#pragma";
     newline = "\n";
 
     kernel = \
 f"""
-template <class V, class M>
-inline void kernel_core(V &psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, M const& m)
+{define} M(j, i) (m[j * {nqubits} + i])
+
+template<class T>
+inline void kernel_core(T* psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, const T* m)
 {{
     std::array v =
     {{
@@ -50,8 +54,8 @@ def rhs(n, j, i):
 {''.join('    {} = {};{}'.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))}}}
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
-template <class V, class M>
-void kernel(V &psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits))}M const& m, std::size_t ctrlmask)
+template<class T>
+void kernel(T* psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits))}const T* m, std::size_t ctrlmask)
 {{
     std::size_t d0 = 1UL << id0{''.join(', d{} = 1UL << id{}'.format(i, i) for i in range (1, nqubits))};
     std::size_t n = 1{''.join(' + d{}'.format(i) for i in range (0, nqubits))};
@@ -76,6 +80,8 @@ def rhs(n, j, i):
         }}
     }}
 }}
+
+{undef} M
 """
 
     return kernel
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index 0d82a5c7..09378b92 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -98,26 +98,24 @@ void* Compiler::codegen(
 			ss << type.second;
 			ss << "* psi, std::array<unsigned, ";
 			ss << nqubits;
-			ss << "> ids, std::array<std::array<";
-			ss << type.first;
-			ss << ", ";
-			ss << nqubits;
-			ss << ">, ";
-			ss << nqubits;
-			ss << "> const& m, std::size_t ctrlmask)";
+			ss << "> ids, const ";
+			ss << type.second;
+			ss << "* m, std::size_t ctrlmask)";
 			ss << std::endl;
 			ss << "{";
 			ss << std::endl;
 			ss << "\tkernel(reinterpret_cast<";
 			ss << type.first;
-			ss << "*&>(psi), ";
+			ss << "*>(psi), ";
 			for (int i = 0; i < nqubits; i++)
 			{
 				ss << "ids[";
 				ss << nqubits - i - 1;
 				ss << "], ";
 			}
-			ss << "m, ctrlmask);";
+			ss << "reinterpret_cast<const ";
+			ss << type.first;
+			ss << "*>(m), ctrlmask);";
 			ss << std::endl;
 			ss << "}";
 			ss << std::endl;
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 4c4013c5..0c5d3b66 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -58,7 +58,7 @@ TEST(nointrin, kernel1)
 	ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id0, m, ctrlmask);
-		generated::kernel(psi2, id0, m, ctrlmask);
+		generated::kernel(&psi2[0], id0, &m[0][0], ctrlmask);
 	},
 	psi));
 }
@@ -73,7 +73,7 @@ TEST(nointrin, kernel2)
 	ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id1, id0, m, ctrlmask);
-		generated::kernel(psi2, id1, id0, m, ctrlmask);
+		generated::kernel(&psi2[0], id1, id0, &m[0][0], ctrlmask);
 	},
 	psi));
 }
@@ -89,7 +89,7 @@ TEST(nointrin, kernel3)
 	ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id2, id1, id0, m, ctrlmask);
-		generated::kernel(psi2, id2, id1, id0, m, ctrlmask);
+		generated::kernel(&psi2[0], id2, id1, id0, &m[0][0], ctrlmask);
 	},
 	psi));
 }
@@ -106,7 +106,7 @@ TEST(nointrin, kernel4)
 	ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id3, id2, id1, id0, m, ctrlmask);
-		generated::kernel(psi2, id3, id2, id1, id0, m, ctrlmask);
+		generated::kernel(&psi2[0], id3, id2, id1, id0, &m[0][0], ctrlmask);
 	},
 	psi));
 }
@@ -124,7 +124,7 @@ TEST(nointrin, kernel5)
 	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id4, id3, id2, id1, id0, m, ctrlmask);
-		generated::kernel(psi2, id4, id3, id2, id1, id0, m, ctrlmask);
+		generated::kernel(&psi2[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
 	},
 	psi));
 }

From a56e5c7b616b5b2d7fc92ddc2b3a7886cf5b2dad Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 18 Aug 2022 15:13:27 +0200
Subject: [PATCH 21/82] Removing debug print

---
 third_party/cppsim/ThirdParty/res_embed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/ThirdParty/res_embed b/third_party/cppsim/ThirdParty/res_embed
index 112588c8..26a18b27 160000
--- a/third_party/cppsim/ThirdParty/res_embed
+++ b/third_party/cppsim/ThirdParty/res_embed
@@ -1 +1 @@
-Subproject commit 112588c82a713f6c37ec78b7921fb26448b826c9
+Subproject commit 26a18b27794c1fcf698e603beb8b122218dae490

From 77151ab704d957f2d8f7e7fd34d2bc917394b58b Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 18 Aug 2022 19:34:53 +0200
Subject: [PATCH 22/82] Getting the runtime-generated kernel executed, adding
 OpenMP, adding generated kernel to the correctness test. Breaking
 compatibility with the original kernels API, in order to be able to supply
 psi and m as plain pointers

---
 third_party/cppsim/CMakeLists.txt             |  6 ++-
 third_party/cppsim/include/kernelgen.hpp      |  9 ++--
 .../cppsim/include/nointrin/kernelgen.py      |  1 +
 .../cppsim/src/benchmark/benchmark.cpp        |  2 +-
 third_party/cppsim/src/compiler.cpp           | 12 +++---
 third_party/cppsim/src/test/test_nointrin.cpp | 41 ++++++++++++++-----
 6 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index e08d9b1f..e95da456 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -10,6 +10,8 @@ add_subdirectory(ThirdParty/pybind11 EXCLUDE_FROM_ALL)
 
 find_package(Python3 COMPONENTS Interpreter Development)
 
+find_package(OpenMP REQUIRED)
+
 include(ResEmbed)
 
 add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp" "src/tempfile.cpp")
@@ -18,7 +20,7 @@ set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/digestpp)
 res_embed(TARGET kernelgen NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
-target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python)
+target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python OpenMP::OpenMP_CXX)
 
 pybind11_add_module(${PROJECT_NAME} SHARED "src/${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
@@ -48,7 +50,7 @@ endmacro()
 add_executable(test_nointrin "src/test/test_nointrin.cpp")
 set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_link_libraries(test_nointrin PRIVATE gtest)
+target_link_libraries(test_nointrin PRIVATE gtest kernelgen)
 kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
diff --git a/third_party/cppsim/include/kernelgen.hpp b/third_party/cppsim/include/kernelgen.hpp
index 7c2dec79..12fdb44e 100644
--- a/third_party/cppsim/include/kernelgen.hpp
+++ b/third_party/cppsim/include/kernelgen.hpp
@@ -19,7 +19,7 @@ public :
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template <class V, class Id, class M>
-void kernelgen(V &psi, Id ids, M const& m, std::size_t ctrlmask)
+void kernelgen(V &psi, Id& ids, M const& m, std::size_t ctrlmask)
 {
 	static KernelGen g;
 
@@ -38,8 +38,11 @@ void kernelgen(V &psi, Id ids, M const& m, std::size_t ctrlmask)
 		exit(-1);
 	}
 	
-	// TODO Call the generated kernel.
-	// typedef (*kernel_t)(std::complex<double>* &psi, ???
+	// Call the generated kernel.
+	typedef void (*kernel_t)(int* /*psi*/, unsigned int* /*ids*/, const int* /*m*/, size_t /*ctrlmask*/);
+	auto kernel = (kernel_t)handle;
+	#pragma omp parallel
+	kernel(reinterpret_cast<int*>(&psi[0]), &ids[0], reinterpret_cast<const int*>(&m[0][0]), ctrlmask);
 }
 
 #endif // KERNELGEN_HPP
diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 01a66f77..95e2b441 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -42,6 +42,7 @@ def rhs(n, j, i):
 
     kernel = \
 f"""
+{define} LOOP_COLLAPSE{nqubits} {nqubits + 1} 
 {define} M(j, i) (m[j * {nqubits} + i])
 
 template<class T>
diff --git a/third_party/cppsim/src/benchmark/benchmark.cpp b/third_party/cppsim/src/benchmark/benchmark.cpp
index 4766101f..900beb63 100644
--- a/third_party/cppsim/src/benchmark/benchmark.cpp
+++ b/third_party/cppsim/src/benchmark/benchmark.cpp
@@ -18,7 +18,7 @@ bool benchmark()
 	}
 
 	std::default_random_engine dre;
-	std::uniform_int_distribution<int> uid(0, 1000);
+	std::uniform_int_distribution<int> uid(-1000, 1000);
 
 	// Generate m matrix as integers.
 	std::array<std::array<int, nqubits>, nqubits> m;
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index 09378b92..2ed3d4a0 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -96,9 +96,7 @@ void* Compiler::codegen(
 			ss << type.second;
 			ss << "(";
 			ss << type.second;
-			ss << "* psi, std::array<unsigned, ";
-			ss << nqubits;
-			ss << "> ids, const ";
+			ss << "* psi, const unsigned* ids, const ";
 			ss << type.second;
 			ss << "* m, std::size_t ctrlmask)";
 			ss << std::endl;
@@ -122,7 +120,7 @@ void* Compiler::codegen(
 		}
 			
 		const std::string& source = ss.str();
-#if 1
+#if 0
 		std::cout << source << std::endl;
 #endif
 		std::ofstream file(filename);
@@ -142,7 +140,7 @@ void* Compiler::codegen(
 #else
 		ss << "g++";
 #endif
-#if 1
+#if 0
 		ss << " -g -O0 -std=c++17 -x c++ ";
 #else
 		ss << " -g -O3 -ffast-math -fopenmp -std=c++17 -x c++ ";
@@ -184,8 +182,8 @@ void* Compiler::codegen(
 			return nullptr;
 		}
 
-		// TODO If the symbol does not exist, return NULL.
-		handle = dlsym(lib, "kernel_double");
+		// If the symbol does not exist, return NULL.
+		handle = dlsym(lib, "kernel_int");
 		if (!handle)
 		{
 			std::stringstream ss;
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 0c5d3b66..ccde55c0 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -12,6 +12,8 @@ namespace generated {
 
 } // namespace generated
 
+#include "kernelgen.hpp"
+
 #include <array>
 #include <iostream>
 #include <random>
@@ -22,7 +24,7 @@ template<int nqubits, typename Kernels, typename V>
 bool compare(Kernels kernels, V& psi1)
 {
 	std::default_random_engine dre;
-	std::uniform_int_distribution<int> uid(0, 1000);
+	std::uniform_int_distribution<int> uid(-1000, 1000);
 
 	// Generate m matrix as integers.
 	std::array<std::array<int, nqubits>, nqubits> m;
@@ -34,18 +36,25 @@ bool compare(Kernels kernels, V& psi1)
 	for (int i = 0; i < psi1.size(); i++)
 		psi1[i] = uid(dre);
 	auto psi2 = psi1;
+	auto psi3 = psi1;
 
 	// Generate control mask.
 	std::size_t ctrlmask = 0; // uid(dre);
 
 	// Compare kernel against generated kernel.
-	kernels(psi1, psi2, m, ctrlmask);
-	auto diff = std::mismatch(psi1.begin(), psi1.end(), psi2.begin());
-	if (diff.first == psi1.end())
+	kernels(psi1, psi2, psi3, m, ctrlmask);
+	auto diff2 = std::mismatch(psi1.begin(), psi1.end(), psi2.begin());
+	auto diff3 = std::mismatch(psi1.begin(), psi1.end(), psi3.begin());
+	if ((diff2.first == psi1.end()) && (diff3.first == psi1.end()))
 		return true;
 
-	std::cout << "Mismatch at " << std::distance(psi1.begin(), diff.first) <<
-		" : " << *(diff.first) << " != " << *(diff.second) << std::endl;
+	if (diff2.first != psi1.end())
+		std::cout << "Mismatch at " << std::distance(psi1.begin(), diff2.first) <<
+			" : " << *(diff2.first) << " != " << *(diff2.second) << std::endl;
+	if (diff3.first != psi1.end())
+		std::cout << "Mismatch at " << std::distance(psi1.begin(), diff3.first) <<
+			" : " << *(diff3.first) << " != " << *(diff3.second) << std::endl;
+
 	return false;
 }
 
@@ -55,10 +64,12 @@ TEST(nointrin, kernel1)
 	size_t n = 1;
 	n += 1UL << id0;
 	std::vector<int> psi(n);
-	ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id0, m, ctrlmask);
 		generated::kernel(&psi2[0], id0, &m[0][0], ctrlmask);
+		std::array ids { id0 };
+		kernelgen(psi3, ids, m , ctrlmask);
 	},
 	psi));
 }
@@ -70,10 +81,12 @@ TEST(nointrin, kernel2)
 	n += 1UL << id0;
 	n += 1UL << id1;
 	std::vector<int> psi(n);
-	ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id1, id0, m, ctrlmask);
 		generated::kernel(&psi2[0], id1, id0, &m[0][0], ctrlmask);
+		std::array ids { id0, id1 };
+		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
 }
@@ -86,10 +99,12 @@ TEST(nointrin, kernel3)
 	n += 1UL << id1;
 	n += 1UL << id2;
 	std::vector<int> psi(n);
-	ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id2, id1, id0, m, ctrlmask);
 		generated::kernel(&psi2[0], id2, id1, id0, &m[0][0], ctrlmask);
+		std::array ids { id0, id1, id2 };
+		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
 }
@@ -103,10 +118,12 @@ TEST(nointrin, kernel4)
 	n += 1UL << id2;
 	n += 1UL << id3;
 	std::vector<int> psi(n);
-	ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id3, id2, id1, id0, m, ctrlmask);
 		generated::kernel(&psi2[0], id3, id2, id1, id0, &m[0][0], ctrlmask);
+		std::array ids { id0, id1, id2, id3 };
+		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
 }
@@ -121,10 +138,12 @@ TEST(nointrin, kernel5)
 	n += 1UL << id3;
 	n += 1UL << id4;
 	std::vector<int> psi(n);
-	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id4, id3, id2, id1, id0, m, ctrlmask);
 		generated::kernel(&psi2[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
+		std::array ids { id0, id1, id2, id3, id4 };
+		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
 }

From 1275bee8a9e851c111475ec88c2a06af8943493d Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 19 Aug 2022 17:18:57 +0200
Subject: [PATCH 23/82] Adding an option to substitute ids values directly into
 the runtime-compiled kernel

---
 third_party/cppsim/include/kernelgen.hpp      |  4 +--
 .../cppsim/include/nointrin/kernelgen.py      | 25 +++++++++++--------
 third_party/cppsim/src/compiler.cpp           | 20 +++++++++------
 third_party/cppsim/src/kernelgen.cpp          | 17 ++++++++++---
 4 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/third_party/cppsim/include/kernelgen.hpp b/third_party/cppsim/include/kernelgen.hpp
index 12fdb44e..e5fd130d 100644
--- a/third_party/cppsim/include/kernelgen.hpp
+++ b/third_party/cppsim/include/kernelgen.hpp
@@ -12,7 +12,7 @@ class KernelGen
 
 public :
 
-	std::string generate(int nqubits);
+	std::string generate(int nqubits, unsigned* ids = nullptr);
 
 	KernelGen();
 };
@@ -26,7 +26,7 @@ void kernelgen(V &psi, Id& ids, M const& m, std::size_t ctrlmask)
 	const auto nqubits = ids.size();
 
 	// Generate the kernel source code.
-	auto source = g.generate(nqubits);
+	auto source = g.generate(nqubits, &ids[0]);
 	
 	// Compile the source code using external compiler.
 	std::string errmsg;
diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 95e2b441..aeb0dcae 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -3,7 +3,7 @@
 import itertools
 import os
 
-def kernelgen(nqubits):
+def kernelgen(nqubits, ids=None):
     # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
     # 0 0
     # 1 0
@@ -33,6 +33,10 @@ def rhs(n, j, i):
     for j in range(0, len(strcombs)):
         strrhs.append(rhs(len(strcombs), j, 0))
 
+    dsorted = []
+    if ids != None:
+    	dsorted = sorted(ids, reverse = True)
+
     # Some string constants clash with the {} syntax of print(), so we
     # substitute them as constants.
     define = "#define"
@@ -45,8 +49,8 @@ def rhs(n, j, i):
 {define} LOOP_COLLAPSE{nqubits} {nqubits + 1} 
 {define} M(j, i) (m[j * {nqubits} + i])
 
-template<class T>
-inline void kernel_core(T* psi, std::size_t I, std::size_t d0{''.join(', std::size_t d{}'.format(i) for i in range (1, nqubits))}, const T* m)
+template<{''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else ''}class T>
+inline void kernel_core(T* psi, std::size_t I{''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else ''}, const T* m)
 {{
     std::array v =
     {{
@@ -56,18 +60,17 @@ def rhs(n, j, i):
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template<class T>
-void kernel(T* psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits))}const T* m, std::size_t ctrlmask)
+void kernel(T* psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits)) if ids == None else ''}const T* m, std::size_t ctrlmask)
 {{
-    std::size_t d0 = 1UL << id0{''.join(', d{} = 1UL << id{}'.format(i, i) for i in range (1, nqubits))};
-    std::size_t n = 1{''.join(' + d{}'.format(i) for i in range (0, nqubits))};
-    std::size_t dsorted[] = {{ d0{''.join(', d{}'.format(i) for i in range (1, nqubits))} }};
-    std::sort(dsorted, dsorted + {nqubits}, std::greater<std::size_t>());
-
+    {'constexpr ' if ids != None else ''}std::size_t d0 = 1UL << {'id0' if ids == None else ids[0]}{''.join(', d{} = 1UL << {}'.format(i, 'id{}'.format(i) if ids == None else ids[i]) for i in range (1, nqubits))};
+    {'constexpr ' if ids != None else ''}std::size_t n = 1{''.join(' + d{}'.format(i) for i in range (0, nqubits))};
+    {'constexpr ' if ids != None else ''}std::size_t dsorted[] = {{ d{nqubits - 1}{''.join(', d{}'.format(nqubits - i - 1) for i in range (1, nqubits))} }};
+    {'std::sort(dsorted, dsorted + {}, std::greater<std::size_t>());{}'.format(nqubits, newline) if ids == None else ''}
     if (ctrlmask == 0){{
         {pragma} omp for collapse(LOOP_COLLAPSE{nqubits}) schedule(static)
         for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
 {''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))}{''.join('    ' for i in range (0, nqubits + 2))}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits - 1}]; ++i{nqubits}){{
-        {''.join('    '.format(i) for i in range (1, nqubits + 2))}kernel_core(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits))}m);
+        {''.join('    '.format(i) for i in range (1, nqubits + 2))}kernel_core{'<d0' if ids != None else ''}{''.join(', d{}'.format(i) for i in range (1, nqubits)) if ids != None else ''}{'>' if ids != None else ''}(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else ''}m);
         {''.join('    '.format(i) for i in range (1, nqubits + 1))}}}
         }}
     }}
@@ -76,7 +79,7 @@ def rhs(n, j, i):
         for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
 {''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))}{''.join('    ' for i in range (0, nqubits + 2))}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits - 1}]; ++i{nqubits}){{
         {''.join('    '.format(i) for i in range (1, nqubits + 2))}if (((i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))})&ctrlmask) == ctrlmask)
-        {''.join('    '.format(i) for i in range (1, nqubits + 3))}kernel_core(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits))}m);
+        {''.join('    '.format(i) for i in range (1, nqubits + 3))}kernel_core{'<d0' if ids != None else ''}{''.join(', d{}'.format(i) for i in range (1, nqubits)) if ids != None else ''}{'>' if ids != None else ''}(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else ''}m);
         {''.join('    '.format(i) for i in range (1, nqubits + 1))}}}
         }}
     }}
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index 2ed3d4a0..e72a27d2 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -104,14 +104,18 @@ void* Compiler::codegen(
 			ss << std::endl;
 			ss << "\tkernel(reinterpret_cast<";
 			ss << type.first;
-			ss << "*>(psi), ";
-			for (int i = 0; i < nqubits; i++)
-			{
-				ss << "ids[";
-				ss << nqubits - i - 1;
-				ss << "], ";
-			}
-			ss << "reinterpret_cast<const ";
+#if 0
+                        ss << "*>(psi), ";
+                        for (int i = 0; i < nqubits; i++)
+                        {
+                                ss << "ids[";
+                                ss << nqubits - i - 1;
+                                ss << "], ";
+                        }
+                        ss << "reinterpret_cast<const ";
+#else
+			ss << "*>(psi), reinterpret_cast<const ";
+#endif
 			ss << type.first;
 			ss << "*>(m), ctrlmask);";
 			ss << std::endl;
diff --git a/third_party/cppsim/src/kernelgen.cpp b/third_party/cppsim/src/kernelgen.cpp
index 6c57251b..5fa91f91 100644
--- a/third_party/cppsim/src/kernelgen.cpp
+++ b/third_party/cppsim/src/kernelgen.cpp
@@ -4,10 +4,11 @@
 #include <iostream>
 #include <pybind11/embed.h>
 #include <pybind11/eval.h>
+#include <pybind11/stl.h>
 
 namespace py = pybind11;
 
-std::string KernelGen::generate(int nqubits)
+std::string KernelGen::generate(int nqubits, unsigned* ids)
 {
 	// Use embedded Python interpreter to run the script
 	// and get the resulting string of source code.
@@ -20,8 +21,18 @@ std::string KernelGen::generate(int nqubits)
 		// Assign the __name__, otherwise it is set to "__main__" by default.
 		globals["__name__"] = "kernelgen";
 		py::eval<py::eval_statements>(nointrin, globals, globals);
-		auto source = globals["kernelgen"](nqubits).cast<std::string>();
-		return source;
+		if (ids)
+		{
+			std::vector<unsigned> vids;
+			vids.assign(ids, ids + nqubits);
+			auto source = globals["kernelgen"](nqubits, vids).cast<std::string>();
+			return source;
+		}
+		else
+		{
+			auto source = globals["kernelgen"](nqubits).cast<std::string>();
+			return source;
+		}
 	}
 	catch (pybind11::error_already_set e)
 	{

From ed4a7d237e19faf156feb52240af026a2adda9d3 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 22 Aug 2022 17:04:42 +0200
Subject: [PATCH 24/82] Working on a better representation of the generator
 script

---
 .../cppsim/include/nointrin/kernel1.hpp       |  2 +
 .../cppsim/include/nointrin/kernel2.hpp       |  2 +
 .../cppsim/include/nointrin/kernel3.hpp       |  2 +
 .../cppsim/include/nointrin/kernel4.hpp       |  2 +
 .../cppsim/include/nointrin/kernel5.hpp       |  2 +
 .../cppsim/include/nointrin/kernelgen.py      | 42 ++++++++++++++-----
 .../cppsim/include/nointrin/kernels.hpp       | 18 +++-----
 third_party/cppsim/src/test/test_nointrin.cpp | 18 ++++----
 8 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernel1.hpp b/third_party/cppsim/include/nointrin/kernel1.hpp
index d5fcf0d6..cbc8b928 100644
--- a/third_party/cppsim/include/nointrin/kernel1.hpp
+++ b/third_party/cppsim/include/nointrin/kernel1.hpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#define LOOP_COLLAPSE1 2
+
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m)
 {
diff --git a/third_party/cppsim/include/nointrin/kernel2.hpp b/third_party/cppsim/include/nointrin/kernel2.hpp
index 7aecbae1..84d85830 100644
--- a/third_party/cppsim/include/nointrin/kernel2.hpp
+++ b/third_party/cppsim/include/nointrin/kernel2.hpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#define LOOP_COLLAPSE2 3
+
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m)
 {
diff --git a/third_party/cppsim/include/nointrin/kernel3.hpp b/third_party/cppsim/include/nointrin/kernel3.hpp
index 76037c43..9369803f 100644
--- a/third_party/cppsim/include/nointrin/kernel3.hpp
+++ b/third_party/cppsim/include/nointrin/kernel3.hpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#define LOOP_COLLAPSE3 4
+
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m)
 {
diff --git a/third_party/cppsim/include/nointrin/kernel4.hpp b/third_party/cppsim/include/nointrin/kernel4.hpp
index 263b664a..aafac39e 100644
--- a/third_party/cppsim/include/nointrin/kernel4.hpp
+++ b/third_party/cppsim/include/nointrin/kernel4.hpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#define LOOP_COLLAPSE4 5
+
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m)
 {
diff --git a/third_party/cppsim/include/nointrin/kernel5.hpp b/third_party/cppsim/include/nointrin/kernel5.hpp
index 04773b6d..363f4e9a 100644
--- a/third_party/cppsim/include/nointrin/kernel5.hpp
+++ b/third_party/cppsim/include/nointrin/kernel5.hpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#define LOOP_COLLAPSE5 6
+
 template <class V, class M>
 inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m)
 {
diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index aeb0dcae..62cce823 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -41,22 +41,41 @@ def rhs(n, j, i):
     # substitute them as constants.
     define = "#define"
     undef = "#undef"
-    pragma = "#pragma";
-    newline = "\n";
+    pragma = "#pragma"
+    newline = "\n"
 
     kernel = \
-f"""
-{define} LOOP_COLLAPSE{nqubits} {nqubits + 1} 
+"""
+{include} <algorithm>
+{include} <array>
+{include} <complex>
+{include} <cstdlib>
+
+{define} add(a, b) (a + b)
+{define} mul(a, b) (a * b)
+
 {define} M(j, i) (m[j * {nqubits} + i])
 
-template<{''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else ''}class T>
-inline void kernel_core(T* psi, std::size_t I{''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else ''}, const T* m)
+template<{d_template}class T>
+inline void kernel_core(T* psi, std::size_t I{d_var}, const T* m)
 {{
-    std::array v =
-    {{
-{''.join('        {},{}'.format(strcombs[i], newline) for i in range(0, len(strcombs)))}    }};
+    std::array v = {{{v_array}}};
+
+    {psi_assign}
+}}
 
-{''.join('    {} = {};{}'.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))}}}
+{undef} M
+""".format( \
+        include    = "#include", \
+        define     = "#define", \
+        undef      = "#undef", \
+        nqubits    = nqubits, \
+        d_template = ''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else '', \
+        d_var      = ''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else '', \
+        v_array    = newline + ''.join('        {},{}'.format(strcombs[i], newline) for i in range(0, len(strcombs))) + '    ', \
+        psi_assign = ''.join('{} = {};{}    '.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))) + \
+f"""
+{define} LOOP_COLLAPSE{nqubits} ({nqubits} + 1) 
 
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template<class T>
@@ -85,7 +104,8 @@ def rhs(n, j, i):
     }}
 }}
 
-{undef} M
+{undef} LOOP_COLLAPSE{nqubits}
+
 """
 
     return kernel
diff --git a/third_party/cppsim/include/nointrin/kernels.hpp b/third_party/cppsim/include/nointrin/kernels.hpp
index f754731b..d6608a72 100644
--- a/third_party/cppsim/include/nointrin/kernels.hpp
+++ b/third_party/cppsim/include/nointrin/kernels.hpp
@@ -16,21 +16,15 @@
 #include <complex>
 #include <algorithm>
 
-template <class T>
-inline T add(T a, T b){ return a+b; }
-
-template <class T>
-inline T mul(T a, T b){ return a*b; }
-
-
-#define LOOP_COLLAPSE1 2
-#define LOOP_COLLAPSE2 3
-#define LOOP_COLLAPSE3 4
-#define LOOP_COLLAPSE4 5
-#define LOOP_COLLAPSE5 6
+#define add(a, b) (a + b)
+#define mul(a, b) (a * b)
 
 #include "kernel1.hpp"
 #include "kernel2.hpp"
 #include "kernel3.hpp"
 #include "kernel4.hpp"
 #include "kernel5.hpp"
+
+#undef add
+#undef mul
+
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index ccde55c0..0fb9096f 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -1,8 +1,6 @@
 // Ensure hand-written and generated kernels give equal results.
 
-#include "nointrin/kernels.hpp"
-
-namespace generated {
+#define kernel generated_kernel
 
 #include "generated/nointrin/kernel1.hpp"
 #include "generated/nointrin/kernel2.hpp"
@@ -10,7 +8,9 @@ namespace generated {
 #include "generated/nointrin/kernel4.hpp"
 #include "generated/nointrin/kernel5.hpp"
 
-} // namespace generated
+#undef kernel
+
+#include "nointrin/kernels.hpp"
 
 #include "kernelgen.hpp"
 
@@ -67,7 +67,7 @@ TEST(nointrin, kernel1)
 	ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id0, m, ctrlmask);
-		generated::kernel(&psi2[0], id0, &m[0][0], ctrlmask);
+		generated_kernel(&psi2[0], id0, &m[0][0], ctrlmask);
 		std::array ids { id0 };
 		kernelgen(psi3, ids, m , ctrlmask);
 	},
@@ -84,7 +84,7 @@ TEST(nointrin, kernel2)
 	ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id1, id0, m, ctrlmask);
-		generated::kernel(&psi2[0], id1, id0, &m[0][0], ctrlmask);
+		generated_kernel(&psi2[0], id1, id0, &m[0][0], ctrlmask);
 		std::array ids { id0, id1 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
@@ -102,7 +102,7 @@ TEST(nointrin, kernel3)
 	ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id2, id1, id0, m, ctrlmask);
-		generated::kernel(&psi2[0], id2, id1, id0, &m[0][0], ctrlmask);
+		generated_kernel(&psi2[0], id2, id1, id0, &m[0][0], ctrlmask);
 		std::array ids { id0, id1, id2 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
@@ -121,7 +121,7 @@ TEST(nointrin, kernel4)
 	ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id3, id2, id1, id0, m, ctrlmask);
-		generated::kernel(&psi2[0], id3, id2, id1, id0, &m[0][0], ctrlmask);
+		generated_kernel(&psi2[0], id3, id2, id1, id0, &m[0][0], ctrlmask);
 		std::array ids { id0, id1, id2, id3 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
@@ -141,7 +141,7 @@ TEST(nointrin, kernel5)
 	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id4, id3, id2, id1, id0, m, ctrlmask);
-		generated::kernel(&psi2[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
+		generated_kernel(&psi2[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
 		std::array ids { id0, id1, id2, id3, id4 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},

From 1331e413066e103313e9b76aecf3b819a34b6781 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 23 Aug 2022 19:10:38 +0200
Subject: [PATCH 25/82] Finalizing better representation of the generator
 script

---
 .../cppsim/include/nointrin/kernelgen.py      | 63 ++++++++++++-------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 62cce823..6dd12599 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -39,9 +39,6 @@ def rhs(n, j, i):
 
     # Some string constants clash with the {} syntax of print(), so we
     # substitute them as constants.
-    define = "#define"
-    undef = "#undef"
-    pragma = "#pragma"
     newline = "\n"
 
     kernel = \
@@ -59,7 +56,7 @@ def rhs(n, j, i):
 template<{d_template}class T>
 inline void kernel_core(T* psi, std::size_t I{d_var}, const T* m)
 {{
-    std::array v = {{{v_array}}};
+    std::array {v_array};
 
     {psi_assign}
 }}
@@ -72,41 +69,59 @@ def rhs(n, j, i):
         nqubits    = nqubits, \
         d_template = ''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else '', \
         d_var      = ''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else '', \
-        v_array    = newline + ''.join('        {},{}'.format(strcombs[i], newline) for i in range(0, len(strcombs))) + '    ', \
+        v_array    = f"v = {{{newline}" + ''.join('{}{},{}'.format(' ' * 8, strcombs[i], newline) for i in range(0, len(strcombs))) + "{}}}".format(' ' * 4), \
         psi_assign = ''.join('{} = {};{}    '.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))) + \
-f"""
-{define} LOOP_COLLAPSE{nqubits} ({nqubits} + 1) 
-
+"""
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template<class T>
-void kernel(T* psi, {''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits)) if ids == None else ''}const T* m, std::size_t ctrlmask)
+void kernel(T* psi, {id_var}const T* m, std::size_t ctrlmask)
 {{
-    {'constexpr ' if ids != None else ''}std::size_t d0 = 1UL << {'id0' if ids == None else ids[0]}{''.join(', d{} = 1UL << {}'.format(i, 'id{}'.format(i) if ids == None else ids[i]) for i in range (1, nqubits))};
-    {'constexpr ' if ids != None else ''}std::size_t n = 1{''.join(' + d{}'.format(i) for i in range (0, nqubits))};
-    {'constexpr ' if ids != None else ''}std::size_t dsorted[] = {{ d{nqubits - 1}{''.join(', d{}'.format(nqubits - i - 1) for i in range (1, nqubits))} }};
-    {'std::sort(dsorted, dsorted + {}, std::greater<std::size_t>());{}'.format(nqubits, newline) if ids == None else ''}
+    {constexpr}std::size_t {d};
+    {constexpr}std::size_t {n};
+    {constexpr}std::size_t {dsorted};
+    {sort}
     if (ctrlmask == 0){{
-        {pragma} omp for collapse(LOOP_COLLAPSE{nqubits}) schedule(static)
+        {pragma} omp for collapse({collapse}) schedule(static)
         for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
-{''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))}{''.join('    ' for i in range (0, nqubits + 2))}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits - 1}]; ++i{nqubits}){{
-        {''.join('    '.format(i) for i in range (1, nqubits + 2))}kernel_core{'<d0' if ids != None else ''}{''.join(', d{}'.format(i) for i in range (1, nqubits)) if ids != None else ''}{'>' if ids != None else ''}(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else ''}m);
-        {''.join('    '.format(i) for i in range (1, nqubits + 1))}}}
+{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits_1}]; ++i{nqubits}){{
+        {offset_1}kernel_core{d_template}(psi, {i}, {d_args}m);
+        {offset}}}
         }}
     }}
     else{{
-        {pragma} omp for collapse(LOOP_COLLAPSE{nqubits}) schedule(static)
+        {pragma} omp for collapse({collapse}) schedule(static)
         for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
-{''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))}{''.join('    ' for i in range (0, nqubits + 2))}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits - 1}]; ++i{nqubits}){{
-        {''.join('    '.format(i) for i in range (1, nqubits + 2))}if (((i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))})&ctrlmask) == ctrlmask)
-        {''.join('    '.format(i) for i in range (1, nqubits + 3))}kernel_core{'<d0' if ids != None else ''}{''.join(', d{}'.format(i) for i in range (1, nqubits)) if ids != None else ''}{'>' if ids != None else ''}(psi, i0{''.join(' + i{}'.format(i) for i in range (1, nqubits + 1))}, {''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else ''}m);
-        {''.join('    '.format(i) for i in range (1, nqubits + 1))}}}
+{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits_1}]; ++i{nqubits}){{
+        {offset_1}if ((({i})&ctrlmask) == ctrlmask)
+        {offset_2}kernel_core{d_template}(psi, {i}, {d_args}m);
+        {offset}}}
         }}
     }}
 }}
 
-{undef} LOOP_COLLAPSE{nqubits}
+{undef} add
+{undef} mul
 
-"""
+""".format( \
+        define      = "#define", \
+        undef       = "#undef", \
+        pragma      = "#pragma", \
+        nqubits     = nqubits, \
+        nqubits_1   = nqubits - 1, \
+        id_var      = ''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits)) if ids == None else '', \
+        constexpr   = 'constexpr ' if ids != None else '',
+        d           = f"d0 = 1UL << {'id0' if ids == None else ids[0]}{''.join(', d{} = 1UL << {}'.format(i, 'id{}'.format(i) if ids == None else ids[i]) for i in range (1, nqubits))}", \
+        d_args      = ''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else '', \
+        n           = 'n = 1' + ''.join(' + d{}'.format(i) for i in range (0, nqubits)), \
+        dsorted     = f"dsorted[] = {{ d{nqubits - 1}" + ''.join(', d{}'.format(nqubits - i - 1) for i in range (1, nqubits)) + f" }}", \
+        sort        = f'std::sort(dsorted, dsorted + {nqubits}, std::greater<std::size_t>());{newline}' if ids == None else '', \
+        collapse    = f"{nqubits + 1}", \
+        offset    = ''.join('    '.format(i) for i in range (0, nqubits)), \
+        offset_1    = ''.join('    '.format(i) for i in range (0, nqubits + 1)), \
+        offset_2    = ''.join('    '.format(i) for i in range (0, nqubits + 2)), \
+        d_template  = ('<d0' + ''.join(', d{}'.format(i) for i in range (1, nqubits)) + '>') if ids != None else '', \
+        i           = 'i0' + ''.join(' + i{}'.format(i) for i in range (1, nqubits + 1)), \
+        for_loops   = ''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits)))
 
     return kernel
 

From 5b131af10af5266d24c0ce4d967cfdc7374256b1 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 24 Aug 2022 15:27:23 +0200
Subject: [PATCH 26/82] Use sorted 'd' values as scalars instead of an array,
 in case of runtime compilation

---
 .../cppsim/include/nointrin/kernelgen.py      | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 6dd12599..61336b79 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -33,9 +33,9 @@ def rhs(n, j, i):
     for j in range(0, len(strcombs)):
         strrhs.append(rhs(len(strcombs), j, 0))
 
-    dsorted = []
+    ids_sorted = []
     if ids != None:
-    	dsorted = sorted(ids, reverse = True)
+    	ids_sorted = sorted(ids, reverse = True)
 
     # Some string constants clash with the {} syntax of print(), so we
     # substitute them as constants.
@@ -82,16 +82,16 @@ def rhs(n, j, i):
     {sort}
     if (ctrlmask == 0){{
         {pragma} omp for collapse({collapse}) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
-{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits_1}]; ++i{nqubits}){{
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * {dsorted_0}){{
+{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < {dsorted_last}; ++i{nqubits}){{
         {offset_1}kernel_core{d_template}(psi, {i}, {d_args}m);
         {offset}}}
         }}
     }}
     else{{
         {pragma} omp for collapse({collapse}) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){{
-{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < dsorted[{nqubits_1}]; ++i{nqubits}){{
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * {dsorted_0}){{
+{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < {dsorted_last}; ++i{nqubits}){{
         {offset_1}if ((({i})&ctrlmask) == ctrlmask)
         {offset_2}kernel_core{d_template}(psi, {i}, {d_args}m);
         {offset}}}
@@ -107,13 +107,14 @@ def rhs(n, j, i):
         undef       = "#undef", \
         pragma      = "#pragma", \
         nqubits     = nqubits, \
-        nqubits_1   = nqubits - 1, \
         id_var      = ''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits)) if ids == None else '', \
         constexpr   = 'constexpr ' if ids != None else '',
         d           = f"d0 = 1UL << {'id0' if ids == None else ids[0]}{''.join(', d{} = 1UL << {}'.format(i, 'id{}'.format(i) if ids == None else ids[i]) for i in range (1, nqubits))}", \
         d_args      = ''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else '', \
         n           = 'n = 1' + ''.join(' + d{}'.format(i) for i in range (0, nqubits)), \
-        dsorted     = f"dsorted[] = {{ d{nqubits - 1}" + ''.join(', d{}'.format(nqubits - i - 1) for i in range (1, nqubits)) + f" }}", \
+        dsorted     = (f"dsorted[] = {{ d{nqubits - 1}" + ''.join(', d{}'.format(nqubits - i - 1) for i in range (1, nqubits)) + f" }}") if ids == None else (f"dsorted0 = 1UL << {ids_sorted[0]}{''.join(', dsorted{} = 1UL << {}'.format(i, ids_sorted[i]) for i in range (1, nqubits))}"), \
+        dsorted_0    = "dsorted[0]" if ids == None else "dsorted0", \
+        dsorted_last = f"dsorted[{nqubits - 1}]" if ids == None else f"dsorted{nqubits - 1}", \
         sort        = f'std::sort(dsorted, dsorted + {nqubits}, std::greater<std::size_t>());{newline}' if ids == None else '', \
         collapse    = f"{nqubits + 1}", \
         offset    = ''.join('    '.format(i) for i in range (0, nqubits)), \
@@ -121,7 +122,9 @@ def rhs(n, j, i):
         offset_2    = ''.join('    '.format(i) for i in range (0, nqubits + 2)), \
         d_template  = ('<d0' + ''.join(', d{}'.format(i) for i in range (1, nqubits)) + '>') if ids != None else '', \
         i           = 'i0' + ''.join(' + i{}'.format(i) for i in range (1, nqubits + 1)), \
-        for_loops   = ''.join('{}for (std::size_t i{} = 0; i{} < dsorted[{}]; i{} += 2 * dsorted[{}]){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits)))
+        for_loops   = ''.join('{}for (std::size_t i{} = 0; i{} < dsorted{left}{}{right}; i{} += 2 * dsorted{left}{}{right}){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline,
+            left  = '[' if ids == None else '', \
+            right = ']' if ids == None else '') for i in range (1, nqubits)))
 
     return kernel
 

From d6c105404b572afd8a384beb251549f50d1c166f Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 24 Aug 2022 21:45:56 +0200
Subject: [PATCH 27/82] Use 'v' values as scalars instead of an array, in case
 of runtime compilation. On the other hand, we could possibly use arrays for
 matrix operations?

---
 third_party/cppsim/include/nointrin/kernelgen.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 61336b79..452e1f2f 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -24,9 +24,9 @@ def kernelgen(nqubits, ids=None):
 
     def rhs(n, j, i):
         if i < n - 1:
-            return f'add(mul(v[{i}], M({j}, {i})), ' + rhs(n, j, i + 1)
+            return f'add(mul(v_{i}, M({j}, {i})), ' + rhs(n, j, i + 1)
         else:
-            return f'mul(v[{i}], M({j}, {i})' + ''.join(')' for k in range(0, n))
+            return f'mul(v_{i}, M({j}, {i})' + ''.join(')' for k in range(0, n))
 
     # Pretty-print the right hand sides (recursively).
     strrhs = [] 
@@ -56,8 +56,7 @@ def rhs(n, j, i):
 template<{d_template}class T>
 inline void kernel_core(T* psi, std::size_t I{d_var}, const T* m)
 {{
-    std::array {v_array};
-
+    {v_assign}
     {psi_assign}
 }}
 
@@ -69,7 +68,7 @@ def rhs(n, j, i):
         nqubits    = nqubits, \
         d_template = ''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else '', \
         d_var      = ''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else '', \
-        v_array    = f"v = {{{newline}" + ''.join('{}{},{}'.format(' ' * 8, strcombs[i], newline) for i in range(0, len(strcombs))) + "{}}}".format(' ' * 4), \
+        v_assign    = ''.join('const auto v_{} = {};{}{}'.format(i, strcombs[i], newline, ' ' * 4) for i in range(0, len(strcombs))), \
         psi_assign = ''.join('{} = {};{}    '.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))) + \
 """
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)

From b3a7c6a838092d40f40ca7d23bb110c6a1d65098 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 25 Aug 2022 16:14:58 +0200
Subject: [PATCH 28/82] Using Eigen to do matrix-vector multiplication in
 kernel_core. Eigen should be able to generate vectorized matrix multiplies
 (given that the compiler options are set). Matrix-vector itself is not yet
 efficient though, moreover the gaps in 'psi' elements are bad for memory
 efficiency

---
 third_party/cppsim/.gitmodules                |  3 ++
 third_party/cppsim/CMakeLists.txt             |  1 +
 third_party/cppsim/ThirdParty/eigen           |  1 +
 .../cppsim/include/nointrin/kernelgen.py      | 32 ++++++++++++++-----
 third_party/cppsim/src/test/test_nointrin.cpp |  7 ++--
 5 files changed, 33 insertions(+), 11 deletions(-)
 create mode 160000 third_party/cppsim/ThirdParty/eigen

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
index dcb9d6d6..67d6ff8b 100644
--- a/third_party/cppsim/.gitmodules
+++ b/third_party/cppsim/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "build/ThirdParty/digestpp"]
 	path = ThirdParty/digestpp
 	url = https://github.com/kerukuro/digestpp.git
+[submodule "ThirdParty/eigen"]
+	path = ThirdParty/eigen
+	url = https://gitlab.com/libeigen/eigen.git
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index e95da456..7c77edb8 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -50,6 +50,7 @@ endmacro()
 add_executable(test_nointrin "src/test/test_nointrin.cpp")
 set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
 target_link_libraries(test_nointrin PRIVATE gtest kernelgen)
 kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
diff --git a/third_party/cppsim/ThirdParty/eigen b/third_party/cppsim/ThirdParty/eigen
new file mode 160000
index 00000000..a7c1cac1
--- /dev/null
+++ b/third_party/cppsim/ThirdParty/eigen
@@ -0,0 +1 @@
+Subproject commit a7c1cac18bfef26ec61a73c1619ccf0f9b734745
diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 452e1f2f..9a70fc15 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -3,7 +3,12 @@
 import itertools
 import os
 
-def kernelgen(nqubits, ids=None):
+def kernelgen(nqubits, ids=None, matvec=True):
+    # Temporary falling back to no-matvec for runtime-generated
+    # kernels, because we don't have Eigen included there yet.
+    if ids:
+        matvec = False
+
     # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
     # 0 0
     # 1 0
@@ -22,13 +27,19 @@ def kernelgen(nqubits, ids=None):
         strcomb += ']';
         strcombs.append(strcomb)
 
+    left = '_'
+    right = ''
+    if matvec:
+    	left = '['
+    	right = ']'
+
+    # Pretty-print the right hand sides (recursively).
     def rhs(n, j, i):
         if i < n - 1:
-            return f'add(mul(v_{i}, M({j}, {i})), ' + rhs(n, j, i + 1)
+            return f'add(mul(v{left}{i}{right}, M({j}, {i})), ' + rhs(n, j, i + 1)
         else:
-            return f'mul(v_{i}, M({j}, {i})' + ''.join(')' for k in range(0, n))
+            return f'mul(v{left}{i}{right}, M({j}, {i})' + ''.join(')' for k in range(0, n))
 
-    # Pretty-print the right hand sides (recursively).
     strrhs = [] 
     for j in range(0, len(strcombs)):
         strrhs.append(rhs(len(strcombs), j, 0))
@@ -47,16 +58,18 @@ def rhs(n, j, i):
 {include} <array>
 {include} <complex>
 {include} <cstdlib>
+{eigen}
 
 {define} add(a, b) (a + b)
 {define} mul(a, b) (a * b)
 
-{define} M(j, i) (m[j * {nqubits} + i])
+{define} M(j, i) (m[j * {n} + i])
 
 template<{d_template}class T>
 inline void kernel_core(T* psi, std::size_t I{d_var}, const T* m)
 {{
-    {v_assign}
+    {v}
+    {matvec}
     {psi_assign}
 }}
 
@@ -66,10 +79,13 @@ def rhs(n, j, i):
         define     = "#define", \
         undef      = "#undef", \
         nqubits    = nqubits, \
+        eigen      = "{define} EIGEN_DEFAULT_DENSE_INDEX_TYPE int{newline}{define} EIGEN_VECTORIZE{newline}{include} <Eigen/Dense>".format(define="#define", newline=newline, include = "#include") if matvec else '', \
+        n          = len(strcombs),
         d_template = ''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else '', \
         d_var      = ''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else '', \
-        v_assign    = ''.join('const auto v_{} = {};{}{}'.format(i, strcombs[i], newline, ' ' * 4) for i in range(0, len(strcombs))), \
-        psi_assign = ''.join('{} = {};{}    '.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs)))) + \
+        v          = f"const std::array v = {{{newline}" + ''.join('{}{},{}'.format(' ' * 8, strcombs[i], newline) for i in range(0, len(strcombs))) + "{}}};{}".format(' ' * 4, newline) if matvec else ''.join('const auto v_{} = {};{}{}'.format(i, strcombs[i], newline, ' ' * 4) for i in range(0, len(strcombs))), \
+        matvec     = "const auto result = Eigen::Map<const Eigen::Matrix<T, {n}, {n}, Eigen::RowMajor>>(m) * Eigen::Map<const Eigen::Vector<T, {n}>>(v.data());{newline}".format(n = len(strcombs), newline = newline) if matvec else '', \
+        psi_assign = ''.join('{} = {};{}    '.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs))) if not matvec else ''.join('{} = result[{}];{}    '.format(strcombs[i], i, newline) for i in range(0, len(strcombs)))) + \
 """
 // bit indices id[.] are given from high to low (e.g. control first for CNOT)
 template<class T>
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 0fb9096f..09293eec 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -24,10 +24,11 @@ template<int nqubits, typename Kernels, typename V>
 bool compare(Kernels kernels, V& psi1)
 {
 	std::default_random_engine dre;
+	dre.seed(0);
 	std::uniform_int_distribution<int> uid(-1000, 1000);
 
 	// Generate m matrix as integers.
-	std::array<std::array<int, nqubits>, nqubits> m;
+	std::array<std::array<int, 1UL << nqubits>, 1UL << nqubits> m;
 	for (int j = 0; j < m.size(); j++)
 		for (int i = 0; i < m.size(); i++)
 			m[j][i] = uid(dre);
@@ -49,10 +50,10 @@ bool compare(Kernels kernels, V& psi1)
 		return true;
 
 	if (diff2.first != psi1.end())
-		std::cout << "Mismatch at " << std::distance(psi1.begin(), diff2.first) <<
+		std::cout << "Mismatch in psi2 at " << std::distance(psi1.begin(), diff2.first) <<
 			" : " << *(diff2.first) << " != " << *(diff2.second) << std::endl;
 	if (diff3.first != psi1.end())
-		std::cout << "Mismatch at " << std::distance(psi1.begin(), diff3.first) <<
+		std::cout << "Mismatch in psi3 at " << std::distance(psi1.begin(), diff3.first) <<
 			" : " << *(diff3.first) << " != " << *(diff3.second) << std::endl;
 
 	return false;

From 39af62c516c51f40b835ea8ab93c66e622041dd7 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 25 Aug 2022 19:03:24 +0200
Subject: [PATCH 29/82] Letting runtime compilation use the system-provided
 Eigen library. The compilation times with Eigen are an order of magnitude
 faster now; feels like the time is flat

---
 third_party/cppsim/include/nointrin/kernelgen.py | 5 -----
 third_party/cppsim/src/benchmark/benchmark.cpp   | 2 +-
 third_party/cppsim/src/compiler.cpp              | 2 +-
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 9a70fc15..04848d70 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -4,11 +4,6 @@
 import os
 
 def kernelgen(nqubits, ids=None, matvec=True):
-    # Temporary falling back to no-matvec for runtime-generated
-    # kernels, because we don't have Eigen included there yet.
-    if ids:
-        matvec = False
-
     # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
     # 0 0
     # 1 0
diff --git a/third_party/cppsim/src/benchmark/benchmark.cpp b/third_party/cppsim/src/benchmark/benchmark.cpp
index 900beb63..0e8683a2 100644
--- a/third_party/cppsim/src/benchmark/benchmark.cpp
+++ b/third_party/cppsim/src/benchmark/benchmark.cpp
@@ -21,7 +21,7 @@ bool benchmark()
 	std::uniform_int_distribution<int> uid(-1000, 1000);
 
 	// Generate m matrix as integers.
-	std::array<std::array<int, nqubits>, nqubits> m;
+	std::array<std::array<int, 1UL << nqubits>, 1UL << nqubits> m;
 	for (int j = 0; j < m.size(); j++)
 		for (int i = 0; i < m.size(); i++)
 			m[j][i] = uid(dre);
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index e72a27d2..584021e7 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -150,7 +150,7 @@ void* Compiler::codegen(
 		ss << " -g -O3 -ffast-math -fopenmp -std=c++17 -x c++ ";
 #endif
 		ss << filename;
-		ss << " -fPIC -shared -o";
+		ss << " -I/usr/include/eigen3 -fPIC -shared -o";
 		ss << binname;
 		ss << " >";
 		ss << errlog;

From e49fab74dee7854e542806f2a354fdcb109f5bc8 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 26 Aug 2022 16:37:11 +0200
Subject: [PATCH 30/82] Working on adding combinations headers

---
 third_party/cppsim/CMakeLists.txt             |   2 +-
 third_party/cppsim/include/combinations.h     | 242 ++++++++++++++++++
 third_party/cppsim/include/gpu_support.h      |  68 +++++
 .../cppsim/include/nointrin/kernelgen.py      |  21 +-
 4 files changed, 323 insertions(+), 10 deletions(-)
 create mode 100644 third_party/cppsim/include/combinations.h
 create mode 100644 third_party/cppsim/include/gpu_support.h

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 7c77edb8..53140f05 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -37,7 +37,7 @@ macro(kernelgen)
 	# Call generator.
 	add_custom_command(
 		OUTPUT ${KERNEL_PATH}
-		COMMAND ${Python3_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
+		COMMAND ${Python3_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH} --combinations=True
 		COMMENT "Generating kernel for ${NQUBITS} qubits"
 		DEPENDS ${KERNELGEN})
 	set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
diff --git a/third_party/cppsim/include/combinations.h b/third_party/cppsim/include/combinations.h
new file mode 100644
index 00000000..b4565749
--- /dev/null
+++ b/third_party/cppsim/include/combinations.h
@@ -0,0 +1,242 @@
+#ifndef COMBINATIONS_H
+#define COMBINATIONS_H
+
+#define COMBINATIONS_RETVAL(expr) \
+	if constexpr (std::is_same<Result, bool>::value) \
+	{ \
+		if (!(expr)) return false; \
+	} \
+	else \
+		expr;
+
+#define COMBINATIONS_RETVAL_TRUE() \
+	if constexpr (std::is_same<Result, bool>::value) \
+		return true; \
+
+#include "gpu_support.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdio>
+#include <stdint.h>
+#include <type_traits>
+
+namespace combinations {
+
+// Iterate through the combinations using currying approach: https://stackoverflow.com/a/54508163/4063520
+class Combinations
+{
+// *****************************************************************************
+// Simple case: no sum constraint
+// *****************************************************************************
+private :
+
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m, // max allowed sequence element value
+		typename Result = void, // use bool return type to continue or stop
+		class Callable
+	>
+	GPU_SUPPORT
+	static constexpr Result _iterate(Callable&& c)
+	{
+		static_assert(k > 0);
+		for (uint32_t i = 0; i <= m; i++)
+		{
+			if constexpr(k == 1)
+			{
+				COMBINATIONS_RETVAL( c(i) );
+			}
+			else
+			{
+				auto bind_an_argument = [i, &c](auto... args)
+				{
+					COMBINATIONS_RETVAL(( c(i, args...) ));
+					COMBINATIONS_RETVAL_TRUE();
+				};
+
+				COMBINATIONS_RETVAL(( _iterate<k - 1, m, Result>(bind_an_argument) ));
+			}
+		}
+		
+		COMBINATIONS_RETVAL_TRUE();
+	}
+
+public :
+
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m  // max allowed sequence element value
+	>
+	struct Combination
+	{
+		using type = std::array<uint32_t, k>;
+	};
+
+	// Iterate through all combinations.
+	// For each combination, call a user-provided function.
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m, // max allowed sequence element value
+		typename Result = void, // use bool return type to continue or stop
+		class Callable
+	>
+	GPU_SUPPORT
+	static constexpr Result iterate(Callable&& c)
+	{
+		COMBINATIONS_RETVAL(( _iterate<k, m, Result>(c) ));
+		COMBINATIONS_RETVAL_TRUE();
+	}
+
+	// Tell the length of a combination supplied by the iterator
+	// configured with the given set of template parameters.
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m  // max allowed sequence element value
+	>
+	static constexpr uint32_t length()
+	{
+		return k;
+	}
+
+	// Tell the number of combinations supplied by the iterator
+	// configured with the given set of template parameters.
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m  // max allowed sequence element value
+	>
+	static uint32_t popcount()
+	{
+		if (k == 0) return 0;
+
+		uint32_t result = 1;
+		for (int i = 0; i < k; i++)
+			result *= m + 1;
+
+		return result;
+	}
+
+	// Tell the number of combinations supplied by the iterator
+	// configured with the given set of runtime parameters.
+	static uint32_t popcount(
+		const uint32_t k, // length of sequence
+		const uint32_t m) // max allowed sequence element value
+	{
+		if (k == 0) return 0;
+
+		uint32_t result = 1;
+		for (int i = 0; i < k; i++)
+			result *= m + 1;
+
+		return result;
+	}
+
+	// Reverse the order of elements in a combination
+	// configured with the given set of template parameters.	
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m  // max allowed sequence element value
+	>
+	static void reverse(typename Combination<k, m>::type& c)
+	{
+		std::reverse(c.begin(), c.end());
+	}
+
+	// Reverse the order of elements in a combination
+	// configured with the given set of runtime parameters.	
+	static void reverse(
+		const uint32_t k, // length of sequence
+		const uint32_t m, // max allowed sequence element value
+		uint32_t* c)
+	{
+		std::reverse(c, c + k);
+	}
+
+// *****************************************************************************
+// Simple case: no sum constraint with a user-defined range
+// *****************************************************************************
+public :
+
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m, // max allowed sequence element value
+		typename Result = void, // use bool return type to continue or stop
+		class Callable
+	>
+	GPU_SUPPORT
+	static constexpr Result _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
+	{
+		static_assert(k > 0);
+		for (uint32_t i = start[k - 1]; (i <= m) && limit; i++)
+		{
+			// Flush starting point to zero, in order for all subsequent iterations
+			// to start from zero as usual.
+			start[k - 1] = 0;
+
+			if constexpr(k == 1)
+			{
+				COMBINATIONS_RETVAL( c(i) );
+				limit--;
+			}
+			else
+			{
+				auto bind_an_argument = [i, &c](auto... args)
+				{
+					COMBINATIONS_RETVAL(( c(i, args...) ));
+					COMBINATIONS_RETVAL_TRUE();
+				};
+
+				COMBINATIONS_RETVAL(( _iterate<k - 1, m, Result>(start, limit, bind_an_argument) ));
+			}
+		}
+		
+		COMBINATIONS_RETVAL_TRUE();
+	}
+
+public :
+
+	// Iterate through combinations with specific starting point and duration.
+	// For each combination, call a user-provided function.
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m, // max allowed sequence element value
+		typename Result = void, // use bool return type to continue or stop
+		class Callable
+	>
+	GPU_SUPPORT
+	static constexpr Result iterate(const std::array<uint32_t, k>& start_, const uint32_t limit_, Callable&& c)
+	{
+		std::array<uint32_t, k> start = start_;
+		uint32_t limit = limit_;
+		COMBINATIONS_RETVAL(( _iterate<k, m, Result>(start.data(), limit, c) ));
+		COMBINATIONS_RETVAL_TRUE();
+	}
+
+	// Tell the number of combinations supplied by the iterator
+	// configured with the given set of template parameters.
+	template<
+		uint32_t k, // length of sequence
+		uint32_t m  // max allowed sequence element value
+	>
+	static uint32_t popcount(const uint32_t limit)
+	{
+		// TODO Actually could be less than limit, if start is closer to the end.
+		return limit;
+	}
+
+	// Tell the number of combinations supplied by the iterator
+	// configured with the given set of runtime parameters.
+	static uint32_t popcount(
+		const uint32_t k, // length of sequence
+		const uint32_t m, // max allowed sequence element value
+		const uint32_t limit)
+	{
+		// TODO Actually could be less than limit, if start is closer to the end.
+		return limit;
+	}
+};
+
+} // namespace combinations
+
+#endif // COMBINATIONS_H
+
diff --git a/third_party/cppsim/include/gpu_support.h b/third_party/cppsim/include/gpu_support.h
new file mode 100644
index 00000000..b513fc80
--- /dev/null
+++ b/third_party/cppsim/include/gpu_support.h
@@ -0,0 +1,68 @@
+#ifndef GPU_SUPPORT_H
+#define GPU_SUPPORT_H
+
+#ifdef __CUDACC__
+#include <cuda.h>
+#define GPU_SUPPORT __host__ __device__
+#else
+#define GPU_SUPPORT
+#endif
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+namespace gpu {
+
+// TODO 128 threads per block should be preferred,
+// but also need to respect the upper limit that could
+// be introduced by the kernel (e.g. due to the user code,
+// which requires a lot of registers).
+constexpr const int nthreadsPerBlock = 128;
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+#ifdef __CUDACC__
+constexpr const auto gpuSuccess = cudaSuccess;
+constexpr const auto GPU_SUCCESS = CUDA_SUCCESS;
+#else
+constexpr const auto gpuSuccess = hipSuccess;
+constexpr const auto GPU_SUCCESS = HIP_SUCCESS;
+#endif
+
+template<typename gpuError_t>
+void checkErrorStatus(gpuError_t status)
+{
+	if (status == gpuSuccess) return;
+
+	std::stringstream ss;
+	ss << "GPU runtime error, errno = ";
+	ss << status;
+	ss << " (";
+#if defined(__CUDACC__)
+	ss << cudaGetErrorString(status);
+#elif defined(__HIPCC__)
+	ss << hipGetErrorString(status);
+#endif
+	ss << ")";
+	std::string errorString = ss.str();
+	throw std::invalid_argument(errorString);
+}
+
+template<typename CUresult>
+void checkErrorStatusDriver(CUresult status)
+{
+	if (status == GPU_SUCCESS) return;
+	
+	std::stringstream ss;
+	ss << "GPU driver runtime error, errno = ";
+	ss << status;
+	std::string errorString = ss.str();
+	throw std::invalid_argument(errorString);		
+}
+
+#endif // defined(__CUDACC__) || defined(__HIPCC__)
+
+} // namespace gpu
+
+#endif // GPU_SUPPORT_H
+
diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 04848d70..78376db1 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -3,7 +3,7 @@
 import itertools
 import os
 
-def kernelgen(nqubits, ids=None, matvec=True):
+def kernelgen(nqubits, ids=None, matvec=True, combinations=False):
     # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
     # 0 0
     # 1 0
@@ -54,6 +54,7 @@ def rhs(n, j, i):
 {include} <complex>
 {include} <cstdlib>
 {eigen}
+{combinations}
 
 {define} add(a, b) (a + b)
 {define} mul(a, b) (a * b)
@@ -68,6 +69,8 @@ def rhs(n, j, i):
     {psi_assign}
 }}
 
+{undef} add
+{undef} mul
 {undef} M
 """.format( \
         include    = "#include", \
@@ -75,6 +78,7 @@ def rhs(n, j, i):
         undef      = "#undef", \
         nqubits    = nqubits, \
         eigen      = "{define} EIGEN_DEFAULT_DENSE_INDEX_TYPE int{newline}{define} EIGEN_VECTORIZE{newline}{include} <Eigen/Dense>".format(define="#define", newline=newline, include = "#include") if matvec else '', \
+        combinations = "#include \"combinations.h\"" if combinations else '', \
         n          = len(strcombs),
         d_template = ''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else '', \
         d_var      = ''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else '', \
@@ -109,12 +113,7 @@ def rhs(n, j, i):
     }}
 }}
 
-{undef} add
-{undef} mul
-
 """.format( \
-        define      = "#define", \
-        undef       = "#undef", \
         pragma      = "#pragma", \
         nqubits     = nqubits, \
         id_var      = ''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits)) if ids == None else '', \
@@ -140,17 +139,21 @@ def rhs(n, j, i):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Generate Haener-Steiger quantum kernels in the form used in ProjectQ simulator')
-    parser.add_argument('nqubits', type=int, help='The number of qubits to generate the kernel for')
-    parser.add_argument('output', type=str, help='Output file name')
+    parser.add_argument('nqubits', type=int, help='the number of qubits to generate the kernel for')
+    parser.add_argument('output', type=str, help='output file name')
+    parser.add_argument('-matvec', '--matvec', nargs='?', type=lambda s: s.strip().lower() not in ('', '0', 'false', 'no', 'off'), default=True, help='kernel core implementation: as a matrix-vector operation (True, default), or as an explicit formula (False)')  # not type=bool: bool('False') is True
+    parser.add_argument('-combinations', '--combinations', nargs='?', type=lambda s: s.strip().lower() not in ('', '0', 'false', 'no', 'off'), default=True, help='multithreading implementation: combinations partitioner (True, default), or OpenMP collapse (False, only for CPU with small number of cores)')  # same parsing rule as --matvec
     args = parser.parse_args()
     
     nqubits = int(args.nqubits)
     output = args.output
+    matvec = args.matvec
+    combinations = args.combinations
 
     try:
         os.makedirs(os.path.dirname(output))
     except:
         pass
     with open(output, "w") as o:
-        o.write(kernelgen(nqubits))
+        o.write(kernelgen(nqubits, matvec=matvec, combinations=combinations))
 

From 25527d5112b1026eb058d1c8292ee8067bdff0bd Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 26 Aug 2022 19:54:33 +0200
Subject: [PATCH 31/82] Preparing combinations for serving the kernels

---
 third_party/cppsim/include/combinations.h | 206 ++++++++--------------
 1 file changed, 70 insertions(+), 136 deletions(-)

diff --git a/third_party/cppsim/include/combinations.h b/third_party/cppsim/include/combinations.h
index b4565749..a0cbd4d5 100644
--- a/third_party/cppsim/include/combinations.h
+++ b/third_party/cppsim/include/combinations.h
@@ -1,18 +1,6 @@
 #ifndef COMBINATIONS_H
 #define COMBINATIONS_H
 
-#define COMBINATIONS_RETVAL(expr) \
-	if constexpr (std::is_same<Result, bool>::value) \
-	{ \
-		if (!(expr)) return false; \
-	} \
-	else \
-		expr;
-
-#define COMBINATIONS_RETVAL_TRUE() \
-	if constexpr (std::is_same<Result, bool>::value) \
-		return true; \
-
 #include "gpu_support.h"
 
 #include <algorithm>
@@ -32,165 +20,128 @@ class Combinations
 private :
 
 	template<
-		uint32_t k, // length of sequence
-		uint32_t m, // max allowed sequence element value
-		typename Result = void, // use bool return type to continue or stop
-		class Callable
+		class Callable,
+		uint32_t n0, uint32_t n1, uint32_t ...n // max allowed sequence element value
 	>
 	GPU_SUPPORT
-	static constexpr Result _iterate(Callable&& c)
+	static constexpr void _iterate(Callable&& c)
 	{
-		static_assert(k > 0);
-		for (uint32_t i = 0; i <= m; i++)
+		for (uint32_t i = 0; i <= n0; i += 2 * n1)
 		{
-			if constexpr(k == 1)
-			{
-				COMBINATIONS_RETVAL( c(i) );
-			}
-			else
+			auto bind_an_argument = [i, &c](auto... args)
 			{
-				auto bind_an_argument = [i, &c](auto... args)
-				{
-					COMBINATIONS_RETVAL(( c(i, args...) ));
-					COMBINATIONS_RETVAL_TRUE();
-				};
-
-				COMBINATIONS_RETVAL(( _iterate<k - 1, m, Result>(bind_an_argument) ));
-			}
+				c(i, args...);
+			};
+
+			_iterate<n1, n...>(bind_an_argument);
 		}
-		
-		COMBINATIONS_RETVAL_TRUE();
 	}
 
-public :
-
 	template<
-		uint32_t k, // length of sequence
-		uint32_t m  // max allowed sequence element value
+		class Callable,
+		uint32_t n0 // max allowed sequence element value
 	>
+	GPU_SUPPORT
+	static constexpr void _iterate(Callable&& c)
+	{
+		for (uint32_t i = 0; i <= n0; i++)
+		{
+			c(i);
+		}
+	}
+
+
+public :
+
+	template<uint32_t ...n>
 	struct Combination
 	{
-		using type = std::array<uint32_t, k>;
+		using type = std::array<uint32_t, sizeof...(n)>;
 	};
 
 	// Iterate through all combinations.
 	// For each combination, call a user-provided function.
 	template<
-		uint32_t k, // length of sequence
-		uint32_t m, // max allowed sequence element value
-		typename Result = void, // use bool return type to continue or stop
-		class Callable
+		class Callable,
+		uint32_t ...n // max allowed sequence element value
 	>
 	GPU_SUPPORT
-	static constexpr Result iterate(Callable&& c)
+	static constexpr void iterate(Callable&& c)
 	{
-		COMBINATIONS_RETVAL(( _iterate<k, m, Result>(c) ));
-		COMBINATIONS_RETVAL_TRUE();
+		_iterate<n...>(c);
 	}
 
 	// Tell the length of a combination supplied by the iterator
 	// configured with the given set of template parameters.
-	template<
-		uint32_t k, // length of sequence
-		uint32_t m  // max allowed sequence element value
-	>
+	template<uint32_t ...n>
 	static constexpr uint32_t length()
 	{
-		return k;
+		return sizeof...(n);
 	}
 
 	// Tell the number of combinations supplied by the iterator
 	// configured with the given set of template parameters.
-	template<
-		uint32_t k, // length of sequence
-		uint32_t m  // max allowed sequence element value
-	>
+	template<uint32_t ...n>
 	static uint32_t popcount()
 	{
-		if (k == 0) return 0;
+		if (sizeof...(n) == 0) return 0;
 
 		uint32_t result = 1;
-		for (int i = 0; i < k; i++)
-			result *= m + 1;
-
-		return result;
-	}
-
-	// Tell the number of combinations supplied by the iterator
-	// configured with the given set of runtime parameters.
-	static uint32_t popcount(
-		const uint32_t k, // length of sequence
-		const uint32_t m) // max allowed sequence element value
-	{
-		if (k == 0) return 0;
-
-		uint32_t result = 1;
-		for (int i = 0; i < k; i++)
-			result *= m + 1;
-
+		// TODO
 		return result;
 	}
 
 	// Reverse the order of elements in a combination
 	// configured with the given set of template parameters.	
-	template<
-		uint32_t k, // length of sequence
-		uint32_t m  // max allowed sequence element value
-	>
-	static void reverse(typename Combination<k, m>::type& c)
+	template<uint32_t ...n>
+	static void reverse(typename Combination<n...>::type& c)
 	{
 		std::reverse(c.begin(), c.end());
 	}
 
-	// Reverse the order of elements in a combination
-	// configured with the given set of runtime parameters.	
-	static void reverse(
-		const uint32_t k, // length of sequence
-		const uint32_t m, // max allowed sequence element value
-		uint32_t* c)
-	{
-		std::reverse(c, c + k);
-	}
-
 // *****************************************************************************
 // Simple case: no sum constraint with a user-defined range
 // *****************************************************************************
 public :
 
 	template<
-		uint32_t k, // length of sequence
-		uint32_t m, // max allowed sequence element value
-		typename Result = void, // use bool return type to continue or stop
-		class Callable
+		class Callable,
+		uint32_t n0, uint32_t n1, uint32_t ...n // max allowed sequence element value
 	>
 	GPU_SUPPORT
-	static constexpr Result _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
+	static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
 	{
-		static_assert(k > 0);
-		for (uint32_t i = start[k - 1]; (i <= m) && limit; i++)
+		for (uint32_t i = start; (i <= n0) && limit; i += 2 * n1)
 		{
 			// Flush starting point to zero, in order for all subsequent iterations
 			// to start from zero as usual.
-			start[k - 1] = 0;
+			*start = 0;
 
-			if constexpr(k == 1)
-			{
-				COMBINATIONS_RETVAL( c(i) );
-				limit--;
-			}
-			else
+			auto bind_an_argument = [i, &c](auto... args)
 			{
-				auto bind_an_argument = [i, &c](auto... args)
-				{
-					COMBINATIONS_RETVAL(( c(i, args...) ));
-					COMBINATIONS_RETVAL_TRUE();
-				};
-
-				COMBINATIONS_RETVAL(( _iterate<k - 1, m, Result>(start, limit, bind_an_argument) ));
-			}
+				c(i, args...);
+			};
+
+			_iterate<n1, n...>(start++, limit, bind_an_argument);
+		}
+	}
+
+	template<
+		class Callable,
+		uint32_t n0 // max allowed sequence element value
+	>
+	GPU_SUPPORT
+	static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
+	{
+		for (uint32_t i = start; (i <= n0) && limit; i++)
+		{
+			// Flush starting point to zero, in order for all subsequent iterations
+			// to start from zero as usual.
+			*start = 0;
+
+			c(i);
+			limit--;
 		}
-		
-		COMBINATIONS_RETVAL_TRUE();
 	}
 
 public :
@@ -198,42 +149,25 @@ public :
 	// Iterate through combinations with specific starting point and duration.
 	// For each combination, call a user-provided function.
 	template<
-		uint32_t k, // length of sequence
-		uint32_t m, // max allowed sequence element value
-		typename Result = void, // use bool return type to continue or stop
-		class Callable
+		class Callable,
+		uint32_t ...n // max allowed sequence element value
 	>
 	GPU_SUPPORT
-	static constexpr Result iterate(const std::array<uint32_t, k>& start_, const uint32_t limit_, Callable&& c)
+	static constexpr void iterate(const typename Combination<n...>::type& start_, const uint32_t limit_, Callable&& c)
 	{
-		std::array<uint32_t, k> start = start_;
+		auto start = start_;
 		uint32_t limit = limit_;
-		COMBINATIONS_RETVAL(( _iterate<k, m, Result>(start.data(), limit, c) ));
-		COMBINATIONS_RETVAL_TRUE();
+		_iterate<n...>(start.data(), limit, c);
 	}
 
 	// Tell the number of combinations supplied by the iterator
 	// configured with the given set of template parameters.
-	template<
-		uint32_t k, // length of sequence
-		uint32_t m  // max allowed sequence element value
-	>
+	template<uint32_t ...n>
 	static uint32_t popcount(const uint32_t limit)
 	{
 		// TODO Actually could be less than limit, if start is closer to the end.
 		return limit;
 	}
-
-	// Tell the number of combinations supplied by the iterator
-	// configured with the given set of runtime parameters.
-	static uint32_t popcount(
-		const uint32_t k, // length of sequence
-		const uint32_t m, // max allowed sequence element value
-		const uint32_t limit)
-	{
-		// TODO Actually could be less than limit, if start is closer to the end.
-		return limit;
-	}
 };
 
 } // namespace combinations

From c915f44f5ae67b33fee9ef97e5eb5cdbc68a7217 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 29 Aug 2022 17:43:16 +0200
Subject: [PATCH 32/82] Fixing bugs in combinations, adding a test to compare
 the reference kernel5 against kernel5 with combinations loop

---
 third_party/cppsim/CMakeLists.txt             |   6 +
 third_party/cppsim/include/combinations.h     |  36 ++-
 .../cppsim/src/test/test_combinations.cpp     | 218 ++++++++++++++++++
 3 files changed, 240 insertions(+), 20 deletions(-)
 create mode 100644 third_party/cppsim/src/test/test_combinations.cpp

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 53140f05..1975d6ca 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -58,6 +58,12 @@ kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
 
+add_executable(test_combinations "src/test/test_combinations.cpp")
+set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
+target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
+target_link_libraries(test_combinations PRIVATE gtest kernelgen)
+
 add_executable(benchmark "src/benchmark/benchmark.cpp")
 target_link_libraries(benchmark PRIVATE gtest kernelgen)
 
diff --git a/third_party/cppsim/include/combinations.h b/third_party/cppsim/include/combinations.h
index a0cbd4d5..68e08c0d 100644
--- a/third_party/cppsim/include/combinations.h
+++ b/third_party/cppsim/include/combinations.h
@@ -9,8 +9,6 @@
 #include <stdint.h>
 #include <type_traits>
 
-namespace combinations {
-
 // Iterate through the combinations using currying approach: https://stackoverflow.com/a/54508163/4063520
 class Combinations
 {
@@ -20,13 +18,13 @@ class Combinations
 private :
 
 	template<
-		class Callable,
-		uint32_t n0, uint32_t n1, uint32_t ...n // max allowed sequence element value
+		uint32_t n0, uint32_t n1, uint32_t ...n, // max allowed sequence element value
+		class Callable
 	>
 	GPU_SUPPORT
 	static constexpr void _iterate(Callable&& c)
 	{
-		for (uint32_t i = 0; i <= n0; i += 2 * n1)
+		for (uint32_t i = 0; i < n0; i += 2 * n1)
 		{
 			auto bind_an_argument = [i, &c](auto... args)
 			{
@@ -38,13 +36,13 @@ private :
 	}
 
 	template<
-		class Callable,
-		uint32_t n0 // max allowed sequence element value
+		uint32_t n0, // max allowed sequence element value
+		class Callable
 	>
 	GPU_SUPPORT
 	static constexpr void _iterate(Callable&& c)
 	{
-		for (uint32_t i = 0; i <= n0; i++)
+		for (uint32_t i = 0; i < n0; i++)
 		{
 			c(i);
 		}
@@ -62,8 +60,8 @@ public :
 	// Iterate through all combinations.
 	// For each combination, call a user-provided function.
 	template<
-		class Callable,
-		uint32_t ...n // max allowed sequence element value
+		uint32_t ...n, // max allowed sequence element value
+		class Callable
 	>
 	GPU_SUPPORT
 	static constexpr void iterate(Callable&& c)
@@ -105,13 +103,13 @@ public :
 public :
 
 	template<
-		class Callable,
-		uint32_t n0, uint32_t n1, uint32_t ...n // max allowed sequence element value
+		uint32_t n0, uint32_t n1, uint32_t ...n, // max allowed sequence element value
+		class Callable
 	>
 	GPU_SUPPORT
 	static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
 	{
-		for (uint32_t i = start; (i <= n0) && limit; i += 2 * n1)
+		for (uint32_t i = start; (i < n0) && limit; i += 2 * n1)
 		{
 			// Flush starting point to zero, in order for all subsequent iterations
 			// to start from zero as usual.
@@ -127,13 +125,13 @@ public :
 	}
 
 	template<
-		class Callable,
-		uint32_t n0 // max allowed sequence element value
+		uint32_t n0, // max allowed sequence element value
+		class Callable
 	>
 	GPU_SUPPORT
 	static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
 	{
-		for (uint32_t i = start; (i <= n0) && limit; i++)
+		for (uint32_t i = start; (i < n0) && limit; i++)
 		{
 			// Flush starting point to zero, in order for all subsequent iterations
 			// to start from zero as usual.
@@ -149,8 +147,8 @@ public :
 	// Iterate through combinations with specific starting point and duration.
 	// For each combination, call a user-provided function.
 	template<
-		class Callable,
-		uint32_t ...n // max allowed sequence element value
+		uint32_t ...n, // max allowed sequence element value
+		class Callable
 	>
 	GPU_SUPPORT
 	static constexpr void iterate(const typename Combination<n...>::type& start_, const uint32_t limit_, Callable&& c)
@@ -170,7 +168,5 @@ public :
 	}
 };
 
-} // namespace combinations
-
 #endif // COMBINATIONS_H
 
diff --git a/third_party/cppsim/src/test/test_combinations.cpp b/third_party/cppsim/src/test/test_combinations.cpp
new file mode 100644
index 00000000..1cb2e689
--- /dev/null
+++ b/third_party/cppsim/src/test/test_combinations.cpp
@@ -0,0 +1,218 @@
+#include <algorithm>
+#include <array>
+#include <complex>
+#include <cstdlib>
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_VECTORIZE
+#include <Eigen/Dense>
+#include "combinations.h"
+
+#define add(a, b) (a + b)
+#define mul(a, b) (a * b)
+
+#define M(j, i) (m[j * 32 + i])
+
+template<class T>
+inline void kernel_core(T* psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, const T* m)
+{
+    const std::array v = {
+        psi[I],
+        psi[I + d0],
+        psi[I + d1],
+        psi[I + d0 + d1],
+        psi[I + d2],
+        psi[I + d0 + d2],
+        psi[I + d1 + d2],
+        psi[I + d0 + d1 + d2],
+        psi[I + d3],
+        psi[I + d0 + d3],
+        psi[I + d1 + d3],
+        psi[I + d0 + d1 + d3],
+        psi[I + d2 + d3],
+        psi[I + d0 + d2 + d3],
+        psi[I + d1 + d2 + d3],
+        psi[I + d0 + d1 + d2 + d3],
+        psi[I + d4],
+        psi[I + d0 + d4],
+        psi[I + d1 + d4],
+        psi[I + d0 + d1 + d4],
+        psi[I + d2 + d4],
+        psi[I + d0 + d2 + d4],
+        psi[I + d1 + d2 + d4],
+        psi[I + d0 + d1 + d2 + d4],
+        psi[I + d3 + d4],
+        psi[I + d0 + d3 + d4],
+        psi[I + d1 + d3 + d4],
+        psi[I + d0 + d1 + d3 + d4],
+        psi[I + d2 + d3 + d4],
+        psi[I + d0 + d2 + d3 + d4],
+        psi[I + d1 + d2 + d3 + d4],
+        psi[I + d0 + d1 + d2 + d3 + d4],
+    };
+
+    const auto result = Eigen::Map<const Eigen::Matrix<T, 32, 32, Eigen::RowMajor>>(m) * Eigen::Map<const Eigen::Vector<T, 32>>(v.data());
+
+    psi[I] = result[0];
+    psi[I + d0] = result[1];
+    psi[I + d1] = result[2];
+    psi[I + d0 + d1] = result[3];
+    psi[I + d2] = result[4];
+    psi[I + d0 + d2] = result[5];
+    psi[I + d1 + d2] = result[6];
+    psi[I + d0 + d1 + d2] = result[7];
+    psi[I + d3] = result[8];
+    psi[I + d0 + d3] = result[9];
+    psi[I + d1 + d3] = result[10];
+    psi[I + d0 + d1 + d3] = result[11];
+    psi[I + d2 + d3] = result[12];
+    psi[I + d0 + d2 + d3] = result[13];
+    psi[I + d1 + d2 + d3] = result[14];
+    psi[I + d0 + d1 + d2 + d3] = result[15];
+    psi[I + d4] = result[16];
+    psi[I + d0 + d4] = result[17];
+    psi[I + d1 + d4] = result[18];
+    psi[I + d0 + d1 + d4] = result[19];
+    psi[I + d2 + d4] = result[20];
+    psi[I + d0 + d2 + d4] = result[21];
+    psi[I + d1 + d2 + d4] = result[22];
+    psi[I + d0 + d1 + d2 + d4] = result[23];
+    psi[I + d3 + d4] = result[24];
+    psi[I + d0 + d3 + d4] = result[25];
+    psi[I + d1 + d3 + d4] = result[26];
+    psi[I + d0 + d1 + d3 + d4] = result[27];
+    psi[I + d2 + d3 + d4] = result[28];
+    psi[I + d0 + d2 + d3 + d4] = result[29];
+    psi[I + d1 + d2 + d3 + d4] = result[30];
+    psi[I + d0 + d1 + d2 + d3 + d4] = result[31];
+    
+}
+
+#undef add
+#undef mul
+#undef M
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template<class T>
+void kernel(T* psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, const T* m, std::size_t ctrlmask)
+{
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4;
+    std::size_t n = 1 + d0 + d1 + d2 + d3 + d4;
+    std::size_t dsorted[] = { d4, d3, d2, d1, d0 };
+    std::sort(dsorted, dsorted + 5, std::greater<std::size_t>());
+
+    if (ctrlmask == 0){
+        #pragma omp for collapse(6) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1])
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2])
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3])
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4])
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
+                            }
+        }
+    }
+    else{
+        #pragma omp for collapse(6) schedule(static)
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
+            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1])
+                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2])
+                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3])
+                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4])
+                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+                                if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
+                                    kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
+                            }
+        }
+    }
+}
+
+// Variant of kernel() with the bit indices as template arguments, listed from low to high (id0 first).
+template<unsigned id0, unsigned id1, unsigned id2, unsigned id3, unsigned id4, class T>
+void kernel_combinations(T* psi, const T* m, std::size_t ctrlmask)
+{
+    constexpr std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4;
+    constexpr std::size_t n = 1 + d0 + d1 + d2 + d3 + d4;
+    // The strides below are passed as d4..d0 without sorting, so d4 has to be the largest one.
+    static_assert(id0 < id1 && id1 < id2 && id2 < id3 && id3 < id4, "ids must be strictly ascending");
+
+    if (ctrlmask == 0){
+        // The callback gets one index per loop level; their sum is the base offset into psi.
+        Combinations::iterate<n, d4, d3, d2, d1, d0>([=](auto... i) {
+            kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m);
+        });
+    }
+    else{
+        // Same traversal, but only base offsets with every ctrlmask bit set are processed.
+        Combinations::iterate<n, d4, d3, d2, d1, d0>([=](auto... i) {
+            if (((i + ...) & ctrlmask) == ctrlmask)
+                kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m);
+        });
+    }
+}
+
+#include <array>
+#include <iostream>
+#include <random>
+
+#include "gtest/gtest.h"
+
+template<int nqubits, typename Kernels, typename V>
+bool compare(Kernels kernels, V& psi1)
+{
+	// Fixed seed: every run draws the same matrix and the same state vector.
+	std::default_random_engine dre;
+	dre.seed(0);
+	std::uniform_int_distribution<int> uid(-1000, 1000);
+
+	// Fill the (2^nqubits x 2^nqubits) matrix m with integers, row by row.
+	std::array<std::array<int, 1UL << nqubits>, 1UL << nqubits> m;
+	for (auto& row : m)
+		for (auto& element : row)
+			element = uid(dre);
+
+	// Fill psi1 with integers and keep an identical copy in psi2.
+	for (auto& amplitude : psi1)
+		amplitude = uid(dre);
+	auto psi2 = psi1;
+
+	// The control mask is fixed to zero; a random mask is not exercised.
+	std::size_t ctrlmask = 0; // uid(dre);
+
+	// Let the callable update psi1 and psi2, then find the first element where they differ.
+	kernels(psi1, psi2, m, ctrlmask);
+	auto diff = std::mismatch(psi1.begin(), psi1.end(), psi2.begin());
+	if (diff.first == psi1.end())
+		return true;
+
+	// Getting here means a mismatch exists: report its position and both values.
+	std::cout << "Mismatch in psi2 at " << std::distance(psi1.begin(), diff.first) <<
+		" : " << *(diff.first) << " != " << *(diff.second) << std::endl;
+
+	return false;
+}
+
+TEST(nointrin, kernel5)
+{
+	constexpr unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4;
+	size_t n = 1;
+	n += 1UL << id0;
+	n += 1UL << id1;
+	n += 1UL << id2;
+	n += 1UL << id3;
+	n += 1UL << id4;
+	std::vector<int> psi(n);
+	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	{
+		kernel(&psi1[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
+		kernel_combinations<id0, id1, id2, id3, id4>(&psi2[0], &m[0][0], ctrlmask);
+	},
+	psi));
+}
+
+int main(int argc, char* argv[])
+{
+	::testing::InitGoogleTest(&argc, argv);
+	return RUN_ALL_TESTS();
+}
+

From f42e628595ae601cdd9c2cfae79f0754d224b85f Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 30 Aug 2022 09:20:30 +0200
Subject: [PATCH 33/82] Importing combinations code for cpu/gpu partitioning

---
 third_party/cppsim/include/cpu/schedule.h | 115 +++++++++++++
 third_party/cppsim/include/gpu/schedule.h | 187 ++++++++++++++++++++++
 third_party/cppsim/include/partitioner.h  | 113 +++++++++++++
 third_party/cppsim/include/schedule.h     | 131 +++++++++++++++
 4 files changed, 546 insertions(+)
 create mode 100644 third_party/cppsim/include/cpu/schedule.h
 create mode 100644 third_party/cppsim/include/gpu/schedule.h
 create mode 100644 third_party/cppsim/include/partitioner.h
 create mode 100644 third_party/cppsim/include/schedule.h

diff --git a/third_party/cppsim/include/cpu/schedule.h b/third_party/cppsim/include/cpu/schedule.h
new file mode 100644
index 00000000..b03e8132
--- /dev/null
+++ b/third_party/cppsim/include/cpu/schedule.h
@@ -0,0 +1,115 @@
+#ifndef SCHEDULE_CPU_H
+#define SCHEDULE_CPU_H
+
+#include "combinations/distributed/partitioner.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <stdint.h>
+#include <sstream>
+#include <vector>
+
+namespace combinations {
+
+namespace distributed {
+
+namespace cpu {
+
+template<
+	class Contexts,
+	class Callable,
+	typename Combinations,
+	typename Earnest,
+	uint32_t ...Args // Underlying combination parameters
+>
+class Schedule
+{
+	using Combination = typename Combinations::template Combination<Args...>::type;
+
+	int nworkers;
+	uint32_t maxCombinationsPerWorker;
+	
+	std::vector<Combination> starts;
+
+	Callable c;
+
+public :
+
+	int getWorkersCount() const { return nworkers; }
+	
+	const char* getName() const { return "cpu"; }
+
+	Schedule(int nworkers_, Callable c_) :
+		nworkers(nworkers_), c(c_)
+	{
+		// Calculate workers partitions on host, which should be
+		// fast, as the iterator body is trivial. Then we re-use
+		// this schedule to perform the real iterations with a
+		// meaningful user-defined iterator body.		
+		Partitioner<Combinations, Earnest>::template partition<Args...>(
+			starts, nworkers, maxCombinationsPerWorker);
+	}
+	
+	void execute(Contexts ctxs)
+	{
+		#pragma omp parallel for
+		for (int iworker = 0; iworker < nworkers; iworker++)
+		{
+			auto& ctx = ctxs[iworker];
+			uint32_t limit = maxCombinationsPerWorker;
+			Combinations::template iterate<Args...>(starts[iworker], limit, [&](auto... args)
+			{
+				c(ctx, args...);
+			});
+		}
+	}
+};
+
+// We need to know all of the types participating in the user-defined
+// combination specialization, in order to estimate the maximum number
+// of blocks that could simultaneously fit into the GPU. By using this
+// number multiplied by the number of SMs, we partition the workload
+// most evenly.
+template<
+	class Contexts,
+	typename Combinations,
+	typename Earnest,
+	uint32_t ...Args, // Underlying combination parameters
+	class Callable
+>
+auto make_schedule(Callable c, int nworkers = 0)
+{
+	if (nworkers == 0)
+	{
+#ifdef _OPENMP
+		#pragma omp parallel
+		{
+			#pragma omp master
+			{
+				nworkers = omp_get_num_threads();
+			}
+		}
+#else
+		nworkers = 1;
+#endif
+	}
+	
+	if (nworkers <= 0) nworkers = 1;
+	
+	return Schedule<
+		Contexts,
+		Callable,
+		Combinations,
+		Earnest,
+		Args...>(nworkers, c);
+}
+
+} // namespace cpu
+
+} // namespace distributed
+
+} // namespace combinations
+
+#endif // SCHEDULE_CPU_H
+
diff --git a/third_party/cppsim/include/gpu/schedule.h b/third_party/cppsim/include/gpu/schedule.h
new file mode 100644
index 00000000..173c50d8
--- /dev/null
+++ b/third_party/cppsim/include/gpu/schedule.h
@@ -0,0 +1,187 @@
+#if defined(__CUDACC__) || defined(__HIPCC__)
+
+#ifndef SCHEDULE_GPU_H
+#define SCHEDULE_GPU_H
+
+#include "combinations/distributed/partitioner.h"
+
+#include <stdint.h>
+#include <string>
+#include <sstream>
+#if defined(__CUDACC__)
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#else
+#include <hipThrust/thrust/device_vector.h>
+#include <hipThrust/thrust/host_vector.h>
+#endif
+
+namespace combinations {
+
+namespace distributed {
+
+namespace gpu {
+
+template<
+	class Contexts,
+	class Callable,
+	class Starts,
+	typename Combinations,
+	uint32_t ...Args // Underlying combination parameters
+>
+__global__ void kernel(Contexts ctxs, Callable c,
+	int nworkers, Starts starts, uint32_t maxCombinationsPerWorker)
+{
+	int iworker = threadIdx.x + blockDim.x * blockIdx.x; // One worker per GPU thread.
+	if (iworker >= nworkers) return; // Surplus threads of the grid have no work.
+	// Each worker uses its own context and its own precomputed starting point.
+	auto& ctx = ctxs[iworker];
+	Combinations::template iterate<Args...>(
+		starts[iworker], maxCombinationsPerWorker, [&] __device__ (auto... args)
+	{
+		c(ctx, args...); // Forward the current combination to the user callback.
+	});
+}
+
+template<
+	class Contexts,
+	class Callable,
+	typename Combinations,
+	typename Earnest,
+	uint32_t ...Args // Underlying combination parameters
+>
+class Schedule
+{
+	// GPU schedule: partitions the combinations once, then launches the kernel on demand.
+	using Combination = typename Combinations::template Combination<Args...>::type;
+	int nblocks; // Number of thread blocks in the launch grid.
+	// Number of workers; the partitioner may lower it in the constructor.
+	int nworkers;
+	uint32_t maxCombinationsPerWorker;
+	// Per-worker starting combinations, kept in a device vector.
+	thrust::device_vector<Combination> starts;
+	// User-defined iteration body.
+	Callable c;
+	// Name of the GPU, as reported by the device properties.
+	std::string name;
+
+public :
+	
+	int getWorkersCount() const { return nworkers; }
+	
+	const char* getName() const { return name.c_str(); }
+	
+	Schedule(int nworkers_, int nblocks_, Callable c_) :
+		nblocks(nblocks_), nworkers(nworkers_), maxCombinationsPerWorker(0), c(c_)
+	{
+		// Calculate workers partitions on host, which should be
+		// fast, as the iterator body is trivial. Then we re-use
+		// this schedule to perform the real iterations with a
+		// meaningful user-defined iterator body.		
+		thrust::host_vector<Combination> startsHost;
+		Partitioner<Combinations, Earnest>::template partition<Args...>(
+			startsHost, nworkers, maxCombinationsPerWorker);
+		starts = startsHost;
+		
+		// Get the GPU name (the properties of device 0 are queried).
+#if defined(__CUDACC__)
+		cudaDeviceProp props;
+		::gpu::checkErrorStatus(cudaGetDeviceProperties(&props, 0));
+#elif defined(__HIPCC__)
+		hipDeviceProp_t props;
+		::gpu::checkErrorStatus(hipGetDeviceProperties(&props, 0));
+#endif
+		name = props.name;
+	}
+	// Launches one GPU thread per worker and waits until all of them finish.
+	void execute(Contexts ctxs)
+	{
+		auto startsPtr = thrust::raw_pointer_cast(starts.data());
+		kernel<
+			Contexts,
+			Callable,
+			Combination*,
+			Combinations,
+			Args...><<<nblocks, ::gpu::nthreadsPerBlock>>>(
+			ctxs, c, nworkers, startsPtr, maxCombinationsPerWorker);
+#if defined(__CUDACC__)
+		::gpu::checkErrorStatus(cudaGetLastError());
+		::gpu::checkErrorStatus(cudaDeviceSynchronize());
+#elif defined(__HIPCC__)
+		::gpu::checkErrorStatus(hipGetLastError());
+		::gpu::checkErrorStatus(hipDeviceSynchronize());
+#endif
+	}
+};
+
+// We need to know all of the types participating in the user-defined
+// combination specialization, in order to estimate the maximum number
+// of blocks that could simultaneously fit into the GPU. By using this
+// number multiplied by the number of SMs, we partition the workload
+// most evenly.
+template<
+	class Contexts,
+	typename Combinations,
+	typename Earnest,
+	uint32_t ...Args, // Underlying combination parameters
+	class Callable
+>
+auto make_schedule(Callable c, int nworkers = 0)
+{
+	using Combination = typename Combinations::template Combination<Args...>::type;
+#if defined(__CUDACC__) // Register usage diagnostic: uses the CUDA runtime API only.
+	struct cudaFuncAttributes attrs;
+	::gpu::checkErrorStatus(cudaFuncGetAttributes(&attrs,
+		kernel<Contexts, Callable, Combination*, Combinations, Args...>));
+	printf("%d registers per thread\n", attrs.numRegs);
+#endif
+	if (nworkers)
+	{
+		// Get the GPU compute grid from the user-specified
+		// number of workers (rounded up to whole blocks).
+		int nblocks = nworkers / ::gpu::nthreadsPerBlock;
+		if (nworkers % ::gpu::nthreadsPerBlock) nblocks++;
+		
+		return Schedule<
+			Contexts,
+			Callable,
+			Combinations,
+			Earnest,
+			Args...>(nworkers, nblocks, c);			
+	}
+
+	int nblocks = 0;
+	const size_t dynamicSMemSize = 0; // The kernel is launched without dynamic shared memory.
+#if defined(__CUDACC__)
+	::gpu::checkErrorStatus(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+		&nblocks, kernel<Contexts, Callable, Combination*, Combinations, Args...>,
+		::gpu::nthreadsPerBlock, dynamicSMemSize));
+	cudaDeviceProp props;
+	::gpu::checkErrorStatus(cudaGetDeviceProperties(&props, 0));
+#elif defined(__HIPCC__)
+	::gpu::checkErrorStatus(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+		&nblocks, kernel<Contexts, Callable, Combination*, Combinations, Args...>,
+		::gpu::nthreadsPerBlock, dynamicSMemSize));
+	hipDeviceProp_t props;
+	::gpu::checkErrorStatus(hipGetDeviceProperties(&props, 0));
+#endif
+	nblocks *= props.multiProcessorCount;
+	// No worker count given: use one worker per thread of every simultaneously active block.
+	return Schedule<
+		Contexts,
+		Callable,
+		Combinations,
+		Earnest,
+		Args...>(nblocks * ::gpu::nthreadsPerBlock, nblocks, c);
+}
+
+} // namespace gpu
+
+} // namespace distributed
+
+} // namespace combinations
+
+#endif // SCHEDULE_GPU_H
+
+#endif // defined(__CUDACC__) || defined(__HIPCC__)
+
diff --git a/third_party/cppsim/include/partitioner.h b/third_party/cppsim/include/partitioner.h
new file mode 100644
index 00000000..502ba9e3
--- /dev/null
+++ b/third_party/cppsim/include/partitioner.h
@@ -0,0 +1,113 @@
+#ifndef COMBINATIONS_PARTITIONER_H
+#define COMBINATIONS_PARTITIONER_H 
+
+#include "earnest/earnest.h"
+#include "earnest/sum_equal/earnest.h"
+#include "earnest/sum_less_or_equal/earnest.h"
+
+#include <cstdio>
+
+namespace combinations {
+
+namespace distributed {
+
+template<
+	class Combinations,
+	class Earnest
+>
+class Partitioner
+{
+public :
+
+	template<
+		uint32_t ...Args, // Underlying combination parameters
+		class Starts
+	>
+	static void partition(Starts& starts, int& nworkers, uint32_t& maxCombinationsPerWorker)
+	{
+		// Splits all combinations into chunks of equal size and records each chunk's start.
+		using Combination = typename Combinations::template Combination<Args...>::type;
+		if (nworkers <= 0) nworkers = 1; // Do not divide by zero below.
+		uint32_t totalNumberOfCombinations = Combinations::template popcount<Args...>();
+		maxCombinationsPerWorker = totalNumberOfCombinations / nworkers;
+		if (totalNumberOfCombinations % nworkers) maxCombinationsPerWorker++;
+		// Record starting points for workers' cooperative processing.
+		starts.reserve(nworkers);
+		uint32_t i = maxCombinationsPerWorker; // Makes the very first combination a starting point.
+		Combinations::template iterate<Args...>([&](auto... args)
+		{
+			if (i < maxCombinationsPerWorker)
+			{
+				i++; // Still inside the chunk of the current worker.
+				return;
+			}
+
+			// Combinations::iterate uses reversed order of starting point indices.
+			// We revert it here, in order to make the Combinations::iterate
+			// code more generic.
+			Combination start { args... };
+			Combinations::template reverse<Args...>(start);
+			starts.push_back(start);
+			i = 1; // The new chunk already holds this combination.
+		});
+
+		// Re-evaluate the number of workers, as their number could be eventually
+		// smaller than the initially proposed number of workers.
+		nworkers = static_cast<int>(starts.size());
+
+		printf("%u iterations in total, %u workers, %u iterations per worker\n",
+			totalNumberOfCombinations, nworkers, maxCombinationsPerWorker);
+	}
+
+	template<
+		typename Args, // Underlying combination parameters
+		class Starts
+	>
+	static void partition(Args& args, Starts& starts, int& nworkers, uint32_t& maxCombinationsPerWorker)
+	{
+		uint32_t totalNumberOfCombinations = std::apply([&](auto &&... args)
+		{
+			return Combinations::popcount(args...);
+		},
+		args);
+		maxCombinationsPerWorker = totalNumberOfCombinations / nworkers;
+		if (totalNumberOfCombinations % nworkers) maxCombinationsPerWorker++;
+
+		// Record starting points for workers' cooperative processing.
+		starts.reserve(nworkers);
+		for (uint32_t i = 0; i < totalNumberOfCombinations; i += maxCombinationsPerWorker)
+		{
+			auto start = std::apply([&](auto &&... args)
+			{
+				return Earnest::template sequence(args..., i);
+			},
+			args);
+
+			// Combinations::iterate uses reversed order of starting point indices.
+			// We revert it here, in order to make the Combinations::iterate
+			// code more generic.
+			std::apply([&](auto &&... args)
+			{
+				Combinations::template reverse(args..., start.data());
+			},
+			args);
+			for (auto element : start)
+				starts.push_back(element);
+		}
+
+		// Re-evaluate the number of workers, as their number could be eventually
+		// smaller than the initially proposed number of workers.
+		nworkers = starts.size();
+
+		printf("%u iterations in total, %u workers, %u iterations per worker\n",
+			totalNumberOfCombinations, nworkers, maxCombinationsPerWorker);
+	}
+
+};
+
+} // namespace distributed
+
+} // namespace combinations
+
+#endif // COMBINATIONS_PARTITIONER_H
+
diff --git a/third_party/cppsim/include/schedule.h b/third_party/cppsim/include/schedule.h
new file mode 100644
index 00000000..14e16ed8
--- /dev/null
+++ b/third_party/cppsim/include/schedule.h
@@ -0,0 +1,131 @@
+#ifndef COMBINATIONS_SCHEDULE_H
+#define COMBINATIONS_SCHEDULE_H
+
+#include "combinations/sum_equal/combinations.h"
+#include "combinations/sum_less_or_equal/combinations.h"
+#include "earnest/sum_equal/earnest.h"
+#include "earnest/sum_less_or_equal/earnest.h"
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include "combinations/distributed/gpu/schedule.h"
+#endif
+#include "combinations/distributed/cpu/schedule.h"
+
+enum BackendPreference
+{
+	BackendNoPreference = 0, // GPU when compiled with CUDA/HIP, CPU otherwise.
+	BackendPreferCPU = 1, // Always the CPU (OpenMP) backend.
+	BackendPreferDiscreteGPU = 2, // GPU backend; available in CUDA/HIP builds only.
+	BackendPreferIntegratedGPU = 3
+};
+
+namespace combinations {
+
+namespace distributed {
+
+namespace detail {
+
+template<
+	typename CombinationsT,
+	typename Earnest,
+	BackendPreference backend = BackendNoPreference>
+class Schedule
+{
+public :
+
+	using Combinations = CombinationsT;
+
+	// Iterate through combinations with specific starting point and duration.
+	// For each combination, call a user-provided function.
+	template<
+		class Contexts, // Indexable by worker: element iworker is handed to worker iworker.
+		uint32_t ...Args, // Underlying combination parameters
+		class Callable
+	>
+	static auto schedule(Callable c, int nworkers = 0) // nworkers == 0: the backend chooses.
+	{
+		if constexpr (backend == BackendNoPreference)
+		{
+			// Prioritize GPU execution, if supported.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+			return combinations::distributed::gpu::make_schedule<
+				Contexts,
+				Combinations,
+				Earnest,
+				Args...>(c, nworkers);
+#else
+			return combinations::distributed::cpu::make_schedule<
+				Contexts,
+				Combinations,
+				Earnest,
+				Args...>(c, nworkers);
+#endif
+		}
+#if defined(__CUDACC__) || defined(__HIPCC__)
+		else if constexpr (backend == BackendPreferDiscreteGPU)
+		{
+			return combinations::distributed::gpu::make_schedule<
+				Contexts,
+				Combinations,
+				Earnest,
+				Args...>(c, nworkers);
+		}
+#endif
+		else if constexpr (backend == BackendPreferCPU)
+		{
+			return combinations::distributed::cpu::make_schedule<
+				Contexts,
+				Combinations,
+				Earnest,
+				Args...>(c, nworkers);
+		}
+		else
+		{
+			// Reached for BackendPreferIntegratedGPU, and for BackendPreferDiscreteGPU without CUDA/HIP.
+			throw std::invalid_argument("Unsupported backend");
+		}
+	}
+	// Executes a schedule prepared by schedule(), handing the contexts to its workers.
+	template<typename Contexts, typename Schedule>
+	static void iterate(Contexts& ctxs, Schedule& schedule)
+	{
+		schedule.execute(ctxs);
+	}
+};
+
+} // namespace detail
+
+template<
+	BackendPreference backend = BackendNoPreference>
+using Schedule = detail::Schedule<
+	combinations::Combinations,
+	earnest::Earnest,
+	backend>;
+
+namespace sum_equal {
+
+template<
+	BackendPreference backend = BackendNoPreference>
+using Schedule = detail::Schedule<
+	combinations::sum_equal::Combinations,
+	earnest::sum_equal::Earnest,
+	backend>;
+
+} // namespace sum_equal
+
+namespace sum_less_or_equal {
+
+template<
+	BackendPreference backend = BackendNoPreference>
+using Schedule = detail::Schedule<
+	combinations::sum_less_or_equal::Combinations,
+	earnest::sum_less_or_equal::Earnest,
+	backend>;
+
+} // namespace sum_equal
+
+} // namespace distributed
+
+} // namespace combinations
+
+#endif // COMBINATIONS_SCHEDULE_H
+

From b3fec7ad46abda733f30083b37457f0058178833 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 30 Aug 2022 16:49:10 +0200
Subject: [PATCH 34/82] Simplifying partitioning code to handle combinations
 specific to cppsim kernels

---
 third_party/cppsim/.gitmodules            |  3 ++
 third_party/cppsim/ThirdParty/hipThrust   |  1 +
 third_party/cppsim/include/cpu/schedule.h | 18 +------
 third_party/cppsim/include/gpu/schedule.h | 21 +-------
 third_party/cppsim/include/partitioner.h  | 61 +---------------------
 third_party/cppsim/include/schedule.h     | 62 ++---------------------
 6 files changed, 12 insertions(+), 154 deletions(-)
 create mode 160000 third_party/cppsim/ThirdParty/hipThrust

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
index 67d6ff8b..dd9d601a 100644
--- a/third_party/cppsim/.gitmodules
+++ b/third_party/cppsim/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "ThirdParty/eigen"]
 	path = ThirdParty/eigen
 	url = https://gitlab.com/libeigen/eigen.git
+[submodule "ThirdParty/hipThrust"]
+	path = ThirdParty/hipThrust
+	url = https://github.com/dmikushin/Thrust.git
diff --git a/third_party/cppsim/ThirdParty/hipThrust b/third_party/cppsim/ThirdParty/hipThrust
new file mode 160000
index 00000000..9a12c125
--- /dev/null
+++ b/third_party/cppsim/ThirdParty/hipThrust
@@ -0,0 +1 @@
+Subproject commit 9a12c1259805ed0a3a5fe9bdeb098a872deb936b
diff --git a/third_party/cppsim/include/cpu/schedule.h b/third_party/cppsim/include/cpu/schedule.h
index b03e8132..5dde71d8 100644
--- a/third_party/cppsim/include/cpu/schedule.h
+++ b/third_party/cppsim/include/cpu/schedule.h
@@ -1,7 +1,7 @@
 #ifndef SCHEDULE_CPU_H
 #define SCHEDULE_CPU_H
 
-#include "combinations/distributed/partitioner.h"
+#include "partitioner.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -10,17 +10,11 @@
 #include <sstream>
 #include <vector>
 
-namespace combinations {
-
-namespace distributed {
-
 namespace cpu {
 
 template<
 	class Contexts,
 	class Callable,
-	typename Combinations,
-	typename Earnest,
 	uint32_t ...Args // Underlying combination parameters
 >
 class Schedule
@@ -47,7 +41,7 @@ public :
 		// fast, as the iterator body is trivial. Then we re-use
 		// this schedule to perform the real iterations with a
 		// meaningful user-defined iterator body.		
-		Partitioner<Combinations, Earnest>::template partition<Args...>(
+		Partitioner::template partition<Args...>(
 			starts, nworkers, maxCombinationsPerWorker);
 	}
 	
@@ -73,8 +67,6 @@ public :
 // most evenly.
 template<
 	class Contexts,
-	typename Combinations,
-	typename Earnest,
 	uint32_t ...Args, // Underlying combination parameters
 	class Callable
 >
@@ -100,16 +92,10 @@ auto make_schedule(Callable c, int nworkers = 0)
 	return Schedule<
 		Contexts,
 		Callable,
-		Combinations,
-		Earnest,
 		Args...>(nworkers, c);
 }
 
 } // namespace cpu
 
-} // namespace distributed
-
-} // namespace combinations
-
 #endif // SCHEDULE_CPU_H
 
diff --git a/third_party/cppsim/include/gpu/schedule.h b/third_party/cppsim/include/gpu/schedule.h
index 173c50d8..3030ff97 100644
--- a/third_party/cppsim/include/gpu/schedule.h
+++ b/third_party/cppsim/include/gpu/schedule.h
@@ -3,7 +3,7 @@
 #ifndef SCHEDULE_GPU_H
 #define SCHEDULE_GPU_H
 
-#include "combinations/distributed/partitioner.h"
+#include "partitioner.h"
 
 #include <stdint.h>
 #include <string>
@@ -16,17 +16,12 @@
 #include <hipThrust/thrust/host_vector.h>
 #endif
 
-namespace combinations {
-
-namespace distributed {
-
 namespace gpu {
 
 template<
 	class Contexts,
 	class Callable,
 	class Starts,
-	typename Combinations,
 	uint32_t ...Args // Underlying combination parameters
 >
 __global__ void kernel(Contexts ctxs, Callable c,
@@ -46,8 +41,6 @@ __global__ void kernel(Contexts ctxs, Callable c,
 template<
 	class Contexts,
 	class Callable,
-	typename Combinations,
-	typename Earnest,
 	uint32_t ...Args // Underlying combination parameters
 >
 class Schedule
@@ -79,7 +72,7 @@ public :
 		// this schedule to perform the real iterations with a
 		// meaningful user-defined iterator body.		
 		thrust::host_vector<Combination> startsHost;
-		Partitioner<Combinations, Earnest>::template partition<Args...>(
+		Partitioner::template partition<Args...>(
 			startsHost, nworkers, maxCombinationsPerWorker);
 		starts = startsHost;
 		
@@ -121,8 +114,6 @@ public :
 // most evenly.
 template<
 	class Contexts,
-	typename Combinations,
-	typename Earnest,
 	uint32_t ...Args, // Underlying combination parameters
 	class Callable
 >
@@ -145,8 +136,6 @@ auto make_schedule(Callable c, int nworkers = 0)
 		return Schedule<
 			Contexts,
 			Callable,
-			Combinations,
-			Earnest,
 			Args...>(nworkers, nblocks, c);			
 	}
 
@@ -170,17 +159,11 @@ auto make_schedule(Callable c, int nworkers = 0)
 	return Schedule<
 		Contexts,
 		Callable,
-		Combinations,
-		Earnest,
 		Args...>(nblocks * ::gpu::nthreadsPerBlock, nblocks, c);
 }
 
 } // namespace gpu
 
-} // namespace distributed
-
-} // namespace combinations
-
 #endif // SCHEDULE_GPU_H
 
 #endif // defined(__CUDACC__) || defined(__HIPCC__)
diff --git a/third_party/cppsim/include/partitioner.h b/third_party/cppsim/include/partitioner.h
index 502ba9e3..20e5d111 100644
--- a/third_party/cppsim/include/partitioner.h
+++ b/third_party/cppsim/include/partitioner.h
@@ -1,20 +1,10 @@
 #ifndef COMBINATIONS_PARTITIONER_H
 #define COMBINATIONS_PARTITIONER_H 
 
-#include "earnest/earnest.h"
-#include "earnest/sum_equal/earnest.h"
-#include "earnest/sum_less_or_equal/earnest.h"
+#include "combinations.h"
 
 #include <cstdio>
 
-namespace combinations {
-
-namespace distributed {
-
-template<
-	class Combinations,
-	class Earnest
->
 class Partitioner
 {
 public :
@@ -58,56 +48,7 @@ public :
 		printf("%u iterations in total, %u workers, %u iterations per worker\n",
 			totalNumberOfCombinations, nworkers, maxCombinationsPerWorker);
 	}
-
-	template<
-		typename Args, // Underlying combination parameters
-		class Starts
-	>
-	static void partition(Args& args, Starts& starts, int& nworkers, uint32_t& maxCombinationsPerWorker)
-	{
-		uint32_t totalNumberOfCombinations = std::apply([&](auto &&... args)
-		{
-			return Combinations::popcount(args...);
-		},
-		args);
-		maxCombinationsPerWorker = totalNumberOfCombinations / nworkers;
-		if (totalNumberOfCombinations % nworkers) maxCombinationsPerWorker++;
-
-		// Record starting points for workers' cooperative processing.
-		starts.reserve(nworkers);
-		for (uint32_t i = 0; i < totalNumberOfCombinations; i += maxCombinationsPerWorker)
-		{
-			auto start = std::apply([&](auto &&... args)
-			{
-				return Earnest::template sequence(args..., i);
-			},
-			args);
-
-			// Combinations::iterate uses reversed order of starting point indices.
-			// We revert it here, in order to make the Combinations::iterate
-			// code more generic.
-			std::apply([&](auto &&... args)
-			{
-				Combinations::template reverse(args..., start.data());
-			},
-			args);
-			for (auto element : start)
-				starts.push_back(element);
-		}
-
-		// Re-evaluate the number of workers, as their number could be eventually
-		// smaller than the initially proposed number of workers.
-		nworkers = starts.size();
-
-		printf("%u iterations in total, %u workers, %u iterations per worker\n",
-			totalNumberOfCombinations, nworkers, maxCombinationsPerWorker);
-	}
-
 };
 
-} // namespace distributed
-
-} // namespace combinations
-
 #endif // COMBINATIONS_PARTITIONER_H
 
diff --git a/third_party/cppsim/include/schedule.h b/third_party/cppsim/include/schedule.h
index 14e16ed8..74586134 100644
--- a/third_party/cppsim/include/schedule.h
+++ b/third_party/cppsim/include/schedule.h
@@ -1,15 +1,12 @@
 #ifndef COMBINATIONS_SCHEDULE_H
 #define COMBINATIONS_SCHEDULE_H
 
-#include "combinations/sum_equal/combinations.h"
-#include "combinations/sum_less_or_equal/combinations.h"
-#include "earnest/sum_equal/earnest.h"
-#include "earnest/sum_less_or_equal/earnest.h"
+#include "combinations.h"
 
 #if defined(__CUDACC__) || defined(__HIPCC__)
-#include "combinations/distributed/gpu/schedule.h"
+#include "gpu/schedule.h"
 #endif
-#include "combinations/distributed/cpu/schedule.h"
+#include "cpu/schedule.h"
 
 enum BackendPreference
 {
@@ -19,22 +16,12 @@ enum BackendPreference
 	BackendPreferIntegratedGPU = 3
 };
 
-namespace combinations {
-
-namespace distributed {
-
-namespace detail {
-
 template<
-	typename CombinationsT,
-	typename Earnest,
 	BackendPreference backend = BackendNoPreference>
 class Schedule
 {
 public :
 
-	using Combinations = CombinationsT;
-
 	// Iterate through combinations with specific starting point and duration.
 	// For each combination, call a user-provided function.
 	template<
@@ -50,14 +37,10 @@ public :
 #if defined(__CUDACC__) || defined(__HIPCC__)
 			return combinations::distributed::gpu::make_schedule<
 				Contexts,
-				Combinations,
-				Earnest,
 				Args...>(c, nworkers);
 #else
 			return combinations::distributed::cpu::make_schedule<
 				Contexts,
-				Combinations,
-				Earnest,
 				Args...>(c, nworkers);
 #endif
 		}
@@ -66,8 +49,6 @@ public :
 		{
 			return combinations::distributed::gpu::make_schedule<
 				Contexts,
-				Combinations,
-				Earnest,
 				Args...>(c, nworkers);
 		}
 #endif
@@ -75,8 +56,6 @@ public :
 		{
 			return combinations::distributed::cpu::make_schedule<
 				Contexts,
-				Combinations,
-				Earnest,
 				Args...>(c, nworkers);
 		}
 		else
@@ -92,40 +71,5 @@ public :
 	}
 };
 
-} // namespace detail
-
-template<
-	BackendPreference backend = BackendNoPreference>
-using Schedule = detail::Schedule<
-	combinations::Combinations,
-	earnest::Earnest,
-	backend>;
-
-namespace sum_equal {
-
-template<
-	BackendPreference backend = BackendNoPreference>
-using Schedule = detail::Schedule<
-	combinations::sum_equal::Combinations,
-	earnest::sum_equal::Earnest,
-	backend>;
-
-} // namespace sum_equal
-
-namespace sum_less_or_equal {
-
-template<
-	BackendPreference backend = BackendNoPreference>
-using Schedule = detail::Schedule<
-	combinations::sum_less_or_equal::Combinations,
-	earnest::sum_less_or_equal::Earnest,
-	backend>;
-
-} // namespace sum_equal
-
-} // namespace distributed
-
-} // namespace combinations
-
 #endif // COMBINATIONS_SCHEDULE_H
 

From 77ca828d2094e0758e2da2a95168e9b9eb0aedb4 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 31 Aug 2022 11:25:26 +0200
Subject: [PATCH 35/82] Adding popcount implementation and a correctness test

---
 third_party/cppsim/CMakeLists.txt             |  8 +++-
 third_party/cppsim/include/combinations.h     | 27 +++++++++---
 third_party/cppsim/src/test/test_popcount.cpp | 43 +++++++++++++++++++
 3 files changed, 70 insertions(+), 8 deletions(-)
 create mode 100644 third_party/cppsim/src/test/test_popcount.cpp

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 1975d6ca..0ed3340f 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -58,11 +58,17 @@ kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
 kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
 
+add_executable(test_popcount "src/test/test_popcount.cpp")
+set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
+target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
+target_link_libraries(test_popcount PRIVATE gtest)
+
 add_executable(test_combinations "src/test/test_combinations.cpp")
 set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
-target_link_libraries(test_combinations PRIVATE gtest kernelgen)
+target_link_libraries(test_combinations PRIVATE gtest)
 
 add_executable(benchmark "src/benchmark/benchmark.cpp")
 target_link_libraries(benchmark PRIVATE gtest kernelgen)
diff --git a/third_party/cppsim/include/combinations.h b/third_party/cppsim/include/combinations.h
index 68e08c0d..7698fb16 100644
--- a/third_party/cppsim/include/combinations.h
+++ b/third_party/cppsim/include/combinations.h
@@ -77,16 +77,29 @@ public :
 		return sizeof...(n);
 	}
 
+	// Recursion tail: the innermost loop of the iterator visits every
+	// index in [0, n0), so it supplies exactly n0 combinations.
+	template<uint32_t n0>
+	static uint32_t _popcount()
+	{
+		return n0;
+	}
+
+	// A loop level walks [0, n0) with the step 2 * n1 and runs all nested levels on
+	// each iteration. Round up: a partial last step (e.g. n0 < 2 * n1) still iterates.
+	template<uint32_t n0, uint32_t n1, uint32_t ...n>
+	static uint32_t _popcount()
+	{
+		return (n0 / (2 * n1) + (n0 % (2 * n1) != 0)) * _popcount<n1, n...>();
+	}
+
 	// Tell the number of combinations supplied by the iterator
 	// configured with the given set of template parameters.
 	template<uint32_t ...n>
 	static uint32_t popcount()
 	{
 		if (sizeof...(n) == 0) return 0;
-
-		uint32_t result = 1;
-		// TODO
-		return result;
+		return _popcount<n...>();
 	}
 
 	// Reverse the order of elements in a combination
@@ -109,7 +122,7 @@ public :
 	GPU_SUPPORT
 	static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
 	{
-		for (uint32_t i = start; (i < n0) && limit; i += 2 * n1)
+		for (uint32_t i = *start; (i < n0) && limit; i += 2 * n1)
 		{
 			// Flush starting point to zero, in order for all subsequent iterations
 			// to start from zero as usual.
@@ -131,7 +144,7 @@ public :
 	GPU_SUPPORT
 	static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c)
 	{
-		for (uint32_t i = start; (i < n0) && limit; i++)
+		for (uint32_t i = *start; (i < n0) && limit; i++)
 		{
 			// Flush starting point to zero, in order for all subsequent iterations
 			// to start from zero as usual.
@@ -163,7 +176,7 @@ public :
 	template<uint32_t ...n>
 	static uint32_t popcount(const uint32_t limit)
 	{
-		// TODO Actually could be less than limit, if start is closer to the end.
+		// XXX Actually could be less than limit, if start is closer to the end.
 		return limit;
 	}
 };
diff --git a/third_party/cppsim/src/test/test_popcount.cpp b/third_party/cppsim/src/test/test_popcount.cpp
new file mode 100644
index 00000000..5688e2e6
--- /dev/null
+++ b/third_party/cppsim/src/test/test_popcount.cpp
@@ -0,0 +1,43 @@
+#include <algorithm>
+#include <array>
+#include <complex>
+#include <cstdlib>
+#include "combinations.h"
+
+#include "gtest/gtest.h"
+
+// Brute-force reference: counts the iterations of the nested loops that a 5-qubit kernel
+// runs over n = 1 + d0 + ... + d4 elements, where d = 1 << id and strides are sorted descending.
+size_t popcount_reference(unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0)
+{
+    const std::size_t d0 = std::size_t{1} << id0, d1 = std::size_t{1} << id1, d2 = std::size_t{1} << id2, d3 = std::size_t{1} << id3, d4 = std::size_t{1} << id4; // "1UL" is 32-bit on LLP64
+    std::size_t n = 1 + d0 + d1 + d2 + d3 + d4;
+    std::size_t dsorted[] = { d4, d3, d2, d1, d0 };
+    std::sort(dsorted, dsorted + 5, [](std::size_t a, std::size_t b) { return a > b; }); // No need for <functional>.
+    size_t popcount = 0;
+    for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0])
+        for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1])
+            for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2])
+                for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3])
+                    for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4])
+                        for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5)
+                            popcount++;
+    return popcount;
+}
+
+TEST(popcount, kernel5)
+{
+	constexpr unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4; // The five lowest bits.
+	constexpr std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4;
+	constexpr std::size_t n = 1 + d0 + d1 + d2 + d3 + d4; // == 32 here: a multiple of every loop step.
+	auto popcount1 = popcount_reference(id4, id3, id2, id1, id0); // Brute-force loop count.
+	auto popcount2 = Combinations::popcount<n, d4, d3, d2, d1, d0>(); // Count under test.
+	ASSERT_EQ(popcount1, popcount2);
+}
+
+int main(int argc, char* argv[]) // Test runner entry point.
+{
+	::testing::InitGoogleTest(&argc, argv); // Hand the command line to googletest first.
+	return RUN_ALL_TESTS(); // The exit code reflects the result of the test run.
+}
+

From a804e501a2bd73445cb0552bf6cde2563311369c Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 31 Aug 2022 17:28:08 +0200
Subject: [PATCH 36/82] Fixing combinations partitioning code, adding
 partitioned version into the test case

---
 third_party/cppsim/CMakeLists.txt             |  2 +-
 third_party/cppsim/include/schedule.h         |  8 +--
 .../cppsim/src/test/test_combinations.cpp     | 67 +++++++++++++++++--
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 0ed3340f..ec01ab05 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -68,7 +68,7 @@ add_executable(test_combinations "src/test/test_combinations.cpp")
 set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
-target_link_libraries(test_combinations PRIVATE gtest)
+target_link_libraries(test_combinations PRIVATE gtest OpenMP::OpenMP_CXX)
 
 add_executable(benchmark "src/benchmark/benchmark.cpp")
 target_link_libraries(benchmark PRIVATE gtest kernelgen)
diff --git a/third_party/cppsim/include/schedule.h b/third_party/cppsim/include/schedule.h
index 74586134..141f2461 100644
--- a/third_party/cppsim/include/schedule.h
+++ b/third_party/cppsim/include/schedule.h
@@ -35,11 +35,11 @@ public :
 		{
 			// Prioritize GPU execution, if supported.
 #if defined(__CUDACC__) || defined(__HIPCC__)
-			return combinations::distributed::gpu::make_schedule<
+			return gpu::make_schedule<
 				Contexts,
 				Args...>(c, nworkers);
 #else
-			return combinations::distributed::cpu::make_schedule<
+			return cpu::make_schedule<
 				Contexts,
 				Args...>(c, nworkers);
 #endif
@@ -47,14 +47,14 @@ public :
 #if defined(__CUDACC__) || defined(__HIPCC__)
 		else if constexpr (backend == BackendPreferDiscreteGPU)
 		{
-			return combinations::distributed::gpu::make_schedule<
+			return gpu::make_schedule<
 				Contexts,
 				Args...>(c, nworkers);
 		}
 #endif
 		else if constexpr (backend == BackendPreferCPU)
 		{
-			return combinations::distributed::cpu::make_schedule<
+			return cpu::make_schedule<
 				Contexts,
 				Args...>(c, nworkers);
 		}
diff --git a/third_party/cppsim/src/test/test_combinations.cpp b/third_party/cppsim/src/test/test_combinations.cpp
index 1cb2e689..6f1afe93 100644
--- a/third_party/cppsim/src/test/test_combinations.cpp
+++ b/third_party/cppsim/src/test/test_combinations.cpp
@@ -151,6 +151,54 @@ void kernel_combinations(T* psi, const T* m, std::size_t ctrlmask)
     }
 }
 
+#include "schedule.h"
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+template<unsigned id0, unsigned id1, unsigned id2, unsigned id3, unsigned id4, class T>
+void kernel_combinations_partitioned(T* psi, const T* m, std::size_t ctrlmask)
+{
+    constexpr std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4;
+    constexpr std::size_t n = 1 + d0 + d1 + d2 + d3 + d4;
+
+    if (ctrlmask == 0){
+        // Here we do the "planning" of execution, not the execution itself.
+        // We do already specify though an iteration loop body, in order
+        // for the backend to make the resources allocation.
+        auto backend = Schedule<BackendPreferCPU>::template schedule<uint32_t*, n, d4, d3, d2, d1, d0>(
+            [=](uint32_t& count_worker, auto... i)
+        {
+            kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m);
+        });
+
+        printf("Using %s backend with %d workers\n",
+            backend.getName(), backend.getWorkersCount());
+
+        // Finally, execute the iterations. Every worker gets a real context slot,
+        std::vector<uint32_t> contexts(backend.getWorkersCount()); // not a null one.
+        uint32_t* ptr = contexts.data();
+        Schedule<BackendPreferCPU>::iterate(ptr, backend);
+    }
+    else{
+        // Here we do the "planning" of execution, not the execution itself.
+        // We do already specify though an iteration loop body, in order
+        // for the backend to make the resources allocation.
+        auto backend = Schedule<BackendPreferCPU>::template schedule<uint32_t*, n, d4, d3, d2, d1, d0>(
+            [=](uint32_t& count_worker, auto... i)
+        {
+            if (((i + ...) & ctrlmask) == ctrlmask)
+                kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m);
+        });
+
+        printf("Using %s backend with %d workers\n",
+            backend.getName(), backend.getWorkersCount());
+
+        // Finally, execute the iterations. Every worker gets a real context slot,
+        std::vector<uint32_t> contexts(backend.getWorkersCount()); // not a null one.
+        uint32_t* ptr = contexts.data();
+        Schedule<BackendPreferCPU>::iterate(ptr, backend);
+    }
+}
+
 #include <array>
 #include <iostream>
 #include <random>
@@ -180,14 +228,18 @@ bool compare(Kernels kernels, V& psi1)
 	std::size_t ctrlmask = 0; // uid(dre);
 
 	// Compare kernel against generated kernel.
-	kernels(psi1, psi2, m, ctrlmask);
-	auto diff = std::mismatch(psi1.begin(), psi1.end(), psi2.begin());
-	if (diff.first == psi1.end())
+	kernels(psi1, psi2, psi3, m, ctrlmask);
+	auto diff2 = std::mismatch(psi1.begin(), psi1.end(), psi2.begin());
+	auto diff3 = std::mismatch(psi1.begin(), psi1.end(), psi3.begin());
+	if ((diff2.first == psi1.end()) && (diff3.first == psi1.end()))
 		return true;
 
-	if (diff.first != psi1.end())
-		std::cout << "Mismatch in psi2 at " << std::distance(psi1.begin(), diff.first) <<
-			" : " << *(diff.first) << " != " << *(diff.second) << std::endl;
+	if (diff2.first != psi1.end())
+		std::cout << "Mismatch in psi2 at " << std::distance(psi1.begin(), diff2.first) <<
+			" : " << *(diff2.first) << " != " << *(diff2.second) << std::endl;
+	if (diff3.first != psi1.end())
+		std::cout << "Mismatch in psi3 at " << std::distance(psi1.begin(), diff3.first) <<
+			" : " << *(diff3.first) << " != " << *(diff3.second) << std::endl;
 
 	return false;
 }
@@ -202,10 +254,11 @@ TEST(nointrin, kernel5)
 	n += 1UL << id3;
 	n += 1UL << id4;
 	std::vector<int> psi(n);
-	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto m, auto ctrlmask)
+	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(&psi1[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
 		kernel_combinations<id0, id1, id2, id3, id4>(&psi2[0], &m[0][0], ctrlmask);
+		kernel_combinations_partitioned<id0, id1, id2, id3, id4>(&psi3[0], &m[0][0], ctrlmask);
 	},
 	psi));
 }

From 7b9548cf7fc744da1c2b4736f0adf33654a50deb Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 1 Sep 2022 16:43:34 +0200
Subject: [PATCH 37/82] Adding the missing compiler.h

---
 third_party/cppsim/include/compiler.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 third_party/cppsim/include/compiler.h

diff --git a/third_party/cppsim/include/compiler.h b/third_party/cppsim/include/compiler.h
new file mode 100644
index 00000000..f251bf9d
--- /dev/null
+++ b/third_party/cppsim/include/compiler.h
@@ -0,0 +1,19 @@
+#ifndef COMPILER_H
+#define COMPILER_H
+
+#include <string>
+
+class Compiler
+{
+public :
+	// Returns an opaque void*; errmsg is presumably an output for error text (TODO confirm).
+	void* codegen(int nqubits, const std::string& source, std::string& errmsg);
+
+	Compiler();
+
+};
+
+Compiler& get_compiler();
+
+#endif // COMPILER_H
+

From eec35874d79ccc31c3aa48027c3c58469e438d9a Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Thu, 1 Sep 2022 19:19:04 +0200
Subject: [PATCH 38/82] Use literals to identify named argument ids

---
 third_party/cppsim/src/kernelgen.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/cppsim/src/kernelgen.cpp b/third_party/cppsim/src/kernelgen.cpp
index 5fa91f91..ecd958df 100644
--- a/third_party/cppsim/src/kernelgen.cpp
+++ b/third_party/cppsim/src/kernelgen.cpp
@@ -7,6 +7,7 @@
 #include <pybind11/stl.h>
 
 namespace py = pybind11;
+using namespace pybind11::literals;
 
 std::string KernelGen::generate(int nqubits, unsigned* ids)
 {
@@ -25,7 +26,7 @@ std::string KernelGen::generate(int nqubits, unsigned* ids)
 		{
 			std::vector<unsigned> vids;
 			vids.assign(ids, ids + nqubits);
-			auto source = globals["kernelgen"](nqubits, vids).cast<std::string>();
+			auto source = globals["kernelgen"](nqubits, "ids"_a = vids).cast<std::string>();
 			return source;
 		}
 		else

From f90b685c5f2ea77de9bdeec51005c0c8e1673f8d Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Fri, 2 Sep 2022 16:53:44 +0200
Subject: [PATCH 39/82] Adding an option to substitute the M matrix directly
 into the runtime-compiled kernel

---
 .../cppsim/include/nointrin/kernelgen.py      | 37 ++++++++++++++-----
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 78376db1..d84ca53f 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -3,7 +3,13 @@
 import itertools
 import os
 
-def kernelgen(nqubits, ids=None, matvec=True, combinations=False):
+def kernelgen(nqubits, ids=None, m=None, matvec=True, combinations=False):
+    # If m matrix is given explicitly, do not use matrix-vector form
+    # of operation, because we expect many terms to be removed, due to
+    # multiplication by zero m element.
+    if not (m is None):
+        matvec = False
+
     # All combinations of qubits, excluding dupes, e.g. for nqubits = 2:
     # 0 0
     # 1 0
@@ -25,15 +31,28 @@ def kernelgen(nqubits, ids=None, matvec=True, combinations=False):
     left = '_'
     right = ''
     if matvec:
-    	left = '['
-    	right = ']'
+        left = '['
+        right = ']'
 
     # Pretty-print the right hand sides (recursively).
-    def rhs(n, j, i):
-        if i < n - 1:
-            return f'add(mul(v{left}{i}{right}, M({j}, {i})), ' + rhs(n, j, i + 1)
-        else:
-            return f'mul(v{left}{i}{right}, M({j}, {i})' + ''.join(')' for k in range(0, n))
+    if m is None:
+        def rhs(n, j, i):
+            if i < n - 1:
+                return f'add(mul(v{left}{i}{right}, M({j}, {i})), ' + rhs(n, j, i + 1) + ''.join(')')
+            else:
+                return f'mul(v{left}{i}{right}, M({j}, {i})'
+    else:
+        def rhs(n, j, i):
+            if i < n - 1:
+                if m(j, i) != 0:
+                    return f'add(mul(v{left}{i}{right}, M({j}, {i})), ' + rhs(n, j, i + 1) + ''.join(')')
+                else:
+                    return rhs(n, j, i + 1)
+            else:
+                if m(j, i) != 0:
+                    return f'mul(v{left}{i}{right}, M({j}, {i})'
+                else:
+                    return 0
 
     strrhs = [] 
     for j in range(0, len(strcombs)):
@@ -41,7 +60,7 @@ def rhs(n, j, i):
 
     ids_sorted = []
     if ids != None:
-    	ids_sorted = sorted(ids, reverse = True)
+        ids_sorted = sorted(ids, reverse = True)
 
     # Some string constants clash with the {} syntax of print(), so we
     # substitute them as constants.

From f0b8f25f3f89edb43d0a15180b14e2ddd624799f Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 6 Sep 2022 11:55:16 +0200
Subject: [PATCH 40/82] Fixing the missing vids vector length spec

Co-authored-by: Damien Nguyen <damien1@huawei.com>
---
 third_party/cppsim/src/kernelgen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/src/kernelgen.cpp b/third_party/cppsim/src/kernelgen.cpp
index ecd958df..44a9c9e7 100644
--- a/third_party/cppsim/src/kernelgen.cpp
+++ b/third_party/cppsim/src/kernelgen.cpp
@@ -24,7 +24,7 @@ std::string KernelGen::generate(int nqubits, unsigned* ids)
 		py::eval<py::eval_statements>(nointrin, globals, globals);
 		if (ids)
 		{
-			std::vector<unsigned> vids;
+			std::vector<unsigned> vids(nqubits);
 			vids.assign(ids, ids + nqubits);
 			auto source = globals["kernelgen"](nqubits, "ids"_a = vids).cast<std::string>();
 			return source;

From 1d14f9d2f51b45789e0c3eb0282596404ff8933e Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 6 Sep 2022 14:56:36 +0200
Subject: [PATCH 41/82] Correcting how the PSI length should be
 computed: it must be 2^(max_id + 1)

Co-authored-by: Damien Nguyen <damien1@huawei.com>
---
 third_party/cppsim/src/test/test_nointrin.cpp | 45 +++++++------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/third_party/cppsim/src/test/test_nointrin.cpp
index 09293eec..803455cc 100644
--- a/third_party/cppsim/src/test/test_nointrin.cpp
+++ b/third_party/cppsim/src/test/test_nointrin.cpp
@@ -61,15 +61,14 @@ bool compare(Kernels kernels, V& psi1)
 
 TEST(nointrin, kernel1)
 {
-	unsigned id0 = 0;
-	size_t n = 1;
-	n += 1UL << id0;
+	unsigned id0 = 1;
+	std::array ids { id0 };
+	size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1);
 	std::vector<int> psi(n);
 	ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id0, m, ctrlmask);
 		generated_kernel(&psi2[0], id0, &m[0][0], ctrlmask);
-		std::array ids { id0 };
 		kernelgen(psi3, ids, m , ctrlmask);
 	},
 	psi));
@@ -77,16 +76,14 @@ TEST(nointrin, kernel1)
 
 TEST(nointrin, kernel2)
 {
-	unsigned id0 = 0, id1 = 1;
-	size_t n = 1;
-	n += 1UL << id0;
-	n += 1UL << id1;
+	unsigned id0 = 1, id1 = 3;
+	std::array ids { id0, id1 };
+	size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1);
 	std::vector<int> psi(n);
 	ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id1, id0, m, ctrlmask);
 		generated_kernel(&psi2[0], id1, id0, &m[0][0], ctrlmask);
-		std::array ids { id0, id1 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
@@ -94,17 +91,14 @@ TEST(nointrin, kernel2)
 
 TEST(nointrin, kernel3)
 {
-	unsigned id0 = 0, id1 = 1, id2 = 2;
-	size_t n = 1;
-	n += 1UL << id0;
-	n += 1UL << id1;
-	n += 1UL << id2;
+	unsigned id0 = 1, id1 = 3, id2 = 5;
+	std::array ids { id0, id1, id2 };
+	size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1);
 	std::vector<int> psi(n);
 	ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id2, id1, id0, m, ctrlmask);
 		generated_kernel(&psi2[0], id2, id1, id0, &m[0][0], ctrlmask);
-		std::array ids { id0, id1, id2 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
@@ -112,18 +106,14 @@ TEST(nointrin, kernel3)
 
 TEST(nointrin, kernel4)
 {
-	unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3;
-	size_t n = 1;
-	n += 1UL << id0;
-	n += 1UL << id1;
-	n += 1UL << id2;
-	n += 1UL << id3;
+	unsigned id0 = 1, id1 = 3, id2 = 5, id3 = 7;
+	std::array ids { id0, id1, id2, id3 };
+	size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1);
 	std::vector<int> psi(n);
 	ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id3, id2, id1, id0, m, ctrlmask);
 		generated_kernel(&psi2[0], id3, id2, id1, id0, &m[0][0], ctrlmask);
-		std::array ids { id0, id1, id2, id3 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));
@@ -131,19 +121,14 @@ TEST(nointrin, kernel4)
 
 TEST(nointrin, kernel5)
 {
-	unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4;
-	size_t n = 1;
-	n += 1UL << id0;
-	n += 1UL << id1;
-	n += 1UL << id2;
-	n += 1UL << id3;
-	n += 1UL << id4;
+	unsigned id0 = 1, id1 = 3, id2 = 5, id3 = 7, id4 = 9;
+	std::array ids { id0, id1, id2, id3, id4 };
+	size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1);
 	std::vector<int> psi(n);
 	ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask)
 	{
 		kernel(psi1, id4, id3, id2, id1, id0, m, ctrlmask);
 		generated_kernel(&psi2[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask);
-		std::array ids { id0, id1, id2, id3, id4 };
 		kernelgen(psi3, ids, m, ctrlmask);
 	},
 	psi));

From eb5b3765fbbad398ca5bd66f798cf09f0feacf26 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 7 Sep 2022 16:54:13 +0200
Subject: [PATCH 42/82] Correcting the calculation of PSI length in the
 generator as well. In order to calculate it, we replace sorting of d by the
 following: sort ids first and get the largest id, then create dsorted out of
 already sorted ids

---
 .../cppsim/include/nointrin/kernelgen.py      | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index d84ca53f..286f7ae8 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -109,10 +109,12 @@ def rhs(n, j, i):
 template<class T>
 void kernel(T* psi, {id_var}const T* m, std::size_t ctrlmask)
 {{
-    {constexpr}std::size_t {d};
+    {ids_sorted}
+    {sort}
     {constexpr}std::size_t {n};
+    {constexpr}std::size_t {d};
     {constexpr}std::size_t {dsorted};
-    {sort}
+
     if (ctrlmask == 0){{
         {pragma} omp for collapse({collapse}) schedule(static)
         for (std::size_t i0 = 0; i0 < n; i0 += 2 * {dsorted_0}){{
@@ -139,20 +141,19 @@ def rhs(n, j, i):
         constexpr   = 'constexpr ' if ids != None else '',
         d           = f"d0 = 1UL << {'id0' if ids == None else ids[0]}{''.join(', d{} = 1UL << {}'.format(i, 'id{}'.format(i) if ids == None else ids[i]) for i in range (1, nqubits))}", \
         d_args      = ''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else '', \
-        n           = 'n = 1' + ''.join(' + d{}'.format(i) for i in range (0, nqubits)), \
-        dsorted     = (f"dsorted[] = {{ d{nqubits - 1}" + ''.join(', d{}'.format(nqubits - i - 1) for i in range (1, nqubits)) + f" }}") if ids == None else (f"dsorted0 = 1UL << {ids_sorted[0]}{''.join(', dsorted{} = 1UL << {}'.format(i, ids_sorted[i]) for i in range (1, nqubits))}"), \
-        dsorted_0    = "dsorted[0]" if ids == None else "dsorted0", \
-        dsorted_last = f"dsorted[{nqubits - 1}]" if ids == None else f"dsorted{nqubits - 1}", \
-        sort        = f'std::sort(dsorted, dsorted + {nqubits}, std::greater<std::size_t>());{newline}' if ids == None else '', \
+        n           = 'n = 1UL << (ids_sorted[0] + 1)' if ids == None else f'n = 1UL << {ids_sorted[0] + 1}', \
+        ids_sorted  = (f"{'constexpr ' if ids != None else ''}std::size_t ids_sorted[] = {{ id{nqubits - 1}" + ''.join(', id{}'.format(nqubits - i - 1) for i in range (1, nqubits)) + f" }};") if ids == None else '', \
+        sort        = f'std::sort(ids_sorted, ids_sorted + {nqubits}, std::greater<std::size_t>());' if ids == None else '', \
+        dsorted     = f"dsorted0 = 1UL << {'ids_sorted[0]' if ids == None else ids_sorted[0]}{''.join(', dsorted{} = 1UL << {}'.format(i, 'ids_sorted[{}]'.format(i) if ids == None else ids_sorted[i]) for i in range (1, nqubits))}", \
+        dsorted_0    = "dsorted0", \
+        dsorted_last = f"dsorted{nqubits - 1}", \
         collapse    = f"{nqubits + 1}", \
         offset    = ''.join('    '.format(i) for i in range (0, nqubits)), \
         offset_1    = ''.join('    '.format(i) for i in range (0, nqubits + 1)), \
         offset_2    = ''.join('    '.format(i) for i in range (0, nqubits + 2)), \
         d_template  = ('<d0' + ''.join(', d{}'.format(i) for i in range (1, nqubits)) + '>') if ids != None else '', \
         i           = 'i0' + ''.join(' + i{}'.format(i) for i in range (1, nqubits + 1)), \
-        for_loops   = ''.join('{}for (std::size_t i{} = 0; i{} < dsorted{left}{}{right}; i{} += 2 * dsorted{left}{}{right}){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline,
-            left  = '[' if ids == None else '', \
-            right = ']' if ids == None else '') for i in range (1, nqubits)))
+        for_loops   = ''.join('{}for (std::size_t i{} = 0; i{} < dsorted{}; i{} += 2 * dsorted{}){}'.format(''.join('    ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits)))
 
     return kernel
 

From 2768b8884dfe1b26a542c2923a30d3a9f2314dff Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 7 Sep 2022 19:08:29 +0200
Subject: [PATCH 43/82] Compute the PSI length in the hand-written kernels as
 well, instead of relying on psi.size()

---
 third_party/cppsim/include/intrin/kernel1.hpp | 15 ++++----
 third_party/cppsim/include/intrin/kernel2.hpp | 23 ++++++-----
 third_party/cppsim/include/intrin/kernel3.hpp | 28 +++++++-------
 third_party/cppsim/include/intrin/kernel4.hpp | 33 ++++++++--------
 third_party/cppsim/include/intrin/kernel5.hpp | 38 +++++++++----------
 third_party/cppsim/include/kernelgen.hpp      |  4 +-
 .../cppsim/include/nointrin/kernel1.hpp       | 15 ++++----
 .../cppsim/include/nointrin/kernel2.hpp       | 22 +++++------
 .../cppsim/include/nointrin/kernel3.hpp       | 27 +++++++------
 .../cppsim/include/nointrin/kernel4.hpp       | 32 ++++++++--------
 .../cppsim/include/nointrin/kernel5.hpp       | 37 +++++++++---------
 11 files changed, 130 insertions(+), 144 deletions(-)

diff --git a/third_party/cppsim/include/intrin/kernel1.hpp b/third_party/cppsim/include/intrin/kernel1.hpp
index 793a116f..119b5350 100644
--- a/third_party/cppsim/include/intrin/kernel1.hpp
+++ b/third_party/cppsim/include/intrin/kernel1.hpp
@@ -28,8 +28,11 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m, M con
 template <class V, class M>
 void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
+    std::size_t ids_sorted[] = { id0 };
+    std::sort(ids_sorted, ids_sorted + 1, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
     std::size_t d0 = 1UL << id0;
+    std::size_t dsorted0 = 1UL << ids_sorted[0];
 
     __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])};
     __m256d mmt[2];
@@ -40,20 +43,18 @@ void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
         mmt[i] = _mm256_mul_pd(badc, neg);
     }
 
-    std::size_t dsorted[] = {d0};
-
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
                 kernel_core(psi, i0 + i1, d0, mm, mmt);
             }
         }
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
                 if (((i0 + i1)&ctrlmask) == ctrlmask)
                     kernel_core(psi, i0 + i1, d0, mm, mmt);
             }
diff --git a/third_party/cppsim/include/intrin/kernel2.hpp b/third_party/cppsim/include/intrin/kernel2.hpp
index e1a2c9a9..b4183739 100644
--- a/third_party/cppsim/include/intrin/kernel2.hpp
+++ b/third_party/cppsim/include/intrin/kernel2.hpp
@@ -31,9 +31,11 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M
 template <class V, class M>
 void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
+    std::size_t ids_sorted[] = { id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 2, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1];
 
     __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3])};
     __m256d mmt[8];
@@ -44,14 +46,11 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
         mmt[i] = _mm256_mul_pd(badc, neg);
     }
 
-    std::size_t dsorted[] = {d0 , d1};
-    std::sort(dsorted, dsorted + 2, std::greater<std::size_t>());
-
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
                     kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
                 }
             }
@@ -59,9 +58,9 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
                     if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
                         kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
                 }
diff --git a/third_party/cppsim/include/intrin/kernel3.hpp b/third_party/cppsim/include/intrin/kernel3.hpp
index 2aac0f8a..a4cb3c55 100644
--- a/third_party/cppsim/include/intrin/kernel3.hpp
+++ b/third_party/cppsim/include/intrin/kernel3.hpp
@@ -45,10 +45,11 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, s
 template <class V, class M>
 void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t d2 = 1UL << id2;
+    std::size_t ids_sorted[] = { id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 3, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2];
 
     __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3]), load(&m[4][0], &m[5][0]), load(&m[4][1], &m[5][1]), load(&m[4][2], &m[5][2]), load(&m[4][3], &m[5][3]), load(&m[6][0], &m[7][0]), load(&m[6][1], &m[7][1]), load(&m[6][2], &m[7][2]), load(&m[6][3], &m[7][3]), load(&m[0][4], &m[1][4]), load(&m[0][5], &m[1][5]), load(&m[0][6], &m[1][6]), load(&m[0][7], &m[1][7]), load(&m[2][4], &m[3][4]), load(&m[2][5], &m[3][5]), load(&m[2][6], &m[3][6]), load(&m[2][7], &m[3][7]), load(&m[4][4], &m[5][4]), load(&m[4][5], &m[5][5]), load(&m[4][6], &m[5][6]), load(&m[4][7], &m[5][7]), load(&m[6][4], &m[7][4]), load(&m[6][5], &m[7][5]), load(&m[6][6], &m[7][6]), load(&m[6][7], &m[7][7])};
     __m256d mmt[32];
@@ -59,15 +60,12 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
         mmt[i] = _mm256_mul_pd(badc, neg);
     }
 
-    std::size_t dsorted[] = {d0 , d1, d2};
-    std::sort(dsorted, dsorted + 3, std::greater<std::size_t>());
-
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
                         kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
                     }
                 }
@@ -76,10 +74,10 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
                         if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
                             kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
                     }
diff --git a/third_party/cppsim/include/intrin/kernel4.hpp b/third_party/cppsim/include/intrin/kernel4.hpp
index 5523a556..94ef97a1 100644
--- a/third_party/cppsim/include/intrin/kernel4.hpp
+++ b/third_party/cppsim/include/intrin/kernel4.hpp
@@ -81,11 +81,11 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, s
 template <class V, class M>
 void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t d2 = 1UL << id2;
-    std::size_t d3 = 1UL << id3;
+    std::size_t ids_sorted[] = { id3, id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 4, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2], dsorted3 = 1UL << ids_sorted[3];
 
     __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3]), load(&m[4][0], &m[5][0]), load(&m[4][1], &m[5][1]), load(&m[4][2], &m[5][2]), load(&m[4][3], &m[5][3]), load(&m[6][0], &m[7][0]), load(&m[6][1], &m[7][1]), load(&m[6][2], &m[7][2]), load(&m[6][3], &m[7][3]), load(&m[8][0], &m[9][0]), load(&m[8][1], &m[9][1]), load(&m[8][2], &m[9][2]), load(&m[8][3], &m[9][3]), load(&m[10][0], &m[11][0]), load(&m[10][1], &m[11][1]), load(&m[10][2], &m[11][2]), load(&m[10][3], &m[11][3]), load(&m[12][0], &m[13][0]), load(&m[12][1], &m[13][1]), load(&m[12][2], &m[13][2]), load(&m[12][3], &m[13][3]), load(&m[14][0], &m[15][0]), load(&m[14][1], &m[15][1]), load(&m[14][2], &m[15][2]), load(&m[14][3], &m[15][3]), load(&m[0][4], &m[1][4]), load(&m[0][5], &m[1][5]), load(&m[0][6], &m[1][6]), load(&m[0][7], &m[1][7]), load(&m[2][4], &m[3][4]), load(&m[2][5], &m[3][5]), load(&m[2][6], &m[3][6]), load(&m[2][7], &m[3][7]), load(&m[4][4], &m[5][4]), load(&m[4][5], &m[5][5]), load(&m[4][6], &m[5][6]), load(&m[4][7], &m[5][7]), load(&m[6][4], &m[7][4]), load(&m[6][5], &m[7][5]), load(&m[6][6], &m[7][6]), load(&m[6][7], &m[7][7]), load(&m[8][4], &m[9][4]), load(&m[8][5], &m[9][5]), load(&m[8][6], &m[9][6]), load(&m[8][7], &m[9][7]), load(&m[10][4], &m[11][4]), load(&m[10][5], &m[11][5]), load(&m[10][6], &m[11][6]), load(&m[10][7], &m[11][7]), load(&m[12][4], &m[13][4]), load(&m[12][5], &m[13][5]), load(&m[12][6], &m[13][6]), load(&m[12][7], &m[13][7]), load(&m[14][4], &m[15][4]), load(&m[14][5], &m[15][5]), load(&m[14][6], &m[15][6]), load(&m[14][7], &m[15][7]), load(&m[0][8], &m[1][8]), load(&m[0][9], &m[1][9]), load(&m[0][10], &m[1][10]), load(&m[0][11], &m[1][11]), load(&m[2][8], &m[3][8]), load(&m[2][9], &m[3][9]), load(&m[2][10], &m[3][10]), load(&m[2][11], &m[3][11]), load(&m[4][8], &m[5][8]), load(&m[4][9], 
&m[5][9]), load(&m[4][10], &m[5][10]), load(&m[4][11], &m[5][11]), load(&m[6][8], &m[7][8]), load(&m[6][9], &m[7][9]), load(&m[6][10], &m[7][10]), load(&m[6][11], &m[7][11]), load(&m[8][8], &m[9][8]), load(&m[8][9], &m[9][9]), load(&m[8][10], &m[9][10]), load(&m[8][11], &m[9][11]), load(&m[10][8], &m[11][8]), load(&m[10][9], &m[11][9]), load(&m[10][10], &m[11][10]), load(&m[10][11], &m[11][11]), load(&m[12][8], &m[13][8]), load(&m[12][9], &m[13][9]), load(&m[12][10], &m[13][10]), load(&m[12][11], &m[13][11]), load(&m[14][8], &m[15][8]), load(&m[14][9], &m[15][9]), load(&m[14][10], &m[15][10]), load(&m[14][11], &m[15][11]), load(&m[0][12], &m[1][12]), load(&m[0][13], &m[1][13]), load(&m[0][14], &m[1][14]), load(&m[0][15], &m[1][15]), load(&m[2][12], &m[3][12]), load(&m[2][13], &m[3][13]), load(&m[2][14], &m[3][14]), load(&m[2][15], &m[3][15]), load(&m[4][12], &m[5][12]), load(&m[4][13], &m[5][13]), load(&m[4][14], &m[5][14]), load(&m[4][15], &m[5][15]), load(&m[6][12], &m[7][12]), load(&m[6][13], &m[7][13]), load(&m[6][14], &m[7][14]), load(&m[6][15], &m[7][15]), load(&m[8][12], &m[9][12]), load(&m[8][13], &m[9][13]), load(&m[8][14], &m[9][14]), load(&m[8][15], &m[9][15]), load(&m[10][12], &m[11][12]), load(&m[10][13], &m[11][13]), load(&m[10][14], &m[11][14]), load(&m[10][15], &m[11][15]), load(&m[12][12], &m[13][12]), load(&m[12][13], &m[13][13]), load(&m[12][14], &m[13][14]), load(&m[12][15], &m[13][15]), load(&m[14][12], &m[15][12]), load(&m[14][13], &m[15][13]), load(&m[14][14], &m[15][14]), load(&m[14][15], &m[15][15])};
     __m256d mmt[128];
@@ -96,16 +96,13 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
         mmt[i] = _mm256_mul_pd(badc, neg);
     }
 
-    std::size_t dsorted[] = {d0 , d1, d2, d3};
-    std::sort(dsorted, dsorted + 4, std::greater<std::size_t>());
-
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
                             kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
                         }
                     }
@@ -115,11 +112,11 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
                             if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
                         }
diff --git a/third_party/cppsim/include/intrin/kernel5.hpp b/third_party/cppsim/include/intrin/kernel5.hpp
index 9cf781fa..f1608e1f 100644
--- a/third_party/cppsim/include/intrin/kernel5.hpp
+++ b/third_party/cppsim/include/intrin/kernel5.hpp
@@ -201,12 +201,11 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, s
 template <class V, class M>
 void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t d2 = 1UL << id2;
-    std::size_t d3 = 1UL << id3;
-    std::size_t d4 = 1UL << id4;
+    std::size_t ids_sorted[] = { id4, id3, id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 5, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2], dsorted3 = 1UL << ids_sorted[3], dsorted4 = 1UL << ids_sorted[4];
 
     __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]), load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3]), load(&m[4][0], &m[5][0]), load(&m[4][1], &m[5][1]), load(&m[4][2], &m[5][2]), load(&m[4][3], &m[5][3]), load(&m[6][0], &m[7][0]), load(&m[6][1], &m[7][1]), load(&m[6][2], &m[7][2]), load(&m[6][3], &m[7][3]), load(&m[8][0], &m[9][0]), load(&m[8][1], &m[9][1]), load(&m[8][2], &m[9][2]), load(&m[8][3], &m[9][3]), load(&m[10][0], &m[11][0]), load(&m[10][1], &m[11][1]), load(&m[10][2], &m[11][2]), load(&m[10][3], &m[11][3]), load(&m[12][0], &m[13][0]), load(&m[12][1], &m[13][1]), load(&m[12][2], &m[13][2]), load(&m[12][3], &m[13][3]), load(&m[14][0], &m[15][0]), load(&m[14][1], &m[15][1]), load(&m[14][2], &m[15][2]), load(&m[14][3], &m[15][3]), load(&m[16][0], &m[17][0]), load(&m[16][1], &m[17][1]), load(&m[16][2], &m[17][2]), load(&m[16][3], &m[17][3]), load(&m[18][0], &m[19][0]), load(&m[18][1], &m[19][1]), load(&m[18][2], &m[19][2]), load(&m[18][3], &m[19][3]), load(&m[20][0], &m[21][0]), load(&m[20][1], &m[21][1]), load(&m[20][2], &m[21][2]), load(&m[20][3], &m[21][3]), load(&m[22][0], &m[23][0]), load(&m[22][1], &m[23][1]), load(&m[22][2], &m[23][2]), load(&m[22][3], &m[23][3]), load(&m[24][0], &m[25][0]), load(&m[24][1], &m[25][1]), load(&m[24][2], &m[25][2]), load(&m[24][3], &m[25][3]), load(&m[26][0], &m[27][0]), load(&m[26][1], &m[27][1]), load(&m[26][2], &m[27][2]), load(&m[26][3], &m[27][3]), load(&m[28][0], &m[29][0]), load(&m[28][1], &m[29][1]), load(&m[28][2], &m[29][2]), load(&m[28][3], &m[29][3]), load(&m[30][0], &m[31][0]), load(&m[30][1], &m[31][1]), load(&m[30][2], &m[31][2]), load(&m[30][3], &m[31][3]), load(&m[0][4], &m[1][4]), load(&m[0][5], &m[1][5]), load(&m[0][6], &m[1][6]), load(&m[0][7], &m[1][7]), load(&m[2][4], &m[3][4]), load(&m[2][5], &m[3][5]), load(&m[2][6], &m[3][6]), load(&m[2][7], &m[3][7]), load(&m[4][4], 
&m[5][4]), load(&m[4][5], &m[5][5]), load(&m[4][6], &m[5][6]), load(&m[4][7], &m[5][7]), load(&m[6][4], &m[7][4]), load(&m[6][5], &m[7][5]), load(&m[6][6], &m[7][6]), load(&m[6][7], &m[7][7]), load(&m[8][4], &m[9][4]), load(&m[8][5], &m[9][5]), load(&m[8][6], &m[9][6]), load(&m[8][7], &m[9][7]), load(&m[10][4], &m[11][4]), load(&m[10][5], &m[11][5]), load(&m[10][6], &m[11][6]), load(&m[10][7], &m[11][7]), load(&m[12][4], &m[13][4]), load(&m[12][5], &m[13][5]), load(&m[12][6], &m[13][6]), load(&m[12][7], &m[13][7]), load(&m[14][4], &m[15][4]), load(&m[14][5], &m[15][5]), load(&m[14][6], &m[15][6]), load(&m[14][7], &m[15][7]), load(&m[16][4], &m[17][4]), load(&m[16][5], &m[17][5]), load(&m[16][6], &m[17][6]), load(&m[16][7], &m[17][7]), load(&m[18][4], &m[19][4]), load(&m[18][5], &m[19][5]), load(&m[18][6], &m[19][6]), load(&m[18][7], &m[19][7]), load(&m[20][4], &m[21][4]), load(&m[20][5], &m[21][5]), load(&m[20][6], &m[21][6]), load(&m[20][7], &m[21][7]), load(&m[22][4], &m[23][4]), load(&m[22][5], &m[23][5]), load(&m[22][6], &m[23][6]), load(&m[22][7], &m[23][7]), load(&m[24][4], &m[25][4]), load(&m[24][5], &m[25][5]), load(&m[24][6], &m[25][6]), load(&m[24][7], &m[25][7]), load(&m[26][4], &m[27][4]), load(&m[26][5], &m[27][5]), load(&m[26][6], &m[27][6]), load(&m[26][7], &m[27][7]), load(&m[28][4], &m[29][4]), load(&m[28][5], &m[29][5]), load(&m[28][6], &m[29][6]), load(&m[28][7], &m[29][7]), load(&m[30][4], &m[31][4]), load(&m[30][5], &m[31][5]), load(&m[30][6], &m[31][6]), load(&m[30][7], &m[31][7]), load(&m[0][8], &m[1][8]), load(&m[0][9], &m[1][9]), load(&m[0][10], &m[1][10]), load(&m[0][11], &m[1][11]), load(&m[2][8], &m[3][8]), load(&m[2][9], &m[3][9]), load(&m[2][10], &m[3][10]), load(&m[2][11], &m[3][11]), load(&m[4][8], &m[5][8]), load(&m[4][9], &m[5][9]), load(&m[4][10], &m[5][10]), load(&m[4][11], &m[5][11]), load(&m[6][8], &m[7][8]), load(&m[6][9], &m[7][9]), load(&m[6][10], &m[7][10]), load(&m[6][11], &m[7][11]), load(&m[8][8], &m[9][8]), 
load(&m[8][9], &m[9][9]), load(&m[8][10], &m[9][10]), load(&m[8][11], &m[9][11]), load(&m[10][8], &m[11][8]), load(&m[10][9], &m[11][9]), load(&m[10][10], &m[11][10]), load(&m[10][11], &m[11][11]), load(&m[12][8], &m[13][8]), load(&m[12][9], &m[13][9]), load(&m[12][10], &m[13][10]), load(&m[12][11], &m[13][11]), load(&m[14][8], &m[15][8]), load(&m[14][9], &m[15][9]), load(&m[14][10], &m[15][10]), load(&m[14][11], &m[15][11]), load(&m[16][8], &m[17][8]), load(&m[16][9], &m[17][9]), load(&m[16][10], &m[17][10]), load(&m[16][11], &m[17][11]), load(&m[18][8], &m[19][8]), load(&m[18][9], &m[19][9]), load(&m[18][10], &m[19][10]), load(&m[18][11], &m[19][11]), load(&m[20][8], &m[21][8]), load(&m[20][9], &m[21][9]), load(&m[20][10], &m[21][10]), load(&m[20][11], &m[21][11]), load(&m[22][8], &m[23][8]), load(&m[22][9], &m[23][9]), load(&m[22][10], &m[23][10]), load(&m[22][11], &m[23][11]), load(&m[24][8], &m[25][8]), load(&m[24][9], &m[25][9]), load(&m[24][10], &m[25][10]), load(&m[24][11], &m[25][11]), load(&m[26][8], &m[27][8]), load(&m[26][9], &m[27][9]), load(&m[26][10], &m[27][10]), load(&m[26][11], &m[27][11]), load(&m[28][8], &m[29][8]), load(&m[28][9], &m[29][9]), load(&m[28][10], &m[29][10]), load(&m[28][11], &m[29][11]), load(&m[30][8], &m[31][8]), load(&m[30][9], &m[31][9]), load(&m[30][10], &m[31][10]), load(&m[30][11], &m[31][11]), load(&m[0][12], &m[1][12]), load(&m[0][13], &m[1][13]), load(&m[0][14], &m[1][14]), load(&m[0][15], &m[1][15]), load(&m[2][12], &m[3][12]), load(&m[2][13], &m[3][13]), load(&m[2][14], &m[3][14]), load(&m[2][15], &m[3][15]), load(&m[4][12], &m[5][12]), load(&m[4][13], &m[5][13]), load(&m[4][14], &m[5][14]), load(&m[4][15], &m[5][15]), load(&m[6][12], &m[7][12]), load(&m[6][13], &m[7][13]), load(&m[6][14], &m[7][14]), load(&m[6][15], &m[7][15]), load(&m[8][12], &m[9][12]), load(&m[8][13], &m[9][13]), load(&m[8][14], &m[9][14]), load(&m[8][15], &m[9][15]), load(&m[10][12], &m[11][12]), load(&m[10][13], &m[11][13]), load(&m[10][14], 
&m[11][14]), load(&m[10][15], &m[11][15]), load(&m[12][12], &m[13][12]), load(&m[12][13], &m[13][13]), load(&m[12][14], &m[13][14]), load(&m[12][15], &m[13][15]), load(&m[14][12], &m[15][12]), load(&m[14][13], &m[15][13]), load(&m[14][14], &m[15][14]), load(&m[14][15], &m[15][15]), load(&m[16][12], &m[17][12]), load(&m[16][13], &m[17][13]), load(&m[16][14], &m[17][14]), load(&m[16][15], &m[17][15]), load(&m[18][12], &m[19][12]), load(&m[18][13], &m[19][13]), load(&m[18][14], &m[19][14]), load(&m[18][15], &m[19][15]), load(&m[20][12], &m[21][12]), load(&m[20][13], &m[21][13]), load(&m[20][14], &m[21][14]), load(&m[20][15], &m[21][15]), load(&m[22][12], &m[23][12]), load(&m[22][13], &m[23][13]), load(&m[22][14], &m[23][14]), load(&m[22][15], &m[23][15]), load(&m[24][12], &m[25][12]), load(&m[24][13], &m[25][13]), load(&m[24][14], &m[25][14]), load(&m[24][15], &m[25][15]), load(&m[26][12], &m[27][12]), load(&m[26][13], &m[27][13]), load(&m[26][14], &m[27][14]), load(&m[26][15], &m[27][15]), load(&m[28][12], &m[29][12]), load(&m[28][13], &m[29][13]), load(&m[28][14], &m[29][14]), load(&m[28][15], &m[29][15]), load(&m[30][12], &m[31][12]), load(&m[30][13], &m[31][13]), load(&m[30][14], &m[31][14]), load(&m[30][15], &m[31][15]), load(&m[0][16], &m[1][16]), load(&m[0][17], &m[1][17]), load(&m[0][18], &m[1][18]), load(&m[0][19], &m[1][19]), load(&m[2][16], &m[3][16]), load(&m[2][17], &m[3][17]), load(&m[2][18], &m[3][18]), load(&m[2][19], &m[3][19]), load(&m[4][16], &m[5][16]), load(&m[4][17], &m[5][17]), load(&m[4][18], &m[5][18]), load(&m[4][19], &m[5][19]), load(&m[6][16], &m[7][16]), load(&m[6][17], &m[7][17]), load(&m[6][18], &m[7][18]), load(&m[6][19], &m[7][19]), load(&m[8][16], &m[9][16]), load(&m[8][17], &m[9][17]), load(&m[8][18], &m[9][18]), load(&m[8][19], &m[9][19]), load(&m[10][16], &m[11][16]), load(&m[10][17], &m[11][17]), load(&m[10][18], &m[11][18]), load(&m[10][19], &m[11][19]), load(&m[12][16], &m[13][16]), load(&m[12][17], &m[13][17]), load(&m[12][18], 
&m[13][18]), load(&m[12][19], &m[13][19]), load(&m[14][16], &m[15][16]), load(&m[14][17], &m[15][17]), load(&m[14][18], &m[15][18]), load(&m[14][19], &m[15][19]), load(&m[16][16], &m[17][16]), load(&m[16][17], &m[17][17]), load(&m[16][18], &m[17][18]), load(&m[16][19], &m[17][19]), load(&m[18][16], &m[19][16]), load(&m[18][17], &m[19][17]), load(&m[18][18], &m[19][18]), load(&m[18][19], &m[19][19]), load(&m[20][16], &m[21][16]), load(&m[20][17], &m[21][17]), load(&m[20][18], &m[21][18]), load(&m[20][19], &m[21][19]), load(&m[22][16], &m[23][16]), load(&m[22][17], &m[23][17]), load(&m[22][18], &m[23][18]), load(&m[22][19], &m[23][19]), load(&m[24][16], &m[25][16]), load(&m[24][17], &m[25][17]), load(&m[24][18], &m[25][18]), load(&m[24][19], &m[25][19]), load(&m[26][16], &m[27][16]), load(&m[26][17], &m[27][17]), load(&m[26][18], &m[27][18]), load(&m[26][19], &m[27][19]), load(&m[28][16], &m[29][16]), load(&m[28][17], &m[29][17]), load(&m[28][18], &m[29][18]), load(&m[28][19], &m[29][19]), load(&m[30][16], &m[31][16]), load(&m[30][17], &m[31][17]), load(&m[30][18], &m[31][18]), load(&m[30][19], &m[31][19]), load(&m[0][20], &m[1][20]), load(&m[0][21], &m[1][21]), load(&m[0][22], &m[1][22]), load(&m[0][23], &m[1][23]), load(&m[2][20], &m[3][20]), load(&m[2][21], &m[3][21]), load(&m[2][22], &m[3][22]), load(&m[2][23], &m[3][23]), load(&m[4][20], &m[5][20]), load(&m[4][21], &m[5][21]), load(&m[4][22], &m[5][22]), load(&m[4][23], &m[5][23]), load(&m[6][20], &m[7][20]), load(&m[6][21], &m[7][21]), load(&m[6][22], &m[7][22]), load(&m[6][23], &m[7][23]), load(&m[8][20], &m[9][20]), load(&m[8][21], &m[9][21]), load(&m[8][22], &m[9][22]), load(&m[8][23], &m[9][23]), load(&m[10][20], &m[11][20]), load(&m[10][21], &m[11][21]), load(&m[10][22], &m[11][22]), load(&m[10][23], &m[11][23]), load(&m[12][20], &m[13][20]), load(&m[12][21], &m[13][21]), load(&m[12][22], &m[13][22]), load(&m[12][23], &m[13][23]), load(&m[14][20], &m[15][20]), load(&m[14][21], &m[15][21]), load(&m[14][22], 
&m[15][22]), load(&m[14][23], &m[15][23]), load(&m[16][20], &m[17][20]), load(&m[16][21], &m[17][21]), load(&m[16][22], &m[17][22]), load(&m[16][23], &m[17][23]), load(&m[18][20], &m[19][20]), load(&m[18][21], &m[19][21]), load(&m[18][22], &m[19][22]), load(&m[18][23], &m[19][23]), load(&m[20][20], &m[21][20]), load(&m[20][21], &m[21][21]), load(&m[20][22], &m[21][22]), load(&m[20][23], &m[21][23]), load(&m[22][20], &m[23][20]), load(&m[22][21], &m[23][21]), load(&m[22][22], &m[23][22]), load(&m[22][23], &m[23][23]), load(&m[24][20], &m[25][20]), load(&m[24][21], &m[25][21]), load(&m[24][22], &m[25][22]), load(&m[24][23], &m[25][23]), load(&m[26][20], &m[27][20]), load(&m[26][21], &m[27][21]), load(&m[26][22], &m[27][22]), load(&m[26][23], &m[27][23]), load(&m[28][20], &m[29][20]), load(&m[28][21], &m[29][21]), load(&m[28][22], &m[29][22]), load(&m[28][23], &m[29][23]), load(&m[30][20], &m[31][20]), load(&m[30][21], &m[31][21]), load(&m[30][22], &m[31][22]), load(&m[30][23], &m[31][23]), load(&m[0][24], &m[1][24]), load(&m[0][25], &m[1][25]), load(&m[0][26], &m[1][26]), load(&m[0][27], &m[1][27]), load(&m[2][24], &m[3][24]), load(&m[2][25], &m[3][25]), load(&m[2][26], &m[3][26]), load(&m[2][27], &m[3][27]), load(&m[4][24], &m[5][24]), load(&m[4][25], &m[5][25]), load(&m[4][26], &m[5][26]), load(&m[4][27], &m[5][27]), load(&m[6][24], &m[7][24]), load(&m[6][25], &m[7][25]), load(&m[6][26], &m[7][26]), load(&m[6][27], &m[7][27]), load(&m[8][24], &m[9][24]), load(&m[8][25], &m[9][25]), load(&m[8][26], &m[9][26]), load(&m[8][27], &m[9][27]), load(&m[10][24], &m[11][24]), load(&m[10][25], &m[11][25]), load(&m[10][26], &m[11][26]), load(&m[10][27], &m[11][27]), load(&m[12][24], &m[13][24]), load(&m[12][25], &m[13][25]), load(&m[12][26], &m[13][26]), load(&m[12][27], &m[13][27]), load(&m[14][24], &m[15][24]), load(&m[14][25], &m[15][25]), load(&m[14][26], &m[15][26]), load(&m[14][27], &m[15][27]), load(&m[16][24], &m[17][24]), load(&m[16][25], &m[17][25]), load(&m[16][26], 
&m[17][26]), load(&m[16][27], &m[17][27]), load(&m[18][24], &m[19][24]), load(&m[18][25], &m[19][25]), load(&m[18][26], &m[19][26]), load(&m[18][27], &m[19][27]), load(&m[20][24], &m[21][24]), load(&m[20][25], &m[21][25]), load(&m[20][26], &m[21][26]), load(&m[20][27], &m[21][27]), load(&m[22][24], &m[23][24]), load(&m[22][25], &m[23][25]), load(&m[22][26], &m[23][26]), load(&m[22][27], &m[23][27]), load(&m[24][24], &m[25][24]), load(&m[24][25], &m[25][25]), load(&m[24][26], &m[25][26]), load(&m[24][27], &m[25][27]), load(&m[26][24], &m[27][24]), load(&m[26][25], &m[27][25]), load(&m[26][26], &m[27][26]), load(&m[26][27], &m[27][27]), load(&m[28][24], &m[29][24]), load(&m[28][25], &m[29][25]), load(&m[28][26], &m[29][26]), load(&m[28][27], &m[29][27]), load(&m[30][24], &m[31][24]), load(&m[30][25], &m[31][25]), load(&m[30][26], &m[31][26]), load(&m[30][27], &m[31][27]), load(&m[0][28], &m[1][28]), load(&m[0][29], &m[1][29]), load(&m[0][30], &m[1][30]), load(&m[0][31], &m[1][31]), load(&m[2][28], &m[3][28]), load(&m[2][29], &m[3][29]), load(&m[2][30], &m[3][30]), load(&m[2][31], &m[3][31]), load(&m[4][28], &m[5][28]), load(&m[4][29], &m[5][29]), load(&m[4][30], &m[5][30]), load(&m[4][31], &m[5][31]), load(&m[6][28], &m[7][28]), load(&m[6][29], &m[7][29]), load(&m[6][30], &m[7][30]), load(&m[6][31], &m[7][31]), load(&m[8][28], &m[9][28]), load(&m[8][29], &m[9][29]), load(&m[8][30], &m[9][30]), load(&m[8][31], &m[9][31]), load(&m[10][28], &m[11][28]), load(&m[10][29], &m[11][29]), load(&m[10][30], &m[11][30]), load(&m[10][31], &m[11][31]), load(&m[12][28], &m[13][28]), load(&m[12][29], &m[13][29]), load(&m[12][30], &m[13][30]), load(&m[12][31], &m[13][31]), load(&m[14][28], &m[15][28]), load(&m[14][29], &m[15][29]), load(&m[14][30], &m[15][30]), load(&m[14][31], &m[15][31]), load(&m[16][28], &m[17][28]), load(&m[16][29], &m[17][29]), load(&m[16][30], &m[17][30]), load(&m[16][31], &m[17][31]), load(&m[18][28], &m[19][28]), load(&m[18][29], &m[19][29]), load(&m[18][30], 
&m[19][30]), load(&m[18][31], &m[19][31]), load(&m[20][28], &m[21][28]), load(&m[20][29], &m[21][29]), load(&m[20][30], &m[21][30]), load(&m[20][31], &m[21][31]), load(&m[22][28], &m[23][28]), load(&m[22][29], &m[23][29]), load(&m[22][30], &m[23][30]), load(&m[22][31], &m[23][31]), load(&m[24][28], &m[25][28]), load(&m[24][29], &m[25][29]), load(&m[24][30], &m[25][30]), load(&m[24][31], &m[25][31]), load(&m[26][28], &m[27][28]), load(&m[26][29], &m[27][29]), load(&m[26][30], &m[27][30]), load(&m[26][31], &m[27][31]), load(&m[28][28], &m[29][28]), load(&m[28][29], &m[29][29]), load(&m[28][30], &m[29][30]), load(&m[28][31], &m[29][31]), load(&m[30][28], &m[31][28]), load(&m[30][29], &m[31][29]), load(&m[30][30], &m[31][30]), load(&m[30][31], &m[31][31])};
     __m256d mmt[512];
@@ -217,17 +216,14 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
         mmt[i] = _mm256_mul_pd(badc, neg);
     }
 
-    std::size_t dsorted[] = {d0 , d1, d2, d3, d4};
-    std::sort(dsorted, dsorted + 5, std::greater<std::size_t>());
-
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
-                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
                             }
                         }
@@ -238,12 +234,12 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
-                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
                                 if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
                                     kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
                             }
diff --git a/third_party/cppsim/include/kernelgen.hpp b/third_party/cppsim/include/kernelgen.hpp
index e5fd130d..12511736 100644
--- a/third_party/cppsim/include/kernelgen.hpp
+++ b/third_party/cppsim/include/kernelgen.hpp
@@ -39,10 +39,10 @@ void kernelgen(V &psi, Id& ids, M const& m, std::size_t ctrlmask)
 	}
 	
 	// Call the generated kernel.
-	typedef void (*kernel_t)(int* /*psi*/, unsigned int* /*ids*/, const int* /*m*/, size_t /*ctrlmask*/);
+	typedef void (*kernel_t)(void* /*psi*/, unsigned int* /*ids*/, const int* /*m*/, size_t /*ctrlmask*/);
 	auto kernel = (kernel_t)handle;
 	#pragma omp parallel
-	kernel(reinterpret_cast<int*>(&psi[0]), &ids[0], reinterpret_cast<const int*>(&m[0][0]), ctrlmask);
+	kernel(reinterpret_cast<void*>(&psi[0]), &ids[0], reinterpret_cast<const int*>(&m[0][0]), ctrlmask);
 }
 
 #endif // KERNELGEN_HPP
diff --git a/third_party/cppsim/include/nointrin/kernel1.hpp b/third_party/cppsim/include/nointrin/kernel1.hpp
index cbc8b928..a0a5952c 100644
--- a/third_party/cppsim/include/nointrin/kernel1.hpp
+++ b/third_party/cppsim/include/nointrin/kernel1.hpp
@@ -32,23 +32,24 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m)
 template <class V, class M>
 void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
+    std::size_t ids_sorted[] = { id0 };
+    std::sort(ids_sorted, ids_sorted + 1, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
     std::size_t d0 = 1UL << id0;
-    std::size_t dsorted[] = {d0 };
-    std::sort(dsorted, dsorted + 1, std::greater<std::size_t>());
+    std::size_t dsorted0 = 1UL << ids_sorted[0];
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
                 kernel_core(psi, i0 + i1, d0, m);
             }
         }
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
                 if (((i0 + i1)&ctrlmask) == ctrlmask)
                     kernel_core(psi, i0 + i1, d0, m);
             }
diff --git a/third_party/cppsim/include/nointrin/kernel2.hpp b/third_party/cppsim/include/nointrin/kernel2.hpp
index 84d85830..eb8a5521 100644
--- a/third_party/cppsim/include/nointrin/kernel2.hpp
+++ b/third_party/cppsim/include/nointrin/kernel2.hpp
@@ -36,17 +36,17 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M
 template <class V, class M>
 void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t dsorted[] = {d0 , d1};
-    std::sort(dsorted, dsorted + 2, std::greater<std::size_t>());
+    std::size_t ids_sorted[] = { id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 2, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1];
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
                     kernel_core(psi, i0 + i1 + i2, d0, d1, m);
                 }
             }
@@ -54,9 +54,9 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; ++i2){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
                     if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
                         kernel_core(psi, i0 + i1 + i2, d0, d1, m);
                 }
diff --git a/third_party/cppsim/include/nointrin/kernel3.hpp b/third_party/cppsim/include/nointrin/kernel3.hpp
index 9369803f..aad532ea 100644
--- a/third_party/cppsim/include/nointrin/kernel3.hpp
+++ b/third_party/cppsim/include/nointrin/kernel3.hpp
@@ -57,19 +57,18 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, s
 template <class V, class M>
 void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t d2 = 1UL << id2;
-    std::size_t dsorted[] = {d0 , d1, d2};
-    std::sort(dsorted, dsorted + 3, std::greater<std::size_t>());
+    std::size_t ids_sorted[] = { id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 3, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2];
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
                         kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m);
                     }
                 }
@@ -78,10 +77,10 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; ++i3){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
                         if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
                             kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m);
                     }
diff --git a/third_party/cppsim/include/nointrin/kernel4.hpp b/third_party/cppsim/include/nointrin/kernel4.hpp
index aafac39e..176cf69d 100644
--- a/third_party/cppsim/include/nointrin/kernel4.hpp
+++ b/third_party/cppsim/include/nointrin/kernel4.hpp
@@ -117,21 +117,19 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, s
 template <class V, class M>
 void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t d2 = 1UL << id2;
-    std::size_t d3 = 1UL << id3;
-    std::size_t dsorted[] = {d0 , d1, d2, d3};
-    std::sort(dsorted, dsorted + 4, std::greater<std::size_t>());
+    std::size_t ids_sorted[] = { id3, id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 4, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2], dsorted3 = 1UL << ids_sorted[3];
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
                             kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m);
                         }
                     }
@@ -141,11 +139,11 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; ++i4){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
                             if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m);
                         }
diff --git a/third_party/cppsim/include/nointrin/kernel5.hpp b/third_party/cppsim/include/nointrin/kernel5.hpp
index 363f4e9a..dff08c2f 100644
--- a/third_party/cppsim/include/nointrin/kernel5.hpp
+++ b/third_party/cppsim/include/nointrin/kernel5.hpp
@@ -333,23 +333,20 @@ inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, s
 template <class V, class M>
 void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
 {
-    std::size_t n = psi.size();
-    std::size_t d0 = 1UL << id0;
-    std::size_t d1 = 1UL << id1;
-    std::size_t d2 = 1UL << id2;
-    std::size_t d3 = 1UL << id3;
-    std::size_t d4 = 1UL << id4;
-    std::size_t dsorted[] = {d0 , d1, d2, d3, d4};
-    std::sort(dsorted, dsorted + 5, std::greater<std::size_t>());
+    std::size_t ids_sorted[] = { id4, id3, id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 5, std::greater<std::size_t>());
+    std::size_t n = 1UL << (ids_sorted[0] + 1);
+    std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4;
+    std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2], dsorted3 = 1UL << ids_sorted[3], dsorted4 = 1UL << ids_sorted[4];
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
-                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
                             }
                         }
@@ -360,12 +357,12 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){
-            for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]){
-                for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]){
-                    for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]){
-                        for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]){
-                            for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){
+        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
                                 if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
                                     kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
                             }

From 0447f5a89db942ae9939efac7033fbb1aab6ce74 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 13 Sep 2022 14:16:01 +0200
Subject: [PATCH 44/82] Right hand side is returned as string, so '0' must be
 returned instead of 0

Signed-off-by: Dmitry Mikushin <dmitry@kernelgen.org>
---
 third_party/cppsim/include/nointrin/kernelgen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/third_party/cppsim/include/nointrin/kernelgen.py
index 286f7ae8..0eeb7214 100644
--- a/third_party/cppsim/include/nointrin/kernelgen.py
+++ b/third_party/cppsim/include/nointrin/kernelgen.py
@@ -52,7 +52,7 @@ def rhs(n, j, i):
                 if m(j, i) != 0:
                     return f'mul(v{left}{i}{right}, M({j}, {i})'
                 else:
-                    return 0
+                    return '0'
 
     strrhs = [] 
     for j in range(0, len(strcombs)):

From 611dd4f7a86764c6607fda8f432456e2f3522b46 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Wed, 14 Sep 2022 14:25:18 +0200
Subject: [PATCH 45/82] Adding a required linking against libdl library, which
 is used by runtime compilation to dynamically bind a function from the
 compiled code

Signed-off-by: Dmitry Mikushin <dmitry@kernelgen.org>
---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index ec01ab05..8c65f0e3 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -20,7 +20,7 @@ set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/digestpp)
 res_embed(TARGET kernelgen NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
-target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python OpenMP::OpenMP_CXX)
+target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
 
 pybind11_add_module(${PROJECT_NAME} SHARED "src/${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)

From 9b04b022327f4158fd2960e09a18270845d06fe5 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <damien1@huawei.com>
Date: Thu, 20 Oct 2022 14:28:25 +0200
Subject: [PATCH 46/82] Replace git submodules with CMake FetchContent

---
 third_party/cppsim/.gitmodules           |  18 ---
 third_party/cppsim/CMakeLists.txt        | 145 ++++++++++++++++-------
 third_party/cppsim/ThirdParty/digestpp   |   1 -
 third_party/cppsim/ThirdParty/eigen      |   1 -
 third_party/cppsim/ThirdParty/googletest |   1 -
 third_party/cppsim/ThirdParty/hipThrust  |   1 -
 third_party/cppsim/ThirdParty/pybind11   |   1 -
 third_party/cppsim/ThirdParty/res_embed  |   1 -
 8 files changed, 104 insertions(+), 65 deletions(-)
 delete mode 160000 third_party/cppsim/ThirdParty/digestpp
 delete mode 160000 third_party/cppsim/ThirdParty/eigen
 delete mode 160000 third_party/cppsim/ThirdParty/googletest
 delete mode 160000 third_party/cppsim/ThirdParty/hipThrust
 delete mode 160000 third_party/cppsim/ThirdParty/pybind11
 delete mode 160000 third_party/cppsim/ThirdParty/res_embed

diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/.gitmodules
index dd9d601a..e69de29b 100644
--- a/third_party/cppsim/.gitmodules
+++ b/third_party/cppsim/.gitmodules
@@ -1,18 +0,0 @@
-[submodule "googletest"]
-	path = ThirdParty/googletest
-	url = https://github.com/google/googletest.git
-[submodule "ThirdParty/res_embed"]
-	path = ThirdParty/res_embed
-	url = https://github.com/dmikushin/res_embed.git
-[submodule "ThirdParty/pybind11"]
-	path = ThirdParty/pybind11
-	url = https://github.com/pybind/pybind11.git
-[submodule "build/ThirdParty/digestpp"]
-	path = ThirdParty/digestpp
-	url = https://github.com/kerukuro/digestpp.git
-[submodule "ThirdParty/eigen"]
-	path = ThirdParty/eigen
-	url = https://gitlab.com/libeigen/eigen.git
-[submodule "ThirdParty/hipThrust"]
-	path = ThirdParty/hipThrust
-	url = https://github.com/dmikushin/Thrust.git
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 8c65f0e3..5e0b2ab3 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -1,28 +1,91 @@
-cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
-
-project(_cppsim)
-
-add_subdirectory(ThirdParty/googletest EXCLUDE_FROM_ALL)
-
-add_subdirectory(ThirdParty/res_embed EXCLUDE_FROM_ALL)
-
-add_subdirectory(ThirdParty/pybind11 EXCLUDE_FROM_ALL)
-
-find_package(Python3 COMPONENTS Interpreter Development)
+cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
+
+project(cppsim)
+
+list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules)
+list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/commands)
+
+option(BUILD_TESTING "Build the test suite?" OFF)
+
+include(FetchContent)
+set(FETCHCONTENT_QUIET OFF)
+
+if(BUILD_TESTING)
+  FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG        25cc5777a17820a6339204a3552aa1dd5e428669
+  )
+  FetchContent_Declare(
+    hipThrust
+    GIT_REPOSITORY https://github.com/dmikushin/Thrust.git
+    GIT_TAG        9a12c1259805ed0a3a5fe9bdeb098a872deb936b
+  )
+  FetchContent_MakeAvailable(hipThrust googletest)
+
+  if(NOT DEFINED Eigen3_DIR)
+    FetchContent_Declare(
+      Eigen3
+      GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
+      GIT_TAG        3147391d946bb4b6c68edd901f2add6ac1f31f8c
+    )
+    FetchContent_MakeAvailable(googletest Eigen3)
+  else()
+    find_package(Eigen3 CONFIG REQUIRED)
+  endif()
+endif()
+
+if(NOT DEFINED digestpp_DIR)
+  message(STATUS "Checking out ")
+  FetchContent_Declare(
+    digestpp
+    GIT_REPOSITORY https://github.com/kerukuro/digestpp.git
+    GIT_TAG        4ec4106677e652a90716ad929d657a622089ef16
+  )
+  FetchContent_MakeAvailable(digestpp)
+  add_library(digestpp::digestpp INTERFACE IMPORTED)
+  target_include_directories(digestpp::digestpp INTERFACE ${digestpp_SOURCE_DIR})
+else()
+  find_package(digestpp CONFIG REQUIRED)
+endif()
+
+if(NOT DEFINED res_embed_CMAKE_DIR)
+  FetchContent_Declare(
+    res_embed
+    GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
+    GIT_TAG        26a18b27794c1fcf698e603beb8b122218dae490
+  )
+  FetchContent_MakeAvailable(res_embed)
+  include(ResEmbed)
+else()
+  include(${res_embed_CMAKE_DIR}/ResEmbed.cmake)
+endif()
+
+if(NOT pybind11_DIR)
+  FetchContent_Declare(
+    pybind11
+    GIT_REPOSITORY https://github.com/pybind/pybind11.git
+    GIT_TAG        68e6fdaa90fc93979e6d5d1e9f788f464593e8f2
+  )
+  FetchContent_MakeAvailable(pybind11)
+else()
+  find_package(pybind11 REQUIRED)
+endif()
+
+find_package(Python 3.7.0 COMPONENTS Interpreter Development.Embed)
 
 find_package(OpenMP REQUIRED)
 
-include(ResEmbed)
+# ==============================================================================
 
 add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp" "src/tempfile.cpp")
 set_target_properties(kernelgen PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/digestpp)
 res_embed(TARGET kernelgen NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
-target_link_libraries(kernelgen PUBLIC pybind11::pybind11 Python3::Python OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
+target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
 
-pybind11_add_module(${PROJECT_NAME} SHARED "src/${PROJECT_NAME}.cpp")
+pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
 
 macro(kernelgen)
@@ -37,7 +100,7 @@ macro(kernelgen)
 	# Call generator.
 	add_custom_command(
 		OUTPUT ${KERNEL_PATH}
-		COMMAND ${Python3_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH} --combinations=True
+		COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH} --combinations=True
 		COMMENT "Generating kernel for ${NQUBITS} qubits"
 		DEPENDS ${KERNELGEN})
 	set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
@@ -47,29 +110,29 @@ macro(kernelgen)
 	target_include_directories(${KERNELGEN_TARGET} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 endmacro()
 
-add_executable(test_nointrin "src/test/test_nointrin.cpp")
-set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
-target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
-target_link_libraries(test_nointrin PRIVATE gtest kernelgen)
-kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
-kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
-kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
-kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
-kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
-
-add_executable(test_popcount "src/test/test_popcount.cpp")
-set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
-target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
-target_link_libraries(test_popcount PRIVATE gtest)
-
-add_executable(test_combinations "src/test/test_combinations.cpp")
-set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
-target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/eigen)
-target_link_libraries(test_combinations PRIVATE gtest OpenMP::OpenMP_CXX)
-
-add_executable(benchmark "src/benchmark/benchmark.cpp")
-target_link_libraries(benchmark PRIVATE gtest kernelgen)
+if(BUILD_TESTING)
+  add_executable(test_nointrin "src/test/test_nointrin.cpp")
+  set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
+  target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_include_directories(test_nointrin PRIVATE Eigen3::Eigen)
+  target_link_libraries(test_nointrin PRIVATE gtest kernelgen)
+  kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
+  kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
+  kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
+  kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
+  kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
+
+  add_executable(test_popcount "src/test/test_popcount.cpp")
+  set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
+  target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_link_libraries(test_popcount PRIVATE Eigen3::Eigen gtest)
+
+  add_executable(test_combinations "src/test/test_combinations.cpp")
+  set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
+  target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen OpenMP::OpenMP_CXX)
+
+  add_executable(benchmark "src/benchmark/benchmark.cpp")
+  target_link_libraries(benchmark PRIVATE gtest kernelgen)   
+endif()
 
diff --git a/third_party/cppsim/ThirdParty/digestpp b/third_party/cppsim/ThirdParty/digestpp
deleted file mode 160000
index 4ec41066..00000000
--- a/third_party/cppsim/ThirdParty/digestpp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4ec4106677e652a90716ad929d657a622089ef16
diff --git a/third_party/cppsim/ThirdParty/eigen b/third_party/cppsim/ThirdParty/eigen
deleted file mode 160000
index a7c1cac1..00000000
--- a/third_party/cppsim/ThirdParty/eigen
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a7c1cac18bfef26ec61a73c1619ccf0f9b734745
diff --git a/third_party/cppsim/ThirdParty/googletest b/third_party/cppsim/ThirdParty/googletest
deleted file mode 160000
index 25cc5777..00000000
--- a/third_party/cppsim/ThirdParty/googletest
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 25cc5777a17820a6339204a3552aa1dd5e428669
diff --git a/third_party/cppsim/ThirdParty/hipThrust b/third_party/cppsim/ThirdParty/hipThrust
deleted file mode 160000
index 9a12c125..00000000
--- a/third_party/cppsim/ThirdParty/hipThrust
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 9a12c1259805ed0a3a5fe9bdeb098a872deb936b
diff --git a/third_party/cppsim/ThirdParty/pybind11 b/third_party/cppsim/ThirdParty/pybind11
deleted file mode 160000
index 68e6fdaa..00000000
--- a/third_party/cppsim/ThirdParty/pybind11
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 68e6fdaa90fc93979e6d5d1e9f788f464593e8f2
diff --git a/third_party/cppsim/ThirdParty/res_embed b/third_party/cppsim/ThirdParty/res_embed
deleted file mode 160000
index 26a18b27..00000000
--- a/third_party/cppsim/ThirdParty/res_embed
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 26a18b27794c1fcf698e603beb8b122218dae490

From ce522a0b5e35610665fdac637206a0c82fe38d07 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 13:21:59 +0200
Subject: [PATCH 47/82] Add proper CMake installation files

---
 third_party/cppsim/CMakeLists.txt             | 221 ++++++++++++------
 .../cppsim/cmake/cppsimConfig.cmake.in        |  22 ++
 2 files changed, 175 insertions(+), 68 deletions(-)
 create mode 100644 third_party/cppsim/cmake/cppsimConfig.cmake.in

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 5e0b2ab3..27b0d9b5 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -1,12 +1,28 @@
 cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
 
-project(cppsim)
-
-list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules)
-list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/commands)
+project(
+  cppsim
+  VERSION 1.0.0
+  LANGUAGES C CXX)
 
 option(BUILD_TESTING "Build the test suite?" OFF)
 
+# ==============================================================================
+
+include(GNUInstallDirs)
+
+set(CPPSIM_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}")
+set(CPPSIM_INSTALL_SBINDIR "${CMAKE_INSTALL_SBINDIR}")
+set(CPPSIM_INSTALL_SYSCONFDIR "${CMAKE_INSTALL_SYSCONFDIR}")
+set(CPPSIM_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/cppsim")
+set(CPPSIM_INSTALL_DATADIR "${CMAKE_INSTALL_DATADIR}/cppsim")
+set(CPPSIM_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}/cppsim")
+set(CPPSIM_INSTALL_DOCDIR "${CMAKE_INSTALL_DATADIR}/doc/cppsim")
+set(CPPSIM_INSTALL_CMAKEDIR "${CPPSIM_INSTALL_DATADIR}/cmake")
+set(CPPSIM_INSTALL_3RDPARTYDIR "${CPPSIM_INSTALL_LIBDIR}/third_party")
+
+# ==============================================================================
+
 include(FetchContent)
 set(FETCHCONTENT_QUIET OFF)
 
@@ -14,106 +30,171 @@ if(BUILD_TESTING)
   FetchContent_Declare(
     googletest
     GIT_REPOSITORY https://github.com/google/googletest.git
-    GIT_TAG        25cc5777a17820a6339204a3552aa1dd5e428669
-  )
+    GIT_TAG 25cc5777a17820a6339204a3552aa1dd5e428669)
   FetchContent_Declare(
     hipThrust
     GIT_REPOSITORY https://github.com/dmikushin/Thrust.git
-    GIT_TAG        9a12c1259805ed0a3a5fe9bdeb098a872deb936b
-  )
+    GIT_TAG 9a12c1259805ed0a3a5fe9bdeb098a872deb936b)
   FetchContent_MakeAvailable(hipThrust googletest)
 
   if(NOT DEFINED Eigen3_DIR)
     FetchContent_Declare(
       Eigen3
       GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
-      GIT_TAG        3147391d946bb4b6c68edd901f2add6ac1f31f8c
-    )
+      GIT_TAG 3147391d946bb4b6c68edd901f2add6ac1f31f8c)
     FetchContent_MakeAvailable(googletest Eigen3)
   else()
     find_package(Eigen3 CONFIG REQUIRED)
   endif()
 endif()
 
-if(NOT DEFINED digestpp_DIR)
-  message(STATUS "Checking out ")
-  FetchContent_Declare(
-    digestpp
-    GIT_REPOSITORY https://github.com/kerukuro/digestpp.git
-    GIT_TAG        4ec4106677e652a90716ad929d657a622089ef16
-  )
-  FetchContent_MakeAvailable(digestpp)
-  add_library(digestpp::digestpp INTERFACE IMPORTED)
-  target_include_directories(digestpp::digestpp INTERFACE ${digestpp_SOURCE_DIR})
-else()
-  find_package(digestpp CONFIG REQUIRED)
-endif()
-
-if(NOT DEFINED res_embed_CMAKE_DIR)
+FetchContent_Declare(
+  digestpp
+  GIT_REPOSITORY https://github.com/kerukuro/digestpp.git
+  GIT_TAG 4ec4106677e652a90716ad929d657a622089ef16)
+FetchContent_MakeAvailable(digestpp)
+
+install(
+  DIRECTORY ${digestpp_SOURCE_DIR}
+  DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}/third_party
+  PATTERN docs EXCLUDE
+  PATTERN .git EXCLUDE)
+
+add_library(digestpp::digestpp INTERFACE IMPORTED)
+target_include_directories(
+  digestpp::digestpp
+  INTERFACE
+    $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}/third_party/digestpp-src>)
+
+find_package(res_embed QUIET CONFIG)
+if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG        26a18b27794c1fcf698e603beb8b122218dae490
-  )
+    GIT_TAG 26a18b27794c1fcf698e603beb8b122218dae490)
   FetchContent_MakeAvailable(res_embed)
-  include(ResEmbed)
-else()
-  include(${res_embed_CMAKE_DIR}/ResEmbed.cmake)
 endif()
+include(ResEmbed)
 
-if(NOT pybind11_DIR)
+find_package(pybind11 CONFIG QUIET)
+if(NOT pybind11_FOUND)
   FetchContent_Declare(
     pybind11
     GIT_REPOSITORY https://github.com/pybind/pybind11.git
-    GIT_TAG        68e6fdaa90fc93979e6d5d1e9f788f464593e8f2
-  )
+    GIT_TAG 68e6fdaa90fc93979e6d5d1e9f788f464593e8f2)
   FetchContent_MakeAvailable(pybind11)
-else()
-  find_package(pybind11 REQUIRED)
 endif()
 
-find_package(Python 3.7.0 COMPONENTS Interpreter Development.Embed)
+set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
+find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter
+                                                            Development.Embed)
 
 find_package(OpenMP REQUIRED)
 
 # ==============================================================================
 
-add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp" "src/tempfile.cpp")
+include(CMakePackageConfigHelpers)
+
+set(_namespace cppsim::)
+
+configure_package_config_file(
+  ${CMAKE_CURRENT_LIST_DIR}/cmake/cppsimConfig.cmake.in
+  ${PROJECT_BINARY_DIR}/cppsimConfig.cmake
+  INSTALL_DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+
+write_basic_package_version_file(${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
+                                 COMPATIBILITY SameMajorVersion)
+
+install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake
+              ${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
+        DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+
+file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h
+     ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE)
+install(FILES ${_headers} DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
+install(
+  DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/include/cpu
+            ${CMAKE_CURRENT_LIST_DIR}/include/gpu
+            ${CMAKE_CURRENT_LIST_DIR}/include/intrin
+            ${CMAKE_CURRENT_LIST_DIR}/include/nointrin
+  DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
+
+# ==============================================================================
+
+add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp"
+                             "src/tempfile.cpp")
 set_target_properties(kernelgen PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
 set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_include_directories(kernelgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
-res_embed(TARGET kernelgen NAME "nointrin" PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py" KEYWORD)
-target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
+target_include_directories(
+  kernelgen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+                   $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}>)
+res_embed(
+  TARGET
+  kernelgen
+  NAME
+  "nointrin"
+  PATH
+  "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py"
+  KEYWORD)
+
+target_link_libraries(
+  kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python
+                   OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
 
 pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
 
 macro(kernelgen)
-	set(oneValueArgs NQUBITS VARIANT TARGET)
-	cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-	set(NQUBITS ${KERNELGEN_NQUBITS})
-	set(VARIANT ${KERNELGEN_VARIANT})
-	set(KERNELGEN "${CMAKE_CURRENT_SOURCE_DIR}/include/${VARIANT}/kernelgen.py")
-	set(KERNEL_PATH "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
-
-	# Call generator.
-	add_custom_command(
-		OUTPUT ${KERNEL_PATH}
-		COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH} --combinations=True
-		COMMENT "Generating kernel for ${NQUBITS} qubits"
-		DEPENDS ${KERNELGEN})
-	set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
-
-	# Append the generated file to the target sources.
-	target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
-	target_include_directories(${KERNELGEN_TARGET} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+  set(oneValueArgs NQUBITS VARIANT TARGET)
+  cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  set(NQUBITS ${KERNELGEN_NQUBITS})
+  set(VARIANT ${KERNELGEN_VARIANT})
+  set(KERNELGEN "${CMAKE_CURRENT_SOURCE_DIR}/include/${VARIANT}/kernelgen.py")
+  set(KERNEL_PATH
+      "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
+
+  # Call generator.
+  add_custom_command(
+    OUTPUT ${KERNEL_PATH}
+    COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
+            --combinations=True
+    COMMENT "Generating kernel for ${NQUBITS} qubits"
+    DEPENDS ${KERNELGEN})
+  set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
+
+  # Append the generated file to the target sources.
+  target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
+  target_include_directories(${KERNELGEN_TARGET}
+                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 endmacro()
 
+# ==============================================================================
+
+install(
+  TARGETS kernelgen kernelgen_nointrin
+  EXPORT cppsimTargets
+  PRIVATE_HEADER DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}
+  PUBLIC_HEADER DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}
+  ARCHIVE DESTINATION ${CPPSIM_INSTALL_LIBDIR}
+  LIBRARY DESTINATION ${CPPSIM_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CPPSIM_INSTALL_BINDIR})
+
+install(
+  EXPORT cppsimTargets
+  NAMESPACE ${_namespace}
+  DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+
+# ==============================================================================
+
 if(BUILD_TESTING)
   add_executable(test_nointrin "src/test/test_nointrin.cpp")
-  set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
-  target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS
+                                                                 OFF)
+  target_include_directories(test_nointrin
+                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_include_directories(test_nointrin PRIVATE Eigen3::Eigen)
   target_link_libraries(test_nointrin PRIVATE gtest kernelgen)
   kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
@@ -123,16 +204,20 @@ if(BUILD_TESTING)
   kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
 
   add_executable(test_popcount "src/test/test_popcount.cpp")
-  set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
-  target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS
+                                                                 OFF)
+  target_include_directories(test_popcount
+                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_popcount PRIVATE Eigen3::Eigen gtest)
 
   add_executable(test_combinations "src/test/test_combinations.cpp")
-  set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
-  target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-  target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen OpenMP::OpenMP_CXX)
+  set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17
+                                                     CXX_EXTENSIONS OFF)
+  target_include_directories(test_combinations
+                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen
+                                                  OpenMP::OpenMP_CXX)
 
   add_executable(benchmark "src/benchmark/benchmark.cpp")
-  target_link_libraries(benchmark PRIVATE gtest kernelgen)   
+  target_link_libraries(benchmark PRIVATE gtest kernelgen)
 endif()
-
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
new file mode 100644
index 00000000..0108dfba
--- /dev/null
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -0,0 +1,22 @@
+@PACKAGE_INIT@
+
+if(TARGET cppsim::cppsim)
+  # Protect against double definitions due to previous call or
+  # add_subdirectory()
+  return()
+endif()
+
+# ==============================================================================
+
+find_package(pybind11 REQUIRED)
+find_package(
+  Python @CPPSIM_PYTHON_VERSION_MIN@
+  COMPONENTS Interpreter Development.Embed
+  REQUIRED)
+find_package(OpenMP REQUIRED)
+
+# ==============================================================================
+
+include(${CMAKE_CURRENT_LIST_DIR}/cppsimTargets.cmake)
+
+# ==============================================================================

From 63383463638118799db41204605c70da9df1ab19 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 14:13:49 +0200
Subject: [PATCH 48/82] Update git tag for res_embed

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 27b0d9b5..239e2a66 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -72,7 +72,7 @@ if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG 26a18b27794c1fcf698e603beb8b122218dae490)
+    GIT_TAG b803e4df20b09bdd65477a9662530a6feeb228e6)
   FetchContent_MakeAvailable(res_embed)
 endif()
 include(ResEmbed)

From 9c8c23680bd99e374a9c20f47f775c2ba1b4b3e1 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 15:18:54 +0200
Subject: [PATCH 49/82] Fix installation target for digestpp::digestpp

---
 third_party/cppsim/CMakeLists.txt              | 9 ++++-----
 third_party/cppsim/cmake/cppsimConfig.cmake.in | 8 ++++++++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 239e2a66..3188f4d9 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -61,11 +61,10 @@ install(
   PATTERN .git EXCLUDE)
 
 add_library(digestpp::digestpp INTERFACE IMPORTED)
-target_include_directories(
-  digestpp::digestpp
-  INTERFACE
-    $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}/third_party/digestpp-src>)
+# NB: No INSTALL_INTERFACE here since that is taken care of within
+# cppsimConfig.cmake
+target_include_directories(digestpp::digestpp
+                           INTERFACE $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>)
 
 find_package(res_embed QUIET CONFIG)
 if(NOT res_embed_FOUND)
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index 0108dfba..5569b0a6 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -15,6 +15,14 @@ find_package(
   REQUIRED)
 find_package(OpenMP REQUIRED)
 
+# ------------------------------------------------------------------------------
+
+add_library(digestpp::digestpp INTERFACE)
+target_include_directories(
+  digestpp::digestpp
+  INTERFACE
+    ${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src)
+
 # ==============================================================================
 
 include(${CMAKE_CURRENT_LIST_DIR}/cppsimTargets.cmake)

From 4aefaeaa839e568a13ba15385a0294f89bab5bbd Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 15:27:41 +0200
Subject: [PATCH 50/82] Cleanup code and better handling of digestpp within
 installed CMake configuration files

---
 third_party/cppsim/CMakeLists.txt             | 30 +++++++++++++++----
 .../cppsim/cmake/cppsimConfig.cmake.in        | 13 ++++----
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 3188f4d9..2e4b3cdb 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -5,6 +5,10 @@ project(
   VERSION 1.0.0
   LANGUAGES C CXX)
 
+# Minimum required Python version (used both in this file and in the installed
+# CMake configuration)
+set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
+
 option(BUILD_TESTING "Build the test suite?" OFF)
 
 # ==============================================================================
@@ -48,23 +52,32 @@ if(BUILD_TESTING)
   endif()
 endif()
 
+# ------------------------------------------------------------------------------
+# In the case of digestpp, the repository only consists of a list of
+# files/directories that we need both our build process to find as well as
+# future users of an installed copy of cppsim.
+#
+# We therefore provide a target that we use for our current build and also
+# install its files into a sub-directory of the installation prefix. We then
+# define the digestpp::digestpp target directly within cppsimConfig.cmake
+
 FetchContent_Declare(
   digestpp
   GIT_REPOSITORY https://github.com/kerukuro/digestpp.git
   GIT_TAG 4ec4106677e652a90716ad929d657a622089ef16)
 FetchContent_MakeAvailable(digestpp)
 
+add_library(digestpp::digestpp INTERFACE IMPORTED)
+target_include_directories(digestpp::digestpp
+                           INTERFACE $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>)
+
 install(
   DIRECTORY ${digestpp_SOURCE_DIR}
   DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}/third_party
   PATTERN docs EXCLUDE
   PATTERN .git EXCLUDE)
 
-add_library(digestpp::digestpp INTERFACE IMPORTED)
-# NB: No INSTALL_INTERFACE here since that is taken care of within
-# cppsimConfig.cmake
-target_include_directories(digestpp::digestpp
-                           INTERFACE $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>)
+# ------------------------------------------------------------------------------
 
 find_package(res_embed QUIET CONFIG)
 if(NOT res_embed_FOUND)
@@ -76,6 +89,8 @@ if(NOT res_embed_FOUND)
 endif()
 include(ResEmbed)
 
+# ------------------------------------------------------------------------------
+
 find_package(pybind11 CONFIG QUIET)
 if(NOT pybind11_FOUND)
   FetchContent_Declare(
@@ -85,10 +100,13 @@ if(NOT pybind11_FOUND)
   FetchContent_MakeAvailable(pybind11)
 endif()
 
-set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
+# ------------------------------------------------------------------------------
+
 find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter
                                                             Development.Embed)
 
+# ------------------------------------------------------------------------------
+
 find_package(OpenMP REQUIRED)
 
 # ==============================================================================
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index 5569b0a6..eea343d2 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -17,11 +17,14 @@ find_package(OpenMP REQUIRED)
 
 # ------------------------------------------------------------------------------
 
-add_library(digestpp::digestpp INTERFACE)
-target_include_directories(
-  digestpp::digestpp
-  INTERFACE
-    ${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src)
+if(NOT TARGET digestpp::digestpp)
+  add_library(digestpp::digestpp INTERFACE IMPORTED)
+  target_include_directories(
+    digestpp::digestpp
+    INTERFACE
+      ${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src
+  )
+endif()
 
 # ==============================================================================
 

From bae78c71d09a030396e38c46188a9898291bdcba Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 15:30:13 +0200
Subject: [PATCH 51/82] Added missing find_package(res_embed) in installed
 CMake configuration

---
 third_party/cppsim/cmake/cppsimConfig.cmake.in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index eea343d2..d875276f 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -8,12 +8,13 @@ endif()
 
 # ==============================================================================
 
-find_package(pybind11 REQUIRED)
+find_package(pybind11 CONFIG REQUIRED)
+find_package(res_embed CONFIG REQUIRED)
+find_package(OpenMP REQUIRED)
 find_package(
   Python @CPPSIM_PYTHON_VERSION_MIN@
   COMPONENTS Interpreter Development.Embed
   REQUIRED)
-find_package(OpenMP REQUIRED)
 
 # ------------------------------------------------------------------------------
 

From bd73c0ee3d0d346d683f578c79cb1ad4df5cbe8d Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 15:47:11 +0200
Subject: [PATCH 52/82] Fix issues with kernelgen() macro when installed

---
 third_party/cppsim/CMakeLists.txt             | 33 ++++-------------
 .../cppsim/cmake/commands/kernelgen.cmake     | 35 +++++++++++++++++++
 .../cppsim/cmake/cppsimConfig.cmake.in        |  4 +++
 3 files changed, 46 insertions(+), 26 deletions(-)
 create mode 100644 third_party/cppsim/cmake/commands/kernelgen.cmake

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 2e4b3cdb..63ce1f26 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -5,6 +5,8 @@ project(
   VERSION 1.0.0
   LANGUAGES C CXX)
 
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/commands)
+
 # Minimum required Python version (used both in this file and in the installed
 # CMake configuration)
 set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
@@ -127,6 +129,9 @@ install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake
               ${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
         DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
+install(DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/commands
+        DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+
 file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h
      ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE)
 install(FILES ${_headers} DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
@@ -162,32 +167,6 @@ target_link_libraries(
 pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
 
-macro(kernelgen)
-  set(oneValueArgs NQUBITS VARIANT TARGET)
-  cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-
-  set(NQUBITS ${KERNELGEN_NQUBITS})
-  set(VARIANT ${KERNELGEN_VARIANT})
-  set(KERNELGEN "${CMAKE_CURRENT_SOURCE_DIR}/include/${VARIANT}/kernelgen.py")
-  set(KERNEL_PATH
-      "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
-
-  # Call generator.
-  add_custom_command(
-    OUTPUT ${KERNEL_PATH}
-    COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
-            --combinations=True
-    COMMENT "Generating kernel for ${NQUBITS} qubits"
-    DEPENDS ${KERNELGEN})
-  set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
-
-  # Append the generated file to the target sources.
-  target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
-  target_include_directories(${KERNELGEN_TARGET}
-                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
-endmacro()
-
 # ==============================================================================
 
 install(
@@ -207,6 +186,8 @@ install(
 # ==============================================================================
 
 if(BUILD_TESTING)
+  include(kernelgen)
+
   add_executable(test_nointrin "src/test/test_nointrin.cpp")
   set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS
                                                                  OFF)
diff --git a/third_party/cppsim/cmake/commands/kernelgen.cmake b/third_party/cppsim/cmake/commands/kernelgen.cmake
new file mode 100644
index 00000000..7543f007
--- /dev/null
+++ b/third_party/cppsim/cmake/commands/kernelgen.cmake
@@ -0,0 +1,35 @@
+# ~~~
+# Generate some kernel functions
+#
+# kernelgen([NQUBITS <n_qubits>]
+#           [VARIANT <intrin>|<nointrin>]
+#           [TARGET <target-name>]
+# )
+# ~~~
+macro(kernelgen)
+  set(oneValueArgs NQUBITS VARIANT TARGET)
+  cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  set(NQUBITS ${KERNELGEN_NQUBITS})
+  set(VARIANT ${KERNELGEN_VARIANT})
+  if(NOT DEFINED CPPSIM_INCLUDE_DIR)
+    set(CPPSIM_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  endif()
+  set(KERNELGEN "${CPPSIM_INCLUDE_DIR}/${VARIANT}/kernelgen.py")
+  set(KERNEL_PATH
+      "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
+
+  # Call generator.
+  add_custom_command(
+    OUTPUT ${KERNEL_PATH}
+    COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
+            --combinations=True
+    COMMENT "Generating kernel for ${NQUBITS} qubits"
+    DEPENDS ${KERNELGEN})
+  set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
+
+  # Append the generated file to the target sources.
+  target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
+  target_include_directories(${KERNELGEN_TARGET}
+                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+endmacro()
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index d875276f..e3e82530 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -29,6 +29,10 @@ endif()
 
 # ==============================================================================
 
+set(CPPSIM_INCLUDE_DIR ${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@)
+
 include(${CMAKE_CURRENT_LIST_DIR}/cppsimTargets.cmake)
 
+include(${CMAKE_CURRENT_LIST_DIR}/commands/kernelgen.cmake)
+
 # ==============================================================================

From 7754828100e31606b075cf3ae21e7f3fd4477efa Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 15:50:35 +0200
Subject: [PATCH 53/82] Fix potential issues with paths containing spaces in
 installed configuration

---
 third_party/cppsim/cmake/cppsimConfig.cmake.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index e3e82530..1ee70c15 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -23,13 +23,13 @@ if(NOT TARGET digestpp::digestpp)
   target_include_directories(
     digestpp::digestpp
     INTERFACE
-      ${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src
+      "${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src"
   )
 endif()
 
 # ==============================================================================
 
-set(CPPSIM_INCLUDE_DIR ${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@)
+set(CPPSIM_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cppsimTargets.cmake)
 

From 595d2512909e51c951d69f0649dfaad83f69fa7a Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 16:02:34 +0200
Subject: [PATCH 54/82] Fix issues with cppsim CMake installation config due to
 PACKAGE_PREFIX_DIR

---
 third_party/cppsim/cmake/cppsimConfig.cmake.in | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index 1ee70c15..e59758ad 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -1,5 +1,10 @@
 @PACKAGE_INIT@
 
+# ------------------------------------------------------------------------------
+
+# NB: PACKAGE_PREFIX_DIR might get overwritten by the find_package() calls below
+set(CPPSIM_PREFIX_DIR "${PACKAGE_PREFIX_DIR}")
+
 if(TARGET cppsim::cppsim)
   # Protect against double definitions due to previous call or
   # add_subdirectory()
@@ -23,16 +28,22 @@ if(NOT TARGET digestpp::digestpp)
   target_include_directories(
     digestpp::digestpp
     INTERFACE
-      "${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src"
+      "${CPPSIM_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src"
   )
 endif()
 
 # ==============================================================================
 
-set(CPPSIM_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@")
+set(CPPSIM_INCLUDE_DIR
+    "${CPPSIM_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@"
+    CACHE FILEPATH "Path to include files for cppsim" FORCE)
 
 include(${CMAKE_CURRENT_LIST_DIR}/cppsimTargets.cmake)
 
 include(${CMAKE_CURRENT_LIST_DIR}/commands/kernelgen.cmake)
 
 # ==============================================================================
+
+unset(CPPSIM_PREFIX_DIR)
+
+# ==============================================================================

From 81a3177fd4902ff12492fb1d6a499ad08f511e30 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Fri, 21 Oct 2022 16:03:14 +0200
Subject: [PATCH 55/82] Fix error in setting target dependencies for
 test_nointrin

---
 third_party/cppsim/CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 63ce1f26..7e81c664 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -193,8 +193,7 @@ if(BUILD_TESTING)
                                                                  OFF)
   target_include_directories(test_nointrin
                              PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-  target_include_directories(test_nointrin PRIVATE Eigen3::Eigen)
-  target_link_libraries(test_nointrin PRIVATE gtest kernelgen)
+  target_link_libraries(test_nointrin PRIVATE gtest kernelgen Eigen3::Eigen)
   kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
   kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
   kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)

From b4361190dacbc1cf4a7d6abca13d857db3b7af6b Mon Sep 17 00:00:00 2001
From: Nguyen Damien <ngn.damien@gmail.com>
Date: Sat, 22 Oct 2022 09:40:40 +0200
Subject: [PATCH 56/82] Add kernelgen path check and turn kernelgen into
 function

Add a check to make sure the kernelgen.py script exists based on
the input to the kernelgen() function.

Also make kernelgen() a function instead of a macro and add a
`COMBINATIONS` argument.
---
 .../cppsim/cmake/commands/kernelgen.cmake     | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/third_party/cppsim/cmake/commands/kernelgen.cmake b/third_party/cppsim/cmake/commands/kernelgen.cmake
index 7543f007..00e40d8c 100644
--- a/third_party/cppsim/cmake/commands/kernelgen.cmake
+++ b/third_party/cppsim/cmake/commands/kernelgen.cmake
@@ -1,12 +1,14 @@
 # ~~~
 # Generate some kernel functions
 #
-# kernelgen([NQUBITS <n_qubits>]
+# kernelgen([COMBINATIONS]
+#           [NQUBITS <n_qubits>]
 #           [VARIANT <intrin>|<nointrin>]
 #           [TARGET <target-name>]
 # )
 # ~~~
-macro(kernelgen)
+function(kernelgen)
+  set(options COMBINATIONS)
   set(oneValueArgs NQUBITS VARIANT TARGET)
   cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -18,12 +20,21 @@ macro(kernelgen)
   set(KERNELGEN "${CPPSIM_INCLUDE_DIR}/${VARIANT}/kernelgen.py")
   set(KERNEL_PATH
       "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
+      
+  if(NOT EXISTS "${KERNELGEN}")
+    message(FATAL_ERROR "Cannot locate kernelgen Python script: ${KERNELGEN}")
+  endif()
+  
+  set(_args)
+  if(KERNELGEN_COMBINATIONS)
+    list(APPEND _args --combinations=True)
+  endif()
 
   # Call generator.
   add_custom_command(
     OUTPUT ${KERNEL_PATH}
     COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
-            --combinations=True
+            ${_args}
     COMMENT "Generating kernel for ${NQUBITS} qubits"
     DEPENDS ${KERNELGEN})
   set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
@@ -32,4 +43,4 @@ macro(kernelgen)
   target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
   target_include_directories(${KERNELGEN_TARGET}
                              PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
-endmacro()
+endfunction()

From bd79df808f7893ffec27569e0ee92bf65c85836b Mon Sep 17 00:00:00 2001
From: Nguyen Damien <ngn.damien@gmail.com>
Date: Sat, 22 Oct 2022 09:45:06 +0200
Subject: [PATCH 57/82] Fix changes missing from last commit

---
 third_party/cppsim/CMakeLists.txt                 | 10 +++++-----
 third_party/cppsim/cmake/commands/kernelgen.cmake |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 7e81c664..623b4e0b 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -194,11 +194,11 @@ if(BUILD_TESTING)
   target_include_directories(test_nointrin
                              PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_nointrin PRIVATE gtest kernelgen Eigen3::Eigen)
-  kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin)
-  kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin)
-  kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin)
-  kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin)
-  kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin)
+  kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin COMBINATIONS)
+  kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin COMBINATIONS)
+  kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin COMBINATIONS)
+  kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin COMBINATIONS)
+  kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin COMBINATIONS)
 
   add_executable(test_popcount "src/test/test_popcount.cpp")
   set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS
diff --git a/third_party/cppsim/cmake/commands/kernelgen.cmake b/third_party/cppsim/cmake/commands/kernelgen.cmake
index 00e40d8c..e03ea996 100644
--- a/third_party/cppsim/cmake/commands/kernelgen.cmake
+++ b/third_party/cppsim/cmake/commands/kernelgen.cmake
@@ -21,8 +21,8 @@ function(kernelgen)
   set(KERNEL_PATH
       "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
       
-  if(NOT EXISTS "${KERNELGEN}")
-    message(FATAL_ERROR "Cannot locate kernelgen Python script: ${KERNELGEN}")
+  if(NOT EXISTS "${KERNELGEN_PY}")
+    message(FATAL_ERROR "Cannot locate kernelgen Python script: ${KERNELGEN_PY}")
   endif()
   
   set(_args)
@@ -33,10 +33,10 @@ function(kernelgen)
   # Call generator.
   add_custom_command(
     OUTPUT ${KERNEL_PATH}
-    COMMAND ${Python_EXECUTABLE} ${KERNELGEN} ${NQUBITS} ${KERNEL_PATH}
+    COMMAND ${Python_EXECUTABLE} ${KERNELGEN_PY} ${NQUBITS} ${KERNEL_PATH}
             ${_args}
     COMMENT "Generating kernel for ${NQUBITS} qubits"
-    DEPENDS ${KERNELGEN})
+    DEPENDS ${KERNELGEN_PY})
   set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
 
   # Append the generated file to the target sources.

From c1f524928efdd997dcdc764fb404be8d58ba7cdc Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 11:23:30 +0200
Subject: [PATCH 58/82] Add <array> and <functional> to kernels.hpp

---
 third_party/cppsim/include/nointrin/kernels.hpp | 6 ++++--
 third_party/cppsim/src/_cppsim.cpp              | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/third_party/cppsim/include/nointrin/kernels.hpp b/third_party/cppsim/include/nointrin/kernels.hpp
index d6608a72..bf0bd479 100644
--- a/third_party/cppsim/include/nointrin/kernels.hpp
+++ b/third_party/cppsim/include/nointrin/kernels.hpp
@@ -12,9 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <cstddef> // size_t
-#include <complex>
 #include <algorithm>
+#include <array>
+#include <complex>
+#include <cstdint> // size_t
+#include <functional>
 
 #define add(a, b) (a + b)
 #define mul(a, b) (a * b)
diff --git a/third_party/cppsim/src/_cppsim.cpp b/third_party/cppsim/src/_cppsim.cpp
index a8122155..90119e47 100644
--- a/third_party/cppsim/src/_cppsim.cpp
+++ b/third_party/cppsim/src/_cppsim.cpp
@@ -18,8 +18,9 @@
 #include <vector>
 #include <complex>
 #include <iostream>
+
 #if defined(_OPENMP)
-#include <omp.h>
+#  include <omp.h>
 #endif
 #include "simulator.hpp"
 

From f4fdf202aa2c38d4e0d609ea3e11fdfdebfc00c7 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 14:52:37 +0200
Subject: [PATCH 59/82] Improve CXX standard handling for all targets + add
 .cmake-format.yaml file

---
 third_party/cppsim/.cmake-format.yaml         |  44 +++++++
 third_party/cppsim/CMakeLists.txt             | 122 +++++++++---------
 .../cppsim/cmake/commands/kernelgen.cmake     |  14 +-
 .../cppsim/cmake/cppsimConfig.cmake.in        |  10 +-
 4 files changed, 111 insertions(+), 79 deletions(-)
 create mode 100644 third_party/cppsim/.cmake-format.yaml

diff --git a/third_party/cppsim/.cmake-format.yaml b/third_party/cppsim/.cmake-format.yaml
new file mode 100644
index 00000000..e38933b6
--- /dev/null
+++ b/third_party/cppsim/.cmake-format.yaml
@@ -0,0 +1,44 @@
+---
+
+markup:
+  first_comment_is_literal: true
+format:
+  disable: false
+  line_width: 120
+  tab_size: 2
+  use_tabchars: false
+  max_subgroups_hwrap: 2
+  max_pargs_hwrap: 6
+  max_rows_cmdline: 2
+  separate_ctrl_name_with_space: false
+  separate_fn_name_with_space: false
+  dangle_parens: false
+  dangle_align: prefix
+  min_prefix_chars: 4
+  max_prefix_chars: 10
+  max_lines_hwrap: 2
+  line_ending: unix
+  command_case: canonical
+  keyword_case: unchanged
+  enable_sort: true
+  autosort: false
+  require_valid_layout: false
+parse:
+  additional_commands:
+    res_embed:
+      pargs:
+        flags:
+          - KEYWORD
+      kwargs:
+        TARGET: 1
+        NAME: 1
+        PATH: 1
+        DEPENDS: 1+
+    kernelgen:
+      pargs:
+        flags:
+          - COMBINATIONS
+      kwargs:
+        TARGET: 1
+        NQUBITS: 1
+        VARIANT: 1
diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 623b4e0b..61b85607 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -7,8 +7,7 @@ project(
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/commands)
 
-# Minimum required Python version (used both in this file and in the installed
-# CMake configuration)
+# Minimum required Python version (used both in this file and in the installed CMake configuration)
 set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
 
 option(BUILD_TESTING "Build the test suite?" OFF)
@@ -55,13 +54,11 @@ if(BUILD_TESTING)
 endif()
 
 # ------------------------------------------------------------------------------
-# In the case of digestpp, the repository only consists of a list of
-# files/directories that we need both our build process to find as well as
-# future users of an installed copy of cppsim.
+# In the case of digestpp, the repository only consists of a list of files/directories that we need both our build
+# process to find as well as future users of an installed copy of cppsim.
 #
-# We therefore provide a target that we use for our current build and also
-# install its files into a sub-directory of the installation prefix. We then
-# define the digestpp::digestpp target directly within cppsimConfig.cmake
+# We therefore provide a target that we use for our current build and also install its files into a sub-directory of the
+# installation prefix. We then define the digestpp::digestpp target directly within cppsimConfig.cmake
 
 FetchContent_Declare(
   digestpp
@@ -70,8 +67,7 @@ FetchContent_Declare(
 FetchContent_MakeAvailable(digestpp)
 
 add_library(digestpp::digestpp INTERFACE IMPORTED)
-target_include_directories(digestpp::digestpp
-                           INTERFACE $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>)
+target_include_directories(digestpp::digestpp INTERFACE $<BUILD_INTERFACE:${digestpp_SOURCE_DIR}>)
 
 install(
   DIRECTORY ${digestpp_SOURCE_DIR}
@@ -104,8 +100,7 @@ endif()
 
 # ------------------------------------------------------------------------------
 
-find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter
-                                                            Development.Embed)
+find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Development.Embed)
 
 # ------------------------------------------------------------------------------
 
@@ -117,52 +112,37 @@ include(CMakePackageConfigHelpers)
 
 set(_namespace cppsim::)
 
-configure_package_config_file(
-  ${CMAKE_CURRENT_LIST_DIR}/cmake/cppsimConfig.cmake.in
-  ${PROJECT_BINARY_DIR}/cppsimConfig.cmake
-  INSTALL_DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/cppsimConfig.cmake.in
+                              ${PROJECT_BINARY_DIR}/cppsimConfig.cmake INSTALL_DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
-write_basic_package_version_file(${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
-                                 COMPATIBILITY SameMajorVersion)
+write_basic_package_version_file(${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake COMPATIBILITY SameMajorVersion)
 
-install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake
-              ${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
+install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake ${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
         DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/commands
-        DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+install(DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/commands DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
-file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h
-     ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE)
+file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE)
 install(FILES ${_headers} DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
-install(
-  DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/include/cpu
-            ${CMAKE_CURRENT_LIST_DIR}/include/gpu
-            ${CMAKE_CURRENT_LIST_DIR}/include/intrin
-            ${CMAKE_CURRENT_LIST_DIR}/include/nointrin
-  DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
+install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/include/cpu ${CMAKE_CURRENT_LIST_DIR}/include/gpu
+                  ${CMAKE_CURRENT_LIST_DIR}/include/intrin ${CMAKE_CURRENT_LIST_DIR}/include/nointrin
+        DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
 
 # ==============================================================================
 
-add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp"
-                             "src/tempfile.cpp")
-set_target_properties(kernelgen PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS OFF)
+add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp" "src/tempfile.cpp")
+target_compile_features(kernelgen PUBLIC cxx_std_17)
 set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_include_directories(
-  kernelgen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-                   $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}>)
+target_include_directories(kernelgen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+                                            $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}>)
 res_embed(
-  TARGET
-  kernelgen
-  NAME
-  "nointrin"
-  PATH
-  "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py"
+  TARGET kernelgen
+  NAME "nointrin"
+  PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py"
   KEYWORD)
 
-target_link_libraries(
-  kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python
-                   OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})
+target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python OpenMP::OpenMP_CXX
+                                       ${CMAKE_DL_LIBS})
 
 pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
@@ -189,31 +169,45 @@ if(BUILD_TESTING)
   include(kernelgen)
 
   add_executable(test_nointrin "src/test/test_nointrin.cpp")
-  set_target_properties(test_nointrin PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS
-                                                                 OFF)
-  target_include_directories(test_nointrin
-                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_compile_features(test_nointrin PRIVATE cxx_std_17)
+  target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_nointrin PRIVATE gtest kernelgen Eigen3::Eigen)
-  kernelgen(TARGET test_nointrin NQUBITS 1 VARIANT nointrin COMBINATIONS)
-  kernelgen(TARGET test_nointrin NQUBITS 2 VARIANT nointrin COMBINATIONS)
-  kernelgen(TARGET test_nointrin NQUBITS 3 VARIANT nointrin COMBINATIONS)
-  kernelgen(TARGET test_nointrin NQUBITS 4 VARIANT nointrin COMBINATIONS)
-  kernelgen(TARGET test_nointrin NQUBITS 5 VARIANT nointrin COMBINATIONS)
+
+  kernelgen(
+    TARGET test_nointrin
+    NQUBITS 1
+    VARIANT nointrin
+    COMBINATIONS)
+  kernelgen(
+    TARGET test_nointrin
+    NQUBITS 2
+    VARIANT nointrin
+    COMBINATIONS)
+  kernelgen(
+    TARGET test_nointrin
+    NQUBITS 3
+    VARIANT nointrin
+    COMBINATIONS)
+  kernelgen(
+    TARGET test_nointrin
+    NQUBITS 4
+    VARIANT nointrin
+    COMBINATIONS)
+  kernelgen(
+    TARGET test_nointrin
+    NQUBITS 5
+    VARIANT nointrin
+    COMBINATIONS)
 
   add_executable(test_popcount "src/test/test_popcount.cpp")
-  set_target_properties(test_popcount PROPERTIES CXX_STANDARD 17 CXX_EXTENSIONS
-                                                                 OFF)
-  target_include_directories(test_popcount
-                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_compile_features(test_popcount PRIVATE cxx_std_17)
+  target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_popcount PRIVATE Eigen3::Eigen gtest)
 
   add_executable(test_combinations "src/test/test_combinations.cpp")
-  set_target_properties(test_combinations PROPERTIES CXX_STANDARD 17
-                                                     CXX_EXTENSIONS OFF)
-  target_include_directories(test_combinations
-                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-  target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen
-                                                  OpenMP::OpenMP_CXX)
+  target_compile_features(test_combinations PRIVATE cxx_std_17)
+  target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen OpenMP::OpenMP_CXX)
 
   add_executable(benchmark "src/benchmark/benchmark.cpp")
   target_link_libraries(benchmark PRIVATE gtest kernelgen)
diff --git a/third_party/cppsim/cmake/commands/kernelgen.cmake b/third_party/cppsim/cmake/commands/kernelgen.cmake
index e03ea996..a20a89e0 100644
--- a/third_party/cppsim/cmake/commands/kernelgen.cmake
+++ b/third_party/cppsim/cmake/commands/kernelgen.cmake
@@ -18,13 +18,12 @@ function(kernelgen)
     set(CPPSIM_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
   endif()
   set(KERNELGEN "${CPPSIM_INCLUDE_DIR}/${VARIANT}/kernelgen.py")
-  set(KERNEL_PATH
-      "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
-      
+  set(KERNEL_PATH "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
+
   if(NOT EXISTS "${KERNELGEN_PY}")
     message(FATAL_ERROR "Cannot locate kernelgen Python script: ${KERNELGEN_PY}")
   endif()
-  
+
   set(_args)
   if(KERNELGEN_COMBINATIONS)
     list(APPEND _args --combinations=True)
@@ -33,14 +32,13 @@ function(kernelgen)
   # Call generator.
   add_custom_command(
     OUTPUT ${KERNEL_PATH}
-    COMMAND ${Python_EXECUTABLE} ${KERNELGEN_PY} ${NQUBITS} ${KERNEL_PATH}
-            ${_args}
+    COMMAND ${Python_EXECUTABLE} ${KERNELGEN_PY} ${NQUBITS} ${KERNEL_PATH} ${_args}
     COMMENT "Generating kernel for ${NQUBITS} qubits"
     DEPENDS ${KERNELGEN_PY})
   set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
 
   # Append the generated file to the target sources.
   target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
-  target_include_directories(${KERNELGEN_TARGET}
-                             PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+  target_include_directories(${KERNELGEN_TARGET} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+  target_compile_features(${KERNELGEN_TARGET} PRIVATE cxx_std_17)
 endfunction()
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index e59758ad..cf443c1d 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -6,8 +6,7 @@
 set(CPPSIM_PREFIX_DIR "${PACKAGE_PREFIX_DIR}")
 
 if(TARGET cppsim::cppsim)
-  # Protect against double definitions due to previous call or
-  # add_subdirectory()
+  # Protect against double definitions due to previous call or add_subdirectory()
   return()
 endif()
 
@@ -25,11 +24,8 @@ find_package(
 
 if(NOT TARGET digestpp::digestpp)
   add_library(digestpp::digestpp INTERFACE IMPORTED)
-  target_include_directories(
-    digestpp::digestpp
-    INTERFACE
-      "${CPPSIM_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src"
-  )
+  target_include_directories(digestpp::digestpp
+                             INTERFACE "${CPPSIM_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src")
 endif()
 
 # ==============================================================================

From 44e10844c28b79fd1382c101ebaa2109a247fc11 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 14:54:44 +0200
Subject: [PATCH 60/82] Add compatibility with Boost::Filesystem for older
 compilers

---
 third_party/cppsim/CMakeLists.txt             | 13 ++++++-
 .../cmake/commands/check_code_compiles.cmake  | 36 +++++++++++++++++++
 .../cmake/compiler_has_std_filesystem.cmake   | 24 +++++++++++++
 .../cppsim/cmake/cppsimConfig.cmake.in        | 10 ++++++
 third_party/cppsim/src/compiler.cpp           | 11 ++++--
 5 files changed, 91 insertions(+), 3 deletions(-)
 create mode 100644 third_party/cppsim/cmake/commands/check_code_compiles.cmake
 create mode 100644 third_party/cppsim/cmake/compiler_has_std_filesystem.cmake

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 61b85607..32d0ceb1 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -28,6 +28,10 @@ set(CPPSIM_INSTALL_3RDPARTYDIR "${CPPSIM_INSTALL_LIBDIR}/third_party")
 
 # ==============================================================================
 
+include("${CMAKE_CURRENT_LIST_DIR}/cmake/compiler_has_std_filesystem.cmake")
+
+# ==============================================================================
+
 include(FetchContent)
 set(FETCHCONTENT_QUIET OFF)
 
@@ -106,6 +110,12 @@ find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Developm
 
 find_package(OpenMP REQUIRED)
 
+set(filesystem_LIBS)
+if(NOT CPPSIM_HAS_STD_FILESYSTEM)
+  find_package(Boost REQUIRED COMPONENTS filesystem)
+  set(filesystem_LIBS Boost::filesystem)
+endif()
+
 # ==============================================================================
 
 include(CMakePackageConfigHelpers)
@@ -121,6 +131,7 @@ install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake ${PROJECT_BINARY_DIR}/cpp
         DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/commands DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+install(FILES ${PROJECT_SOURCE_DIR}/cmake/compiler_has_std_filesystem.cmake DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
 file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE)
 install(FILES ${_headers} DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
@@ -142,7 +153,7 @@ res_embed(
   KEYWORD)
 
 target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python OpenMP::OpenMP_CXX
-                                       ${CMAKE_DL_LIBS})
+                                       ${CMAKE_DL_LIBS} ${filesystem_LIBS})
 
 pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
diff --git a/third_party/cppsim/cmake/commands/check_code_compiles.cmake b/third_party/cppsim/cmake/commands/check_code_compiles.cmake
new file mode 100644
index 00000000..f414b28d
--- /dev/null
+++ b/third_party/cppsim/cmake/commands/check_code_compiles.cmake
@@ -0,0 +1,36 @@
+include(CheckCXXSourceCompiles)
+
+# ~~~
+# Check whether some C++ code compiles
+#
+# check_code_compiles(<cmake_identifier> <out-var> <lang_standard> <code> [<lang>, ...])
+#
+# <cmake_identifier>  cache entry used by check_cxx_source_compiles() for the raw result
+# <out-var>           set to TRUE/FALSE in the caller's scope and as an INTERNAL cache entry
+# <lang_standard>     compile feature such as cxx_std_17; its digits become CMAKE_CXX_STANDARD
+# <code>              source text handed to check_cxx_source_compiles()
+# [<lang>, ...]       accepted for compatibility but ignored: only CXX is checked
+# ~~~
+function(check_code_compiles cmake_identifier var lang_standard code)
+  # Set inside the function scope, so the caller's own standard settings are left untouched.
+  if(lang_standard MATCHES "std_([0-9]+)")
+    set(CMAKE_CXX_STANDARD ${CMAKE_MATCH_1})
+  endif()
+  set(CMAKE_CXX_EXTENSIONS OFF)
+
+  check_cxx_source_compiles("${code}" "${cmake_identifier}")
+
+  # Normalise the raw result so the caller and the cache both see the same boolean.
+  if(${cmake_identifier})
+    set(_result TRUE)
+  else()
+    set(_result FALSE)
+  endif()
+
+  set(${var}
+      ${_result}
+      CACHE INTERNAL "${cmake_identifier}")
+  set(${var}
+      ${_result}
+      PARENT_SCOPE)
+endfunction()
diff --git a/third_party/cppsim/cmake/compiler_has_std_filesystem.cmake b/third_party/cppsim/cmake/compiler_has_std_filesystem.cmake
new file mode 100644
index 00000000..2bb12b5a
--- /dev/null
+++ b/third_party/cppsim/cmake/compiler_has_std_filesystem.cmake
@@ -0,0 +1,24 @@
+include(check_code_compiles)
+
+# ==============================================================================
+# CPPSIM_HAS_STD_FILESYSTEM: true when the library reports __cpp_lib_filesystem >= 201703.
+check_code_compiles(
+  compiler_has_std_filesystem
+  CPPSIM_HAS_STD_FILESYSTEM
+  cxx_std_17
+  [[
+#ifdef __has_include
+# if __has_include(<version>)
+#   include <version>
+# endif
+#endif
+int main() {
+#if __cpp_lib_filesystem >= 201703
+    return 0;
+#else
+#error std::filesystem not supported
+#endif
+}
+]])
+
+# ==============================================================================
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index cf443c1d..7170cca3 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -12,6 +12,8 @@ endif()
 
 # ==============================================================================
 
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/commands)
+
 find_package(pybind11 CONFIG REQUIRED)
 find_package(res_embed CONFIG REQUIRED)
 find_package(OpenMP REQUIRED)
@@ -20,6 +22,14 @@ find_package(
   COMPONENTS Interpreter Development.Embed
   REQUIRED)
 
+include(${CMAKE_CURRENT_LIST_DIR}/compiler_has_std_filesystem.cmake)
+
+if(NOT CPPSIM_HAS_STD_FILESYSTEM)
+  find_package(Boost REQUIRED COMPONENTS filesystem)
+endif()
+
+list(POP_BACK CMAKE_MODULE_PATH)
+
 # ------------------------------------------------------------------------------
 
 if(NOT TARGET digestpp::digestpp)
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index 584021e7..663b91bc 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -4,15 +4,22 @@
 
 #include <cstdlib>
 #include <dlfcn.h>
+#if __has_include(<version>)
+#  include <version>
+#endif
+#if __has_include(<filesystem>) && __cpp_lib_filesystem >= 201703
 #include <filesystem>
+namespace fs = std::filesystem;
+#else
+#include <boost/filesystem.hpp>
+namespace fs = boost::filesystem;
+#endif
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <sstream>
 #include <streambuf>
 
-namespace fs = std::filesystem;
-
 Compiler::Compiler() { }
 
 class Signature

From 522654a805c87df7cd7e2637a997f5c37c511182 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 15:59:54 +0200
Subject: [PATCH 61/82] Fix compilation error on MSVC due to OpenMP

---
 third_party/cppsim/include/cppsim_omp.hpp     | 18 +++++++++
 third_party/cppsim/include/intrin/kernel1.hpp |  8 ++--
 third_party/cppsim/include/intrin/kernel2.hpp | 12 +++---
 third_party/cppsim/include/intrin/kernel3.hpp | 16 ++++----
 third_party/cppsim/include/intrin/kernel4.hpp | 20 +++++-----
 third_party/cppsim/include/intrin/kernel5.hpp | 24 ++++++------
 third_party/cppsim/include/intrin/kernels.hpp |  2 +
 .../cppsim/include/nointrin/kernel1.hpp       |  8 ++--
 .../cppsim/include/nointrin/kernel2.hpp       | 12 +++---
 .../cppsim/include/nointrin/kernel3.hpp       | 16 ++++----
 .../cppsim/include/nointrin/kernel4.hpp       | 20 +++++-----
 .../cppsim/include/nointrin/kernel5.hpp       | 24 ++++++------
 .../cppsim/include/nointrin/kernels.hpp       |  2 +
 third_party/cppsim/include/simulator.hpp      | 39 ++++++++++---------
 third_party/cppsim/src/_cppsim.cpp            |  4 +-
 15 files changed, 123 insertions(+), 102 deletions(-)
 create mode 100644 third_party/cppsim/include/cppsim_omp.hpp

diff --git a/third_party/cppsim/include/cppsim_omp.hpp b/third_party/cppsim/include/cppsim_omp.hpp
new file mode 100644
index 00000000..383290f1
--- /dev/null
+++ b/third_party/cppsim/include/cppsim_omp.hpp
@@ -0,0 +1,18 @@
+#ifndef CPPSIM_OMP_HPP
+#define CPPSIM_OMP_HPP
+
+#include <cstdint>
+
+#if defined(_OPENMP)
+#  include <omp.h>
+#endif
+
+namespace omp {
+#ifdef _MSC_VER
+using idx_t = std::int64_t;   // MSVC's OpenMP rejects unsigned "omp for" loop indices (C3016)
+#else
+using idx_t = std::uint64_t;  // unsigned elsewhere, like the std::size_t indices it replaces
+#endif  // _MSC_VER
+}  // namespace omp
+
+#endif /* CPPSIM_OMP_HPP */
diff --git a/third_party/cppsim/include/intrin/kernel1.hpp b/third_party/cppsim/include/intrin/kernel1.hpp
index 119b5350..39c519b6 100644
--- a/third_party/cppsim/include/intrin/kernel1.hpp
+++ b/third_party/cppsim/include/intrin/kernel1.hpp
@@ -45,16 +45,16 @@ void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; ++i1){
                 kernel_core(psi, i0 + i1, d0, mm, mmt);
             }
         }
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; ++i1){
                 if (((i0 + i1)&ctrlmask) == ctrlmask)
                     kernel_core(psi, i0 + i1, d0, mm, mmt);
             }
diff --git a/third_party/cppsim/include/intrin/kernel2.hpp b/third_party/cppsim/include/intrin/kernel2.hpp
index b4183739..58cb09cf 100644
--- a/third_party/cppsim/include/intrin/kernel2.hpp
+++ b/third_party/cppsim/include/intrin/kernel2.hpp
@@ -48,9 +48,9 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
                     kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
                 }
             }
@@ -58,9 +58,9 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
                     if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
                         kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
                 }
diff --git a/third_party/cppsim/include/intrin/kernel3.hpp b/third_party/cppsim/include/intrin/kernel3.hpp
index a4cb3c55..bb248337 100644
--- a/third_party/cppsim/include/intrin/kernel3.hpp
+++ b/third_party/cppsim/include/intrin/kernel3.hpp
@@ -62,10 +62,10 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){
                         kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
                     }
                 }
@@ -74,10 +74,10 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){
                         if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
                             kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
                     }
diff --git a/third_party/cppsim/include/intrin/kernel4.hpp b/third_party/cppsim/include/intrin/kernel4.hpp
index 94ef97a1..1a7e516a 100644
--- a/third_party/cppsim/include/intrin/kernel4.hpp
+++ b/third_party/cppsim/include/intrin/kernel4.hpp
@@ -98,11 +98,11 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){
                             kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
                         }
                     }
@@ -112,11 +112,11 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){
                             if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
                         }
diff --git a/third_party/cppsim/include/intrin/kernel5.hpp b/third_party/cppsim/include/intrin/kernel5.hpp
index f1608e1f..25002b26 100644
--- a/third_party/cppsim/include/intrin/kernel5.hpp
+++ b/third_party/cppsim/include/intrin/kernel5.hpp
@@ -218,12 +218,12 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
-                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
                             }
                         }
@@ -234,12 +234,12 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
-                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){
                                 if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
                                     kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
                             }
diff --git a/third_party/cppsim/include/intrin/kernels.hpp b/third_party/cppsim/include/intrin/kernels.hpp
index f592142d..0b4f8540 100644
--- a/third_party/cppsim/include/intrin/kernels.hpp
+++ b/third_party/cppsim/include/intrin/kernels.hpp
@@ -21,6 +21,8 @@
 #include "cintrin.hpp"
 #include "alignedallocator.hpp"
 
+#include "cppsim_omp.hpp"
+
 #define LOOP_COLLAPSE1 2
 #define LOOP_COLLAPSE2 3
 #define LOOP_COLLAPSE3 4
diff --git a/third_party/cppsim/include/nointrin/kernel1.hpp b/third_party/cppsim/include/nointrin/kernel1.hpp
index a0a5952c..173ab44c 100644
--- a/third_party/cppsim/include/nointrin/kernel1.hpp
+++ b/third_party/cppsim/include/nointrin/kernel1.hpp
@@ -40,16 +40,16 @@ void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; ++i1){
                 kernel_core(psi, i0 + i1, d0, m);
             }
         }
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; ++i1){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; ++i1){
                 if (((i0 + i1)&ctrlmask) == ctrlmask)
                     kernel_core(psi, i0 + i1, d0, m);
             }
diff --git a/third_party/cppsim/include/nointrin/kernel2.hpp b/third_party/cppsim/include/nointrin/kernel2.hpp
index eb8a5521..dfd6e067 100644
--- a/third_party/cppsim/include/nointrin/kernel2.hpp
+++ b/third_party/cppsim/include/nointrin/kernel2.hpp
@@ -44,9 +44,9 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
                     kernel_core(psi, i0 + i1 + i2, d0, d1, m);
                 }
             }
@@ -54,9 +54,9 @@ void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; ++i2){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
                     if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
                         kernel_core(psi, i0 + i1 + i2, d0, d1, m);
                 }
diff --git a/third_party/cppsim/include/nointrin/kernel3.hpp b/third_party/cppsim/include/nointrin/kernel3.hpp
index aad532ea..4b767e96 100644
--- a/third_party/cppsim/include/nointrin/kernel3.hpp
+++ b/third_party/cppsim/include/nointrin/kernel3.hpp
@@ -65,10 +65,10 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){
                         kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m);
                     }
                 }
@@ -77,10 +77,10 @@ void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::s
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; ++i3){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){
                         if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
                             kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m);
                     }
diff --git a/third_party/cppsim/include/nointrin/kernel4.hpp b/third_party/cppsim/include/nointrin/kernel4.hpp
index 176cf69d..fe31469b 100644
--- a/third_party/cppsim/include/nointrin/kernel4.hpp
+++ b/third_party/cppsim/include/nointrin/kernel4.hpp
@@ -125,11 +125,11 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){
                             kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m);
                         }
                     }
@@ -139,11 +139,11 @@ void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M co
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; ++i4){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){
                             if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m);
                         }
diff --git a/third_party/cppsim/include/nointrin/kernel5.hpp b/third_party/cppsim/include/nointrin/kernel5.hpp
index dff08c2f..7a8d0dc5 100644
--- a/third_party/cppsim/include/nointrin/kernel5.hpp
+++ b/third_party/cppsim/include/nointrin/kernel5.hpp
@@ -341,12 +341,12 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
 
     if (ctrlmask == 0){
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
-                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){
                                 kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
                             }
                         }
@@ -357,12 +357,12 @@ void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsi
     }
     else{
         #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
-        for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
-            for (std::size_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
-                for (std::size_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
-                    for (std::size_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
-                        for (std::size_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
-                            for (std::size_t i5 = 0; i5 < dsorted4; ++i5){
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){
                                 if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
                                     kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m);
                             }
diff --git a/third_party/cppsim/include/nointrin/kernels.hpp b/third_party/cppsim/include/nointrin/kernels.hpp
index bf0bd479..81abe524 100644
--- a/third_party/cppsim/include/nointrin/kernels.hpp
+++ b/third_party/cppsim/include/nointrin/kernels.hpp
@@ -21,6 +21,8 @@
 #define add(a, b) (a + b)
 #define mul(a, b) (a * b)
 
+#include "cppsim_omp.hpp"
+
 #include "kernel1.hpp"
 #include "kernel2.hpp"
 #include "kernel3.hpp"
diff --git a/third_party/cppsim/include/simulator.hpp b/third_party/cppsim/include/simulator.hpp
index b50afb61..f3c9df71 100644
--- a/third_party/cppsim/include/simulator.hpp
+++ b/third_party/cppsim/include/simulator.hpp
@@ -24,6 +24,7 @@
 #include "intrin/kernels.hpp"
 #endif
 
+#include "cppsim_omp.hpp"
 #include "intrin/alignedallocator.hpp"
 #include "fusion.hpp"
 #include "kernelgen.hpp"
@@ -62,7 +63,7 @@ class Simulator{
               std::swap(newvec, tmpBuff1_);
             newvec.resize(1UL << N_);
 #pragma omp parallel for schedule(static)
-            for (std::size_t i = 0; i < newvec.size(); ++i)
+            for (omp::idx_t i = 0; i < newvec.size(); ++i)
                 newvec[i] = (i < vec_.size())?vec_[i]:0.;
             std::swap(vec_, newvec);
             // recycle large memory
@@ -99,8 +100,8 @@ class Simulator{
 
         short up = 0, down = 0;
         #pragma omp parallel for schedule(static) reduction(|:up,down)
-        for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
-            for (std::size_t j = 0; j < delta; ++j){
+        for (omp::idx_t i = 0; i < vec_.size(); i += 2*delta){
+            for (omp::idx_t j = 0; j < delta; ++j){
                 up = up | ((std::norm(vec_[i+j]) > tol)&1);
                 down = down | ((std::norm(vec_[i+j+delta]) > tol)&1);
             }
@@ -116,7 +117,7 @@ class Simulator{
 
         if (!shrink){
             #pragma omp parallel for schedule(static)
-            for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
+            for (omp::idx_t i = 0; i < vec_.size(); i += 2*delta){
                 for (std::size_t j = 0; j < delta; ++j)
                     vec_[i+j+static_cast<std::size_t>(!value)*delta] = 0.;
             }
@@ -127,7 +128,7 @@ class Simulator{
               std::swap(tmpBuff1_, newvec);
             newvec.resize((1UL << (N_-1)));
             #pragma omp parallel for schedule(static) if(0)
-            for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
+            for (omp::idx_t i = 0; i < vec_.size(); i += 2*delta)
                 std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
                             delta, &newvec[i/2]);
             std::swap(vec_, newvec);
@@ -174,7 +175,7 @@ class Simulator{
         // set bad entries to 0
         calc_type N = 0.;
         #pragma omp parallel for reduction(+:N) schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i){
+        for (omp::idx_t i = 0; i < vec_.size(); ++i){
             if ((i & mask) != val)
                 vec_[i] = 0.;
             else
@@ -183,7 +184,7 @@ class Simulator{
         // re-normalize
         N = 1./std::sqrt(N);
         #pragma omp parallel for schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i)
+        for (omp::idx_t i = 0; i < vec_.size(); ++i)
             vec_[i] *= N;
     }
 
@@ -238,7 +239,7 @@ class Simulator{
           std::swap(newvec, tmpBuff1_);
         newvec.resize(vec_.size());
 #pragma omp parallel for schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); i++)
+        for (omp::idx_t i = 0; i < vec_.size(); i++)
           newvec[i] = 0;
 
 //#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
@@ -300,7 +301,7 @@ class Simulator{
           std::swap(tmpBuff1_, current_state);
         current_state.resize(vec_.size());
 #pragma omp parallel for schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i)
+        for (omp::idx_t i = 0; i < vec_.size(); ++i)
           current_state[i] = vec_[i];
 
         for (auto const& term : td){
@@ -308,7 +309,7 @@ class Simulator{
             apply_term(term.first, ids, {});
             calc_type delta = 0.;
             #pragma omp parallel for reduction(+:delta) schedule(static)
-            for (std::size_t i = 0; i < vec_.size(); ++i){
+            for (omp::idx_t i = 0; i < vec_.size(); ++i){
                 auto const a1 = std::real(current_state[i]);
                 auto const b1 = -std::imag(current_state[i]);
                 auto const a2 = std::real(vec_[i]);
@@ -333,7 +334,7 @@ class Simulator{
         new_state.resize(vec_.size());
         current_state.resize(vec_.size());
 #pragma omp parallel for schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i){
+        for (omp::idx_t i = 0; i < vec_.size(); ++i){
           new_state[i] = 0;
           current_state[i] = vec_[i];
         }
@@ -341,7 +342,7 @@ class Simulator{
             auto const& coefficient = term.second;
             apply_term(term.first, ids, {});
             #pragma omp parallel for schedule(static)
-            for (std::size_t i = 0; i < vec_.size(); ++i){
+            for (omp::idx_t i = 0; i < vec_.size(); ++i){
                 new_state[i] += coefficient * vec_[i];
                 vec_[i] = current_state[i];
             }
@@ -363,7 +364,7 @@ class Simulator{
         }
         calc_type probability = 0.;
         #pragma omp parallel for reduction(+:probability) schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i)
+        for (omp::idx_t i = 0; i < vec_.size(); ++i)
             if ((i & mask) == bit_str)
                 probability += std::norm(vec_[i]);
         return probability;
@@ -413,14 +414,14 @@ class Simulator{
                 for (auto const& tup : td){
                     apply_term(tup.first, ids, {});
                     #pragma omp parallel for schedule(static)
-                    for (std::size_t j = 0; j < vec_.size(); ++j){
+                    for (omp::idx_t j = 0; j < vec_.size(); ++j){
                         update[j] += vec_[j] * tup.second;
                         vec_[j] = current_state[j];
                     }
                 }
                 nrm_change = 0.;
                 #pragma omp parallel for reduction(+:nrm_change) schedule(static)
-                for (std::size_t j = 0; j < vec_.size(); ++j){
+                for (omp::idx_t j = 0; j < vec_.size(); ++j){
                     update[j] *= coeff;
                     vec_[j] = update[j];
                     if ((j & ctrlmask) == ctrlmask){
@@ -431,7 +432,7 @@ class Simulator{
                 nrm_change = std::sqrt(nrm_change);
             }
             #pragma omp parallel for schedule(static)
-            for (std::size_t j = 0; j < vec_.size(); ++j){
+            for (omp::idx_t j = 0; j < vec_.size(); ++j){
                 if ((j & ctrlmask) == ctrlmask)
                     output_state[j] *= correction;
                 vec_[j] = output_state[j];
@@ -451,7 +452,7 @@ class Simulator{
         for (unsigned i = 0; i < ordering.size(); ++i)
             map_[ordering[i]] = i;
         #pragma omp parallel for schedule(static)
-        for (std::size_t i = 0; i < wavefunction.size(); ++i)
+        for (omp::idx_t i = 0; i < wavefunction.size(); ++i)
             vec_[i] = wavefunction[i];
     }
 
@@ -469,7 +470,7 @@ class Simulator{
         // set bad entries to 0 and compute probability of outcome to renormalize
         calc_type N = 0.;
         #pragma omp parallel for reduction(+:N) schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i){
+        for (omp::idx_t i = 0; i < vec_.size(); ++i){
             if ((i & mask) == val)
                 N += std::norm(vec_[i]);
         }
@@ -478,7 +479,7 @@ class Simulator{
         // re-normalize (if possible)
         N = 1./std::sqrt(N);
         #pragma omp parallel for schedule(static)
-        for (std::size_t i = 0; i < vec_.size(); ++i){
+        for (omp::idx_t i = 0; i < vec_.size(); ++i){
             if ((i & mask) != val)
                 vec_[i] = 0.;
             else
diff --git a/third_party/cppsim/src/_cppsim.cpp b/third_party/cppsim/src/_cppsim.cpp
index 90119e47..8f17e0c0 100644
--- a/third_party/cppsim/src/_cppsim.cpp
+++ b/third_party/cppsim/src/_cppsim.cpp
@@ -19,9 +19,7 @@
 #include <complex>
 #include <iostream>
 
-#if defined(_OPENMP)
-#  include <omp.h>
-#endif
+#include "cppsim_omp.hpp"
 #include "simulator.hpp"
 
 namespace py = pybind11;

From ec2df8abcd24b169601bd737407410b077e9790b Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 16:03:20 +0200
Subject: [PATCH 62/82] Fix missed #include <filesystem> for compatibility with
 older compilers

---
 third_party/cppsim/src/tempfile.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/src/tempfile.cpp b/third_party/cppsim/src/tempfile.cpp
index 6d34166e..cc8b119a 100644
--- a/third_party/cppsim/src/tempfile.cpp
+++ b/third_party/cppsim/src/tempfile.cpp
@@ -1,12 +1,19 @@
 #include "tempfile.h"
 
 #include <errno.h>
+#if __has_include(<version>)
+#  include <version>
+#endif
+#if __has_include(<filesystem>) && __cpp_lib_filesystem >= 201703
 #include <filesystem>
+namespace fs = std::filesystem;
+#else
+#include <boost/filesystem.hpp>
+namespace fs = boost::filesystem;
+#endif
 #include <vector>
 #include <unistd.h>
 
-namespace fs = std::filesystem;
-
 const std::string& TempFile::string(std::error_code& ec_) const
 {
 	ec_ = ec;

From 067195585d80fc0b22b829ca575c86974e711f27 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 16:25:37 +0200
Subject: [PATCH 63/82] Use Python.Module instead of Python.Embed

---
 third_party/cppsim/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 32d0ceb1..d819922d 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -104,7 +104,7 @@ endif()
 
 # ------------------------------------------------------------------------------
 
-find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Development.Embed)
+find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Development.Module)
 
 # ------------------------------------------------------------------------------
 
@@ -152,7 +152,7 @@ res_embed(
   PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py"
   KEYWORD)
 
-target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Python OpenMP::OpenMP_CXX
+target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Module OpenMP::OpenMP_CXX
                                        ${CMAKE_DL_LIBS} ${filesystem_LIBS})
 
 pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")

From 166873601b3abcf670945f05cb93e95ffe38aa39 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 25 Oct 2022 22:36:02 +0200
Subject: [PATCH 64/82] Fix compilation issues on various compilers

---
 third_party/cppsim/include/tempfile.h | 18 +++++++++++++++++-
 third_party/cppsim/src/compiler.cpp   | 10 ----------
 third_party/cppsim/src/tempfile.cpp   | 21 ++++++---------------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/third_party/cppsim/include/tempfile.h b/third_party/cppsim/include/tempfile.h
index 14e4f7d7..2763d23e 100644
--- a/third_party/cppsim/include/tempfile.h
+++ b/third_party/cppsim/include/tempfile.h
@@ -2,11 +2,27 @@
 #define TEMP_FILE_H
 
 #include <string>
+
+#if __has_include(<version>)
+#  include <version>
+#endif
+
+#if __has_include(<filesystem>) && __cpp_lib_filesystem >= 201703
+#include <filesystem>
 #include <system_error>
+namespace ec_ns = std;
+namespace fs = std::filesystem;
+#else
+#include <boost/system/error_code.hpp>
+#include <boost/filesystem.hpp>
+namespace ec_ns = boost::system;
+namespace fs = boost::filesystem;
+#endif
+
 
 class TempFile
 {
-	std::error_code ec;
+        ec_ns::error_code ec;
 
 	std::string filename;
 
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index 663b91bc..46b4c3bd 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -4,16 +4,6 @@
 
 #include <cstdlib>
 #include <dlfcn.h>
-#if __has_include(<version>)
-#  include <version>
-#endif
-#if __has_include(<filesystem>) && __cpp_lib_filesystem >= 201703
-#include <filesystem>
-namespace fs = std::filesystem;
-#else
-#include <boost/filesystem.hpp>
-namespace fs = boost::filesystem;
-#endif
 #include <fstream>
 #include <iostream>
 #include <map>
diff --git a/third_party/cppsim/src/tempfile.cpp b/third_party/cppsim/src/tempfile.cpp
index cc8b119a..1a9f3199 100644
--- a/third_party/cppsim/src/tempfile.cpp
+++ b/third_party/cppsim/src/tempfile.cpp
@@ -1,18 +1,9 @@
 #include "tempfile.h"
 
-#include <errno.h>
-#if __has_include(<version>)
-#  include <version>
-#endif
-#if __has_include(<filesystem>) && __cpp_lib_filesystem >= 201703
-#include <filesystem>
-namespace fs = std::filesystem;
-#else
-#include <boost/filesystem.hpp>
-namespace fs = boost::filesystem;
-#endif
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
 #include <vector>
-#include <unistd.h>
 
 const std::string& TempFile::string(std::error_code& ec_) const
 {
@@ -31,7 +22,7 @@ TempFile::TempFile(const std::string& mask_)
 	int fd = mkstemp(&vfilename[0]);
 	if (fd == -1)
 	{
-		ec = std::error_code(errno, std::generic_category());
+		ec = ec_ns::error_code(errno, std::generic_category());
 		return;
 	}
 
@@ -42,10 +33,10 @@ TempFile::TempFile(const std::string& mask_)
 TempFile::~TempFile()
 {
 	bool keepCache = false;
-	const char* keepCacheValue = getenv("KEEP_CACHE");
+	const char* keepCacheValue = std::getenv("KEEP_CACHE");
 	if (keepCacheValue)
 		keepCache = atoi(keepCacheValue);
 	if (!keepCache)
-		unlink(filename.c_str());
+                std::remove(filename.c_str());
 }
 

From c6b871fffa164b7689a1b7ff4f1ecb2d685a0d34 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Wed, 26 Oct 2022 14:45:49 +0200
Subject: [PATCH 65/82] Fix compilation issues linked to <unistd.h>

---
 third_party/cppsim/include/tempfile.h |  2 +-
 third_party/cppsim/src/tempfile.cpp   | 78 +++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/third_party/cppsim/include/tempfile.h b/third_party/cppsim/include/tempfile.h
index 2763d23e..b6422463 100644
--- a/third_party/cppsim/include/tempfile.h
+++ b/third_party/cppsim/include/tempfile.h
@@ -13,8 +13,8 @@
 namespace ec_ns = std;
 namespace fs = std::filesystem;
 #else
-#include <boost/system/error_code.hpp>
 #include <boost/filesystem.hpp>
+#include <boost/system/error_code.hpp>
 namespace ec_ns = boost::system;
 namespace fs = boost::filesystem;
 #endif
diff --git a/third_party/cppsim/src/tempfile.cpp b/third_party/cppsim/src/tempfile.cpp
index 1a9f3199..dc6729af 100644
--- a/third_party/cppsim/src/tempfile.cpp
+++ b/third_party/cppsim/src/tempfile.cpp
@@ -1,9 +1,62 @@
 #include "tempfile.h"
 
-#include <cstdio>
-#include <cstdlib>
-#include <cerrno>
-#include <vector>
+#if __has_include(<unistd.h>)
+#define HAS_UNISTD_H 1
+#else
+#define HAS_UNISTD_H 0
+#endif
+
+#include <cerrno>       // for errno
+#include <cstdio>       // for std::remove
+#include <cstdlib>      // for getenv, atoi
+#include <iterator>     // for begin, end
+
+#if HAS_UNISTD_H
+#include <unistd.h>     // for close
+#include <vector>       // for std::vector
+#else
+#include <algorithm>    // for std::generate_n
+#include <cstdint>      // for uint32_t
+#include <fstream>      // for ofstream
+#include <string_view>  // for std::string_view
+#include <random>       // for std::default_random_engine, etc.
+#include <utility>      // for std::move
+#endif // HAS_UNISTD_H
+
+#if !HAS_UNISTD_H
+namespace  {
+     // Fallback used when mkstemp() is unavailable: overwrites the last six characters of
+     // `template_str` with random alphanumerics until a file of that name can be opened.
+     // Returns the file name, or an empty string with `ec` set on failure.
+     std::string create_temp_file(std::string template_str, ec_ns::error_code& ec) {
+          static constexpr std::string_view chars =
+               "0123456789"
+               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz";
+          thread_local std::random_device rd;
+          thread_local auto rng = std::default_random_engine(rd());
+          // uniform_int_distribution bounds are inclusive, so the last valid index is size() - 1.
+          thread_local auto dist = std::uniform_int_distribution<uint32_t>{0U,
+               static_cast<uint32_t>(chars.size() - 1)};
+
+          // Fails early if no temporary directory can be determined; `dir` itself is not used below.
+          auto dir = fs::temp_directory_path(ec);
+          if (ec) return {};
+
+          for(unsigned int i(0); i < (62 * 62 * 62) /* same as mkstemp */; ++i) {
+               std::generate_n(std::end(template_str)-6, 6, []() { return chars[dist(rng)]; });
+               // NOTE(review): ofstream does not create exclusively; an existing file would be truncated.
+               std::ofstream fout(template_str);
+               if(fout) return template_str;
+          }
+
+          ec = ec_ns::error_code(errno, ec_ns::generic_category());
+          return {};
+     }
+}
+#endif // !HAS_UNISTD_H
+
+// =============================================================================
 
 const std::string& TempFile::string(std::error_code& ec_) const
 {
@@ -14,20 +67,27 @@ const std::string& TempFile::string(std::error_code& ec_) const
 TempFile::TempFile(const std::string& mask_)
 {
 	auto dir = fs::temp_directory_path(ec);
-	if (ec) return; 
+	if (ec) return;
 
+#if HAS_UNISTD_H
 	std::string mask = (dir / mask_).string();
 
 	std::vector<char> vfilename(mask.c_str(), mask.c_str() + mask.size() + 1);
-	int fd = mkstemp(&vfilename[0]);
+	auto fd = mkstemp(&vfilename[0]);
 	if (fd == -1)
 	{
-		ec = ec_ns::error_code(errno, std::generic_category());
+		ec = ec_ns::error_code(errno, ec_ns::generic_category());
 		return;
 	}
 
 	close(fd);
-	filename = (char*)&vfilename[0];
+	filename = std::string(vfilename.data());  // C-string ctor stops at the NUL terminator stored in vfilename
+#else
+        auto fname = create_temp_file((dir / mask_).string(), ec);
+	if (ec) return;
+
+        filename = std::move(fname);
+#endif // HAS_UNISTD_H
 }
 
 TempFile::~TempFile()
@@ -35,7 +95,7 @@ TempFile::~TempFile()
 	bool keepCache = false;
 	const char* keepCacheValue = std::getenv("KEEP_CACHE");
 	if (keepCacheValue)
-		keepCache = atoi(keepCacheValue);
+                keepCache = std::atoi(keepCacheValue);
 	if (!keepCache)
                 std::remove(filename.c_str());
 }

From f8e438d70f935a03c7277071504f5c05228dfcbb Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Wed, 26 Oct 2022 14:52:14 +0200
Subject: [PATCH 66/82] Locate pybind11 after looking for Python

---
 third_party/cppsim/CMakeLists.txt              | 8 ++++----
 third_party/cppsim/cmake/cppsimConfig.cmake.in | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index d819922d..3c2953e0 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -93,6 +93,10 @@ include(ResEmbed)
 
 # ------------------------------------------------------------------------------
 
+find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Development.Module)
+
+# ------------------------------------------------------------------------------
+
 find_package(pybind11 CONFIG QUIET)
 if(NOT pybind11_FOUND)
   FetchContent_Declare(
@@ -104,10 +108,6 @@ endif()
 
 # ------------------------------------------------------------------------------
 
-find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Development.Module)
-
-# ------------------------------------------------------------------------------
-
 find_package(OpenMP REQUIRED)
 
 set(filesystem_LIBS)
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/third_party/cppsim/cmake/cppsimConfig.cmake.in
index 7170cca3..d108f0b1 100644
--- a/third_party/cppsim/cmake/cppsimConfig.cmake.in
+++ b/third_party/cppsim/cmake/cppsimConfig.cmake.in
@@ -14,13 +14,13 @@ endif()
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/commands)
 
-find_package(pybind11 CONFIG REQUIRED)
 find_package(res_embed CONFIG REQUIRED)
 find_package(OpenMP REQUIRED)
 find_package(
   Python @CPPSIM_PYTHON_VERSION_MIN@
   COMPONENTS Interpreter Development.Embed
   REQUIRED)
+find_package(pybind11 CONFIG REQUIRED)
 
 include(${CMAKE_CURRENT_LIST_DIR}/compiler_has_std_filesystem.cmake)
 

From b952ff7af3a5a4058a8b2b12ad18e07806c39fc6 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Wed, 26 Oct 2022 15:25:08 +0200
Subject: [PATCH 67/82] Use Python module instead of embed

---
 third_party/cppsim/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 3c2953e0..caf7016e 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -152,10 +152,10 @@ res_embed(
   PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py"
   KEYWORD)
 
-target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::pybind11 Python::Module OpenMP::OpenMP_CXX
-                                       ${CMAKE_DL_LIBS} ${filesystem_LIBS})
+target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::module OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS}
+                                       ${filesystem_LIBS})
 
-pybind11_add_module(${PROJECT_NAME} SHARED "src/_${PROJECT_NAME}.cpp")
+pybind11_add_module(${PROJECT_NAME} MODULE "src/_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
 
 # ==============================================================================

From 7834aa2ea4b1fa565691b3f0ec392716d7a6a086 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Wed, 26 Oct 2022 20:31:42 +0200
Subject: [PATCH 68/82] Support dlopen on Windows

---
 third_party/cppsim/CMakeLists.txt   | 24 ++++++++++++++++++++++++
 third_party/cppsim/src/compiler.cpp | 17 ++++++++++-------
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index caf7016e..a02d6574 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -110,6 +110,30 @@ endif()
 
 find_package(OpenMP REQUIRED)
 
+# ------------------------------------------------------------------------------
+
+if(MSVC
+   OR MINGW
+   AND NOT MSYS)
+  find_package(dlfcn-win32 CONFIG QUIET)
+  if(NOT dlfcn-win32_FOUND)
+    set(_build_shared_libs ${BUILD_SHARED_LIBS}) # saved so it can be restored after the fetch
+    set(BUILD_SHARED_LIBS OFF) # switched off only while dlfcn-win32 is being configured
+    FetchContent_Declare(
+      dlfcn-win32
+      GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
+      GIT_TAG 9d0ef119d9fcb9139f831adc224857b791c81140)
+    FetchContent_MakeAvailable(dlfcn-win32)
+    set(BUILD_SHARED_LIBS ${_build_shared_libs})
+    target_include_directories(dl PUBLIC ${dlfcn-win32_SOURCE_DIR}/src)
+    list(APPEND CMAKE_DL_LIBS dl)
+  else()
+    list(APPEND CMAKE_DL_LIBS dlfcn-win32::dl)
+  endif()
+endif()
+
+# ------------------------------------------------------------------------------
+
 set(filesystem_LIBS)
 if(NOT CPPSIM_HAS_STD_FILESYSTEM)
   find_package(Boost REQUIRED COMPONENTS filesystem)
diff --git a/third_party/cppsim/src/compiler.cpp b/third_party/cppsim/src/compiler.cpp
index 46b4c3bd..c4e85257 100644
--- a/third_party/cppsim/src/compiler.cpp
+++ b/third_party/cppsim/src/compiler.cpp
@@ -2,13 +2,16 @@
 #include "tempfile.h"
 #include "digestpp.hpp"
 
-#include <cstdlib>
-#include <dlfcn.h>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <sstream>
-#include <streambuf>
+#include <array>               // for array
+#include <cstdlib>             // for system
+#include <fstream>             // for ofstream
+#include <iterator>            // for istreambuf_iterator, operator==
+#include <map>                 // for map
+#include <sstream>             // for stringstream
+#include <system_error>        // for error_code
+#include <utility>             // for make_pair, pair
+
+#include <dlfcn.h>             // for dlerror, dlopen, dlsym
 
 Compiler::Compiler() { }
 

From e08caf748a1cb9586ac2695ecc47225f24e34fe8 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 10:29:45 +0200
Subject: [PATCH 69/82] Help CMake on macOS to locate OpenMP when using
 HomeBrew and MacPorts

---
 third_party/cppsim/CMakeLists.txt | 56 +++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index a02d6574..3ac10a73 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -108,6 +108,62 @@ endif()
 
 # ------------------------------------------------------------------------------
 
+if(APPLE)
+  find_program(BREW_CMD brew PATHS /usr/local/bin /opt/homebrew/bin)
+  if(BREW_CMD)
+    # Homebrew installs libomp in ${LIBOMP_PREFIX}/lib and the headers in ${LIBOMP_PREFIX}/include
+    execute_process(COMMAND ${BREW_CMD} --prefix libomp OUTPUT_VARIABLE LIBOMP_PREFIX)
+    string(STRIP "${LIBOMP_PREFIX}" LIBOMP_PREFIX) # quoted: an empty brew output must not drop the argument
+
+    find_library(
+      LIBOMP_LIB omp gomp libomp
+      HINTS ${LIBOMP_PREFIX}
+      PATH_SUFFIXES lib
+      NO_DEFAULT_PATH)
+    if(LIBOMP_LIB)
+      get_filename_component(LIBOMP_DIR ${LIBOMP_LIB} DIRECTORY)
+      list(APPEND CMAKE_LIBRARY_PATH ${LIBOMP_DIR})
+    endif()
+
+    find_path(
+      LIBOMP_INC omp.h
+      HINTS ${LIBOMP_PREFIX}
+      PATH_SUFFIXES include
+      NO_DEFAULT_PATH)
+    if(LIBOMP_INC)
+      list(APPEND CMAKE_INCLUDE_PATH ${LIBOMP_INC})
+    else()
+      message(WARNING "Unable to locate omp.h, the code might not compile properly.\n"
+                      "You might want to try installing the `libomp` Homebrew formula: brew install libomp")
+    endif()
+  else()
+    set(_macports_install_prefix "/opt/local")
+    # MacPorts installs libomp in ${_macports_install_prefix}/lib/libomp and the headers in
+    # ${_macports_install_prefix}/include/libomp
+    find_library(
+      LIBOMP_LIB omp gomp libomp
+      PATHS "${_macports_install_prefix}/lib"
+      PATH_SUFFIXES libomp
+      NO_DEFAULT_PATH)
+    if(LIBOMP_LIB)
+      get_filename_component(LIBOMP_DIR ${LIBOMP_LIB} DIRECTORY)
+      list(APPEND CMAKE_LIBRARY_PATH ${LIBOMP_DIR})
+    endif()
+
+    find_path(
+      LIBOMP_INC omp.h
+      PATHS "${_macports_install_prefix}/include"
+      PATH_SUFFIXES libomp
+      NO_DEFAULT_PATH)
+    if(LIBOMP_INC)
+      list(APPEND CMAKE_INCLUDE_PATH ${LIBOMP_INC})
+    else()
+      message(WARNING "Unable to locate omp.h, the code might not compile properly.\n"
+                      "You might want to try installing the `libomp` MacPorts port: sudo port install libomp")
+    endif()
+  endif()
+endif()
+
 find_package(OpenMP REQUIRED)
 
 # ------------------------------------------------------------------------------

From 6dfbb9f582e77c343d7e71a40751d81aa93f4345 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 10:30:20 +0200
Subject: [PATCH 70/82] Add GitHub workflows

---
 third_party/cppsim/.github/workflows/ci.yml | 479 ++++++++++++++++++++
 1 file changed, 479 insertions(+)
 create mode 100644 third_party/cppsim/.github/workflows/ci.yml

diff --git a/third_party/cppsim/.github/workflows/ci.yml b/third_party/cppsim/.github/workflows/ci.yml
new file mode 100644
index 00000000..b152b4ec
--- /dev/null
+++ b/third_party/cppsim/.github/workflows/ci.yml
@@ -0,0 +1,479 @@
+---
+
+name: CI
+
+on:
+  workflow_dispatch:
+  push:
+    branches-ignore:
+      - 'test'
+
+jobs:
+  standard:
+    strategy:
+      fail-fast: true
+      matrix:
+        runs-on: [ubuntu-latest]
+        python:
+          - 3.7
+          - 3.8
+          - 3.9
+          - '3.10'
+    name: "Python ${{ matrix.python }} • ${{ matrix.runs-on }} • x64"
+    runs-on: ${{ matrix.runs-on }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Setup Python ${{ matrix.python }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Prepare env
+        run: |
+          python3 -m pip install -U wheel build cmake pybind11
+          sudo apt-get update && sudo apt-get install -y libboost-filesystem-dev --no-install-recommends
+
+      - name: Install res_embed
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install"
+          cmake --build res_embed/build --target install -j2
+
+      - name: Configure
+        run: cmake -S. -Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)"
+
+      - name: Build & install
+        run: cmake --build build --target all -j2 -v
+
+
+  # ============================================================================
+
+  macos:
+    runs-on: macos-11
+    name: "MacOS 11 • ${{ matrix.xcode }} • x64"
+    env:
+      DEVELOPER_DIR: "/Applications/${{ matrix.xcode }}.app/Contents/Developer"
+      CC: /usr/bin/clang
+      CXX: /usr/bin/clang++
+    strategy:
+      fail-fast: false
+      matrix:
+        xcode:
+          - "Xcode_11.7"    # Not available on macos-12
+          - "Xcode_12.4"    # Not available on macos-12
+          - "Xcode_12.5.1"  # Not available on macos-12
+          # - "Xcode_13.0"    # Not available on macos-12
+          - "Xcode_13.1"
+          - "Xcode_13.2.1"
+          # - "Xcode_13.3.1"  # macos-12 only
+          # - "Xcode_13.4.1"  # macos-12 only
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'  # quoted so YAML does not parse the version as a float
+
+      - name: Install NASM
+        uses: ilammy/setup-nasm@v1
+
+      - name: Prepare env
+        run: |
+          python3 -m pip install -U wheel build cmake pybind11
+          brew install libomp boost
+
+      - name: Install res_embed
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install"
+          cmake --build res_embed/build --target install -j2
+
+      - name: Configure
+        run: cmake -S. -Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)"
+
+      - name: Build & install
+        run: cmake --build build --target all -j2
+
+  # ============================================================================
+
+  gcc:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        gcc:
+          - 7  # C++17 earliest version
+          - 8
+          - 9
+          - 10
+          - 11
+          - 12
+    name: "GCC ${{ matrix.gcc }} • x64"
+    container: "gcc:${{ matrix.gcc }}"
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Prepare env
+        run: >
+          apt-get update && apt-get install -y python3-dev python3-pip python3-setuptools python3-wheel python3-venv
+          libboost-filesystem-dev --no-install-recommends
+
+      - name: Prepare env
+        run: |
+          python3 -m pip install -U wheel build
+          python3 -m pip install -U pybind11
+          python3 -m pip install cmake --only-binary :all:
+
+      - name: Install res_embed
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install"
+          cmake --build res_embed/build --target install -j2
+
+      - name: Configure
+        run: cmake -S. -Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR=$(python3 -m pybind11 --cmakedir)
+
+      - name: Build & install
+        run: cmake --build build --target all -j2
+
+  # ============================================================================
+
+  clang:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        clang:
+          - 7
+          - 8
+          - 9
+          - 10   # first version for full C++17 support (with patches)
+          - 11
+          - 12
+          - 13
+          - 14
+    env:
+      CC: clang
+      CXX: clang++
+
+    name: "Clang ${{ matrix.clang }} • x64"
+    container: "silkeh/clang:${{ matrix.clang }}"
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Prepare env
+        run: >
+          apt-get update && apt-get install -y python3-dev python3-pip python3-setuptools python3-wheel python3-venv
+          libboost-filesystem-dev --no-install-recommends
+
+      - name: Prepare env
+        run: python3 -m pip install -U wheel build cmake pybind11
+
+      - name: Install res_embed
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install"
+          cmake --build res_embed/build --target install -j2
+
+      - name: Configure
+        run: cmake -S. -Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR=$(python3 -m pybind11 --cmakedir)
+
+      - name: Build & install
+        run: cmake --build build --target all -j2
+
+  # ============================================================================
+
+  msvc:
+    runs-on: windows-latest
+    name: "MSVC • x64"
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Enable Developer Command Prompt
+        uses: ilammy/msvc-dev-cmd@v1.12.0
+
+      - name: Install NASM
+        uses: ilammy/setup-nasm@v1
+
+      - name: Prepare env
+        run: python3 -m pip install -U wheel build cmake pybind11
+
+      - name: Install res_embed
+        env:
+          CC: cl
+          CXX: cl
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" -DMSVC_ITERATOR_DEBUG=ON
+          cmake --build res_embed/build --target install -j2 --config Release
+
+      - name: Configure
+        env:
+          CC: cl
+          CXX: cl
+        run: >
+          cmake -S. -Bbuild -DCMAKE_PREFIX_PATH="${{ github.workspace }}/install"
+          -DMSVC_ITERATOR_DEBUG=ON
+          -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)"
+
+      - name: Build & install
+        run: cmake --build build --config Release -j2
+
+  # ============================================================================
+
+  mingw64:
+    runs-on: windows-2022
+    strategy:
+      fail-fast: false
+    name: "MINGW64 • x64"
+    env:
+      BOOST_VERSION: 1.78.0
+      BOOST_PATH: ${{github.workspace}}/boost/boost
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Install NASM
+        uses: ilammy/setup-nasm@v1
+
+      - name: Set up MinGW64
+        uses: egor-tensin/setup-mingw@v2
+        id: mingw64-setup
+        with:
+          platform: x64
+
+      - name: Download and install Boost
+        uses: MarkusJx/install-boost@v2.4.0
+        # No cache step (id: cache-boost) exists in this job, so Boost is installed on every run.
+        id: install-boost
+        with:
+          boost_version: ${{ env.BOOST_VERSION }}
+          platform_version: 2022
+          toolset: mingw
+
+      - name: Prepare env
+        run: python3 -m pip install -U wheel build cmake pybind11
+
+      - name: Install res_embed
+        env:
+          CC: ${{ steps.mingw64-setup.outputs.gcc }}
+          CXX: ${{ steps.mingw64-setup.outputs.gxx }}
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" -DMSVC_ITERATOR_DEBUG=ON -G "MinGW Makefiles"
+          cmake --build res_embed/build --target install -j2
+
+      - name: Configure
+        env:
+          BOOST_ROOT: ${{ env.BOOST_PATH }}
+          CC: ${{ steps.mingw64-setup.outputs.gcc }}
+          CXX: ${{ steps.mingw64-setup.outputs.gxx }}
+        run: >
+          cmake -S. -Bbuild -DCMAKE_PREFIX_PATH="${{ github.workspace }}/install"
+          -DMSVC_ITERATOR_DEBUG=ON
+          -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)"
+          -G "MinGW Makefiles"
+
+      - name: Build & install
+        run: cmake --build build --target all -j2
+
+  # ============================================================================
+
+  msys2:
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - msystem: MINGW64
+            installdeps: >-
+              git
+              patch
+              make
+              mingw-w64-x86_64-toolchain
+              mingw-w64-x86_64-cmake
+              mingw-w64-x86_64-boost
+              mingw-w64-x86_64-dlfcn
+              mingw-w64-x86_64-nasm
+              mingw-w64-x86_64-python
+              mingw-w64-x86_64-python-pip
+            cmake_generator: -G "MSYS Makefiles"
+            CC: gcc
+            CXX: g++
+
+          - msystem: CLANG64
+            installdeps: >-
+              git
+              patch
+              make
+              mingw-w64-clang-x86_64-toolchain
+              mingw-w64-clang-x86_64-libssp
+              mingw-w64-clang-x86_64-cmake
+              mingw-w64-clang-x86_64-boost
+              mingw-w64-clang-x86_64-dlfcn
+              mingw-w64-clang-x86_64-nasm
+              mingw-w64-clang-x86_64-python
+              mingw-w64-clang-x86_64-python-pip
+            cmake_generator: -G "MSYS Makefiles"
+            CC: clang
+            CXX: clang++
+
+          - msystem: UCRT64
+            installdeps: >-
+              git
+              patch
+              make
+              mingw-w64-ucrt-x86_64-toolchain
+              mingw-w64-ucrt-x86_64-cmake
+              mingw-w64-ucrt-x86_64-boost
+              mingw-w64-ucrt-x86_64-dlfcn
+              mingw-w64-ucrt-x86_64-nasm
+              mingw-w64-ucrt-x86_64-python
+              mingw-w64-ucrt-x86_64-python-pip
+            cmake_generator: -G "MSYS Makefiles"
+            CC: gcc
+            CXX: g++
+
+    name: "MSYS2 ${{ matrix.msystem }} • x64"
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Setup MSYS
+        uses: msys2/setup-msys2@v2
+        with:
+          install: ${{ matrix.installdeps }}
+          msystem: ${{ matrix.msystem }}
+          path-type: strict
+          update: false
+
+      - name: Prepare env
+        shell: msys2 {0}
+        env:
+          CC: ${{ matrix.CC }}
+          CXX: ${{ matrix.CXX }}
+        run: python3 -m pip install -U wheel build pybind11
+
+      - name: Install res_embed
+        shell: msys2 {0}
+        env:
+          CC: ${{ matrix.CC }}
+          CXX: ${{ matrix.CXX }}
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" \
+          ${{ matrix.cmake_generator }}
+          cmake --build res_embed/build --target install -j2 -v
+
+      - name: Configure
+        shell: msys2 {0}
+        env:
+          CC: ${{ matrix.CC }}
+          CXX: ${{ matrix.CXX }}
+        run: >
+          cmake -S. -Bbuild -DCMAKE_PREFIX_PATH="${{ github.workspace }}/install"
+          -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)"
+          ${{ matrix.cmake_generator }}
+
+      - name: Build & install
+        run: msys2 -c 'cmake --build build --target all -j2 -v'
+
+  # ============================================================================
+
+  cygwin:
+    runs-on: windows-latest
+    name: "Cygwin • x64"
+    env:
+      CYGWIN_NOWINPATH: 1  # only have cygwin's executables on PATH
+      CHERE_INVOKING: 1  # prevent profile script to change directory
+      CCACHE_VERSION: 4.6.1
+    steps:
+      - run: git config --global core.autocrlf input
+
+      - uses: actions/checkout@v3
+
+      - uses: actions/checkout@v3
+        with:
+          repository: Takishima/res_embed
+          ref: master
+          path: res_embed
+
+      - name: Setup Cygwin
+        uses: cygwin/cygwin-install-action@v2
+        with:
+          packages: >-
+            cygwin cygwin-devel
+            autoconf automake coreutils m4 make cmake patch git
+            gawk sed libtool gettext wget curl grep
+            gzip bzip2 tar xz nasm
+            binutils gcc-core gcc-g++ libboost-devel
+            python3 python3-devel python3-pip python3-virtualenv
+
+      - name: Prepare env
+        env:
+          PATH: C:\cygwin\bin
+        shell: bash --login -eo pipefail -o igncr {0}
+        run: |
+          python3 -m pip install -U wheel build
+          python3 -m pip install -U pybind11
+
+      - name: Install res_embed
+        env:
+          PATH: C:\cygwin\bin
+        shell: bash --login -eo pipefail -o igncr {0}
+        run: |
+          cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="$PWD/install"
+          cmake --build res_embed/build --target install -j2 -v
+
+      - name: Configure
+        env:
+          PATH: C:\cygwin\bin
+        shell: bash --login -eo pipefail -o igncr {0}
+        run: >
+          cmake -S. -Bbuild -DCMAKE_PREFIX_PATH="$PWD/install"
+          -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)"
+
+      - name: Build & install
+        env:
+          PATH: C:\cygwin\bin
+        shell: bash --login -eo pipefail -o igncr {0}
+        run: cmake --build build --target all -j2 -v
+
+      - name: Restore PATH for git
+        run: Add-Content -Path $env:GITHUB_PATH -Value "C:\Program Files\Git\bin"

From 68811322318cbf8ff42a4e6c43f567b6d85d51d4 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 11:05:32 +0200
Subject: [PATCH 71/82] More verbose output

---
 third_party/cppsim/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 3ac10a73..63ddf1c6 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -88,6 +88,8 @@ if(NOT res_embed_FOUND)
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
     GIT_TAG b803e4df20b09bdd65477a9662530a6feeb228e6)
   FetchContent_MakeAvailable(res_embed)
+else()
+  message(STATUS "Found res_embed at ${res_embed_DIR}")
 endif()
 include(ResEmbed)
 
@@ -104,6 +106,8 @@ if(NOT pybind11_FOUND)
     GIT_REPOSITORY https://github.com/pybind/pybind11.git
     GIT_TAG 68e6fdaa90fc93979e6d5d1e9f788f464593e8f2)
   FetchContent_MakeAvailable(pybind11)
+else()
+  message(STATUS "Found pybind11 at ${pybind11_DIR}")
 endif()
 
 # ------------------------------------------------------------------------------
@@ -184,6 +188,7 @@ if(MSCV
     target_include_directories(dl PUBLIC ${dlfcn-win32_SOURCE_DIR}/src)
     list(APPEND CMAKE_DL_LIBS dl)
   else()
+    message(STATUS "Found dlfcn-win32 at ${dlfcn-win32_DIR}")
     list(APPEND CMAKE_DL_LIBS dlfcn-win32::dl)
   endif()
 endif()

From 5779e9cc61418a883de8466b227e2aadbbc26892 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 11:08:53 +0200
Subject: [PATCH 72/82] Fix typo

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 63ddf1c6..cd2410f6 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -172,7 +172,7 @@ find_package(OpenMP REQUIRED)
 
 # ------------------------------------------------------------------------------
 
-if(MSCV
+if(MSVC
    OR MINGW
    AND NOT MSYS)
   find_package(dlfcn-win32 CONFIG QUIET)

From 89b7dd5eca22bada45eb1bd3d1753eb242190ac9 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 11:15:27 +0200
Subject: [PATCH 73/82] Fix CMake configuration error due to dlfcn-win32

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index cd2410f6..7fecc3cd 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -185,7 +185,7 @@ if(MSVC
       GIT_TAG 9d0ef119d9fcb9139f831adc224857b791c81140)
     FetchContent_MakeAvailable(dlfcn-win32)
     set(BUILD_SHARED_LIBS ${_build_shared_libs})
-    target_include_directories(dl PUBLIC ${dlfcn-win32_SOURCE_DIR}/src)
+    target_include_directories(dl PUBLIC $<BUILD_INTERFACE:${dlfcn-win32_SOURCE_DIR}/src>)
     list(APPEND CMAKE_DL_LIBS dl)
   else()
     message(STATUS "Found dlfcn-win32 at ${dlfcn-win32_DIR}")

From 84f0ff470a5605137dab37289b0585d66279f92e Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 11:34:42 +0200
Subject: [PATCH 74/82] Allow customization of the _ITERATOR_DEBUG define when
 compiling with MSVC

---
 third_party/cppsim/CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 7fecc3cd..8741cac6 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -12,6 +12,11 @@ set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
 
 option(BUILD_TESTING "Build the test suite?" OFF)
 
+if(NOT DEFINED ITERATOR_DEBUG_VALUE)
+  set(ITERATOR_DEBUG_VALUE 0)
+endif()
+option(MSVC_ITERATOR_DEBUG "Define _ITERATOR_DEBUG_LEVEL (defaults to 0, can be set using ITERATOR_DEBUG_VALUE)" OFF)
+
 # ==============================================================================
 
 include(GNUInstallDirs)
@@ -231,6 +236,11 @@ target_compile_features(kernelgen PUBLIC cxx_std_17)
 set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(kernelgen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
                                             $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}>)
+
+if(MSVC AND MSVC_ITERATOR_DEBUG) # MSVC's STL reads _ITERATOR_DEBUG_LEVEL; PUBLIC so that consumers of kernelgen use the same value
+  target_compile_definitions(kernelgen PUBLIC _ITERATOR_DEBUG_LEVEL=${ITERATOR_DEBUG_VALUE})
+endif()
+
 res_embed(
   TARGET kernelgen
   NAME "nointrin"

From 2a2bd33fea8c30e9b4b1c19f6022978b4ad14eb1 Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Thu, 27 Oct 2022 12:00:26 +0200
Subject: [PATCH 75/82] Use dlfcn-win32 also on MSYS

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 8741cac6..49e497e9 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -179,7 +179,7 @@ find_package(OpenMP REQUIRED)
 
 if(MSVC
    OR MINGW
-   AND NOT MSYS)
+   OR MSYS)
   find_package(dlfcn-win32 CONFIG QUIET)
   if(NOT dlfcn-win32_FOUND)
     set(_build_shared_libs ${BUILD_SHARED_LIBS})

From a313ac92b9222c8fe76eaf1e10ed7b3a233b096d Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 1 Nov 2022 15:25:12 +0100
Subject: [PATCH 76/82] Update res_embed version

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 49e497e9..0f8f81aa 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -91,7 +91,7 @@ if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG b803e4df20b09bdd65477a9662530a6feeb228e6)
+    GIT_TAG bf1a143e12dda57a5cb45cb1dc4413b9d6846cb2)
   FetchContent_MakeAvailable(res_embed)
 else()
   message(STATUS "Found res_embed at ${res_embed_DIR}")

From c8713ec84edad6ef18c23ab7be21bd3df54784bf Mon Sep 17 00:00:00 2001
From: Damien Nguyen <ngn.damien@gmail.com>
Date: Tue, 1 Nov 2022 16:21:28 +0100
Subject: [PATCH 77/82] Use dmikushin/res_embed for GitHub workflows

---
 third_party/cppsim/.github/workflows/ci.yml | 64 +++++++++++++--------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/third_party/cppsim/.github/workflows/ci.yml b/third_party/cppsim/.github/workflows/ci.yml
index b152b4ec..afd41506 100644
--- a/third_party/cppsim/.github/workflows/ci.yml
+++ b/third_party/cppsim/.github/workflows/ci.yml
@@ -23,11 +23,13 @@ jobs:
     runs-on: ${{ matrix.runs-on }}
 
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -75,11 +77,13 @@ jobs:
           # - "Xcode_13.3.1"  # macos-12 only
           # - "Xcode_13.4.1"  # macos-12 only
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -124,11 +128,13 @@ jobs:
     name: "GCC ${{ matrix.gcc }} • x64"
     container: "gcc:${{ matrix.gcc }}"
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -177,11 +183,13 @@ jobs:
     name: "Clang ${{ matrix.clang }} • x64"
     container: "silkeh/clang:${{ matrix.clang }}"
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -210,11 +218,13 @@ jobs:
     runs-on: windows-latest
     name: "MSVC • x64"
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -258,11 +268,13 @@ jobs:
       BOOST_VERSION: 1.78.0
       BOOST_PATH: ${{github.workspace}}/boost/boost
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -368,11 +380,13 @@ jobs:
 
     name: "MSYS2 ${{ matrix.msystem }} • x64"
     steps:
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 
@@ -426,11 +440,13 @@ jobs:
     steps:
       - run: git config --global core.autocrlf input
 
-      - uses: actions/checkout@v3
+      - name: Checkout cppsim repository
+        uses: actions/checkout@v3
 
-      - uses: actions/checkout@v3
+      - name: Checkout res_embed repository
+        uses: actions/checkout@v3
         with:
-          repository: Takishima/res_embed
+          repository: dmikushin/res_embed
           ref: master
           path: res_embed
 

From c975c61f88d31a22a4014e22fb5dfbf908186ffa Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 14 Nov 2022 11:35:57 +0100
Subject: [PATCH 78/82] Bumping the res_embed GIT_TAG to a version, which
 (hopefully) fixes some Cygwin/MinGW issues

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index 0f8f81aa..d6da9abf 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -91,7 +91,7 @@ if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG bf1a143e12dda57a5cb45cb1dc4413b9d6846cb2)
+    GIT_TAG 415f23d253cdf2b17aa792600d2f9058779dc2f7)
   FetchContent_MakeAvailable(res_embed)
 else()
   message(STATUS "Found res_embed at ${res_embed_DIR}")

From b9bcccbd3c57073419796f202ec5d5d21578fe7d Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 14 Nov 2022 12:19:26 +0100
Subject: [PATCH 79/82] Do not use .att_mnemonic, which is not supported by
 Clang

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index d6da9abf..e6a3546f 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -91,7 +91,7 @@ if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG 415f23d253cdf2b17aa792600d2f9058779dc2f7)
+    GIT_TAG f4f6b4bf2af84f9e37eaa92387b8b72b15bff3af)
   FetchContent_MakeAvailable(res_embed)
 else()
   message(STATUS "Found res_embed at ${res_embed_DIR}")

From 7879110f798680aca58b501b5a41f5b0de9e03c4 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 14 Nov 2022 13:12:16 +0100
Subject: [PATCH 80/82] Ensuring NO_TYPE_FOR_PECOFF reaches MSYS2 MINGW64 and
 MSYS2 UCRT64 configs, which somehow fail to advertise themselves as MSYS or
 MINGW environments

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index e6a3546f..a56bdaef 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -91,7 +91,7 @@ if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG f4f6b4bf2af84f9e37eaa92387b8b72b15bff3af)
+    GIT_TAG a304a2e6686d1f8f0f998d43cce7a0fe176c50dd)
   FetchContent_MakeAvailable(res_embed)
 else()
   message(STATUS "Found res_embed at ${res_embed_DIR}")

From be87a2b8390a719536e9526f0edc5f5ac250762b Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 14 Nov 2022 17:18:22 +0100
Subject: [PATCH 81/82] Don't use .type directive unsupported on Windows at
 all, if we can live without it everywhere else

---
 third_party/cppsim/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/cppsim/CMakeLists.txt b/third_party/cppsim/CMakeLists.txt
index a56bdaef..a9702271 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/third_party/cppsim/CMakeLists.txt
@@ -91,7 +91,7 @@ if(NOT res_embed_FOUND)
   FetchContent_Declare(
     res_embed
     GIT_REPOSITORY https://github.com/dmikushin/res_embed.git
-    GIT_TAG a304a2e6686d1f8f0f998d43cce7a0fe176c50dd)
+    GIT_TAG 93b5711070086dea53c3b535018ff34e68479242)
   FetchContent_MakeAvailable(res_embed)
 else()
   message(STATUS "Found res_embed at ${res_embed_DIR}")

From 7c43563bc52ac4ef1a3c6fe6537c5882b135e793 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Tue, 20 Dec 2022 11:04:58 +0100
Subject: [PATCH 82/82] Getting cppsim to compile from sources alongside the
 MindQuantum sources

---
 build.bat                                     |  2 -
 build.ps1                                     |  4 --
 build_locally.bat                             |  7 --
 build_locally.ps1                             |  5 --
 build_locally.sh                              |  1 -
 .../include/cppsim}/combinations.h            |  0
 .../include/cppsim}/compiler.h                |  0
 .../include/cppsim}/cppsim_omp.hpp            |  0
 .../include/cppsim}/cpu/schedule.h            |  0
 .../include/cppsim}/fusion.hpp                |  0
 .../include/cppsim}/gpu/schedule.h            |  0
 .../include/cppsim}/gpu_support.h             |  0
 .../cppsim}/intrin/alignedallocator.hpp       |  6 +-
 .../include/cppsim}/intrin/cintrin.hpp        |  0
 .../include/cppsim}/intrin/kernel1.hpp        |  0
 .../include/cppsim}/intrin/kernel2.hpp        |  0
 .../include/cppsim}/intrin/kernel3.hpp        |  0
 .../include/cppsim}/intrin/kernel4.hpp        |  0
 .../include/cppsim}/intrin/kernel5.hpp        |  0
 .../include/cppsim}/intrin/kernels.hpp        |  0
 .../include/cppsim}/kernelgen.hpp             |  0
 .../include/cppsim}/nointrin/kernel1.hpp      |  0
 .../include/cppsim}/nointrin/kernel2.hpp      |  0
 .../include/cppsim}/nointrin/kernel3.hpp      |  0
 .../include/cppsim}/nointrin/kernel4.hpp      |  0
 .../include/cppsim}/nointrin/kernel5.hpp      |  0
 .../include/cppsim}/nointrin/kernelgen.py     |  0
 .../include/cppsim}/nointrin/kernels.hpp      |  0
 .../include/cppsim}/partitioner.h             |  0
 .../include/cppsim}/schedule.h                |  0
 .../include/cppsim}/simulator.hpp             |  0
 .../include/cppsim}/tempfile.h                |  0
 ccsrc/lib/CMakeLists.txt                      |  1 +
 .../lib}/cppsim/CMakeLists.txt                | 49 +++++--------
 {third_party => ccsrc/lib}/cppsim/README.md   |  4 --
 .../src => ccsrc/lib/cppsim}/_cppsim.cpp      |  0
 .../lib/cppsim}/benchmark/benchmark.cpp       |  0
 .../src => ccsrc/lib/cppsim}/compiler.cpp     |  0
 .../src => ccsrc/lib/cppsim}/kernelgen.cpp    |  0
 .../src => ccsrc/lib/cppsim}/tempfile.cpp     |  0
 .../lib/cppsim}/test/test_combinations.cpp    |  0
 .../lib/cppsim}/test/test_nointrin.cpp        |  0
 .../lib/cppsim}/test/test_popcount.cpp        |  0
 .../commands/check_code_compiles.cmake        |  0
 .../cmake => cmake}/commands/kernelgen.cmake  |  0
 .../compiler_has_std_filesystem.cmake         |  0
 .../cmake => cmake}/cppsimConfig.cmake.in     |  0
 cmake/options.cmake                           |  1 -
 docs/source/cmake_reference.rst               |  4 --
 scripts/build/default_values.bat              |  1 -
 scripts/build/default_values.conf             |  1 -
 scripts/build/parse_common_args.ps1           |  5 --
 scripts/build/parse_common_args.sh            |  4 --
 third_party/CMakeLists.txt                    |  5 +-
 third_party/cppsim/.cmake-format.yaml         | 44 ------------
 .../cppsim/{.gitmodules => cmake/commands/a}  |  0
 third_party/cppsim/cppsim.cmake               | 70 -------------------
 57 files changed, 23 insertions(+), 191 deletions(-)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/combinations.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/compiler.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/cppsim_omp.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/cpu/schedule.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/fusion.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/gpu/schedule.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/gpu_support.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/alignedallocator.hpp (95%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/cintrin.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/kernel1.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/kernel2.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/kernel3.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/kernel4.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/kernel5.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/intrin/kernels.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/kernelgen.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernel1.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernel2.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernel3.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernel4.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernel5.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernelgen.py (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/nointrin/kernels.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/partitioner.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/schedule.h (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/simulator.hpp (100%)
 rename {third_party/cppsim/include => ccsrc/include/cppsim}/tempfile.h (100%)
 rename {third_party => ccsrc/lib}/cppsim/CMakeLists.txt (84%)
 rename {third_party => ccsrc/lib}/cppsim/README.md (97%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/_cppsim.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/benchmark/benchmark.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/compiler.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/kernelgen.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/tempfile.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/test/test_combinations.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/test/test_nointrin.cpp (100%)
 rename {third_party/cppsim/src => ccsrc/lib/cppsim}/test/test_popcount.cpp (100%)
 rename {third_party/cppsim/cmake => cmake}/commands/check_code_compiles.cmake (100%)
 rename {third_party/cppsim/cmake => cmake}/commands/kernelgen.cmake (100%)
 rename {third_party/cppsim/cmake => cmake}/compiler_has_std_filesystem.cmake (100%)
 rename {third_party/cppsim/cmake => cmake}/cppsimConfig.cmake.in (100%)
 delete mode 100644 third_party/cppsim/.cmake-format.yaml
 rename third_party/cppsim/{.gitmodules => cmake/commands/a} (100%)
 delete mode 100644 third_party/cppsim/cppsim.cmake

diff --git a/build.bat b/build.bat
index 88ee055f..c3037e10 100644
--- a/build.bat
+++ b/build.bat
@@ -372,7 +372,6 @@ call %SCRIPTDIR%\dos\build_cmake_option.bat BUILD_TESTING !enable_tests!
 call %SCRIPTDIR%\dos\build_cmake_option.bat CLEAN_3RDPARTY_INSTALL_DIR !do_clean_3rdparty!
 call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_ANALYZER !enable_analyzer!
 call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CMAKE_DEBUG !cmake_debug_mode!
-call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CPPSIM !enable_cppsim!
 call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CUDA !enable_gpu!
 call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CXX_EXPERIMENTAL !enable_cxx!
 call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_GITEE !enable_gitee!
@@ -504,7 +503,6 @@ rem ============================================================================
   echo   /CleanBuildDir      Delete build directory before building
   echo   /CleanCache         Re-run CMake with a clean CMake cache
   echo   /CleanVenv          Delete Python virtualenv before building
-  echo   /CppSim             (experimental) Enable the use of cppsim to generate simulation kernels
   echo   /Cxx                (experimental) Enable MindQuantum C++ support
   echo   /Debug              Build in debug mode
   echo   /Delocate           Delocate the binary wheels after build is finished
diff --git a/build.ps1 b/build.ps1
index b8012b58..875fd1c4 100644
--- a/build.ps1
+++ b/build.ps1
@@ -24,7 +24,6 @@ Param(
     [switch]$CleanCache,
     [switch]$CleanVenv,
     [ValidateNotNullOrEmpty()][string]$Config,
-    [switch]$CppSim,
     [ValidateNotNullOrEmpty()][string]$CudaArch,
     [switch]$Cxx,
     [switch]$Debug,
@@ -488,9 +487,6 @@ Do not use the CMake registry to find packages
 .PARAMETER Config
 Path to INI configuration file with default values for the parameters
 
-.PARAMETER CppSim
-(experimental) Enable the use of cppsim to generate simulation kernels
-
 .PARAMETER Cxx
 (experimental) Enable MindQuantum C++ support
 
diff --git a/build_locally.bat b/build_locally.bat
index 81d54636..dc6da764 100644
--- a/build_locally.bat
+++ b/build_locally.bat
@@ -132,11 +132,6 @@ rem ============================================================================
     shift & shift & goto :initial
   )
 
-  if /I "%1" == "/CppSim" (
-    set enable_cppsim=1
-    shift & goto :initial
-  )
-
   if /I "%1" == "/Cxx" (
     set enable_cxx=1
     shift & goto :initial
@@ -348,7 +343,6 @@ call %SCRIPTDIR%\dos\build_locally_cmake_option.bat BUILD_TESTING !enable_tests!
 call %SCRIPTDIR%\dos\build_locally_cmake_option.bat CLEAN_3RDPARTY_INSTALL_DIR !do_clean_3rdparty!
 call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_ANALYZER !enable_analyzer!
 call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CMAKE_DEBUG !cmake_debug_mode!
-call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CPPSIM !enable_cppsim!
 call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CUDA !enable_gpu!
 call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CXX_EXPERIMENTAL !enable_cxx!
 call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_DOCUMENTATION !do_docs!
@@ -546,7 +540,6 @@ exit /B 0
   echo   /CleanCache         Re-run CMake with a clean CMake cache
   echo   /CleanVenv          Delete Python virtualenv before building
   echo   /ConfigureOnly      Stop after the CMake configure and generation steps (ie. before building MindQuantum)
-  echo   /CppSim             (experimental) Enable the use of cppsim to generate simulation kernels
   echo   /Cxx                (experimental) Enable MindQuantum C++ support
   echo   /Debug              Build in debug mode
   echo   /DebugCMake         Enable debugging mode for CMake configuration step
diff --git a/build_locally.ps1 b/build_locally.ps1
index 6b0c4f43..4018d497 100644
--- a/build_locally.ps1
+++ b/build_locally.ps1
@@ -27,7 +27,6 @@ Param(
     [ValidateNotNullOrEmpty()][string]$Config,
     [Alias("C")][switch]$Configure,
     [switch]$ConfigureOnly,
-    [switch]$CppSim,
     [ValidateNotNullOrEmpty()][string]$CudaArch,
     [switch]$Cxx,
     [switch]$Debug,
@@ -244,7 +243,6 @@ $cmake_args = @('-DIN_PLACE_BUILD:BOOL=ON'
                 "-DENABLE_ANALYZER:BOOL={0}" -f $CMAKE_BOOL[$enable_analyzer]
                 "-DENABLE_PROJECTQ:BOOL={0}" -f $CMAKE_BOOL[$enable_projectq]
                 "-DENABLE_CMAKE_DEBUG:BOOL={0}" -f $CMAKE_BOOL[$cmake_debug_mode]
-                "-DENABLE_CPPSIM:BOOL={0}" -f $CMAKE_BOOL[$enable_cppsim]
                 "-DENABLE_CUDA:BOOL={0}" -f $CMAKE_BOOL[$enable_gpu]
                 "-DENABLE_CXX_EXPERIMENTAL:BOOL={0}" -f $CMAKE_BOOL[$enable_cxx]
                 "-DENABLE_DOCUMENTATION:BOOL={0}" -f $CMAKE_BOOL[$do_docs]
@@ -439,9 +437,6 @@ Path to INI configuration file with default values for the parameters
 .PARAMETER ConfigureOnly
 Stop after the CMake configure and generation steps (ie. before building MindQuantum)
 
-.PARAMETER CppSim
-(experimental) Enable the use of cppsim to generate simulation kernels
-
 .PARAMETER Cxx
 (experimental) Enable MindQuantum C++ support
 
diff --git a/build_locally.sh b/build_locally.sh
index 69a83bb7..b5e27025 100755
--- a/build_locally.sh
+++ b/build_locally.sh
@@ -169,7 +169,6 @@ cmake_args=(-DIN_PLACE_BUILD:BOOL=ON
             -DENABLE_PROJECTQ:BOOL="${CMAKE_BOOL[$enable_projectq]}"
             -DENABLE_CMAKE_DEBUG:BOOL="${CMAKE_BOOL[$cmake_debug_mode]}"
             -DENABLE_CUDA:BOOL="${CMAKE_BOOL[$enable_gpu]}"
-            -DENABLE_CPPSIM:BOOL="${CMAKE_BOOL[$enable_cppsim]}"
             -DENABLE_CXX_EXPERIMENTAL:BOOL="${CMAKE_BOOL[$enable_cxx]}"
             -DENABLE_DOCUMENTATION:BOOL="${CMAKE_BOOL[$do_docs]}"
             -DENABLE_GITEE:BOOL="${CMAKE_BOOL[$enable_gitee]}"
diff --git a/third_party/cppsim/include/combinations.h b/ccsrc/include/cppsim/combinations.h
similarity index 100%
rename from third_party/cppsim/include/combinations.h
rename to ccsrc/include/cppsim/combinations.h
diff --git a/third_party/cppsim/include/compiler.h b/ccsrc/include/cppsim/compiler.h
similarity index 100%
rename from third_party/cppsim/include/compiler.h
rename to ccsrc/include/cppsim/compiler.h
diff --git a/third_party/cppsim/include/cppsim_omp.hpp b/ccsrc/include/cppsim/cppsim_omp.hpp
similarity index 100%
rename from third_party/cppsim/include/cppsim_omp.hpp
rename to ccsrc/include/cppsim/cppsim_omp.hpp
diff --git a/third_party/cppsim/include/cpu/schedule.h b/ccsrc/include/cppsim/cpu/schedule.h
similarity index 100%
rename from third_party/cppsim/include/cpu/schedule.h
rename to ccsrc/include/cppsim/cpu/schedule.h
diff --git a/third_party/cppsim/include/fusion.hpp b/ccsrc/include/cppsim/fusion.hpp
similarity index 100%
rename from third_party/cppsim/include/fusion.hpp
rename to ccsrc/include/cppsim/fusion.hpp
diff --git a/third_party/cppsim/include/gpu/schedule.h b/ccsrc/include/cppsim/gpu/schedule.h
similarity index 100%
rename from third_party/cppsim/include/gpu/schedule.h
rename to ccsrc/include/cppsim/gpu/schedule.h
diff --git a/third_party/cppsim/include/gpu_support.h b/ccsrc/include/cppsim/gpu_support.h
similarity index 100%
rename from third_party/cppsim/include/gpu_support.h
rename to ccsrc/include/cppsim/gpu_support.h
diff --git a/third_party/cppsim/include/intrin/alignedallocator.hpp b/ccsrc/include/cppsim/intrin/alignedallocator.hpp
similarity index 95%
rename from third_party/cppsim/include/intrin/alignedallocator.hpp
rename to ccsrc/include/cppsim/intrin/alignedallocator.hpp
index 7719f2d0..7b7715e2 100644
--- a/third_party/cppsim/include/intrin/alignedallocator.hpp
+++ b/ccsrc/include/cppsim/intrin/alignedallocator.hpp
@@ -76,13 +76,15 @@ class aligned_allocator
         std::free(p);
 #endif
     }
-
+#if 0
+    // TODO: max_size() is disabled for now because it fails to compile with the error:
+    // ‘class std::allocator<std::complex<double> >’ has no member named ‘max_size’; did you mean ‘_M_max_size’?
     size_type max_size() const noexcept
     {
         std::allocator<T> a;
         return a.max_size();
     }
-
+#endif
 #if __cplusplus >= 201103L
     template <typename C, class... Args>
     void construct(C* c, Args&&... args)
diff --git a/third_party/cppsim/include/intrin/cintrin.hpp b/ccsrc/include/cppsim/intrin/cintrin.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/cintrin.hpp
rename to ccsrc/include/cppsim/intrin/cintrin.hpp
diff --git a/third_party/cppsim/include/intrin/kernel1.hpp b/ccsrc/include/cppsim/intrin/kernel1.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/kernel1.hpp
rename to ccsrc/include/cppsim/intrin/kernel1.hpp
diff --git a/third_party/cppsim/include/intrin/kernel2.hpp b/ccsrc/include/cppsim/intrin/kernel2.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/kernel2.hpp
rename to ccsrc/include/cppsim/intrin/kernel2.hpp
diff --git a/third_party/cppsim/include/intrin/kernel3.hpp b/ccsrc/include/cppsim/intrin/kernel3.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/kernel3.hpp
rename to ccsrc/include/cppsim/intrin/kernel3.hpp
diff --git a/third_party/cppsim/include/intrin/kernel4.hpp b/ccsrc/include/cppsim/intrin/kernel4.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/kernel4.hpp
rename to ccsrc/include/cppsim/intrin/kernel4.hpp
diff --git a/third_party/cppsim/include/intrin/kernel5.hpp b/ccsrc/include/cppsim/intrin/kernel5.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/kernel5.hpp
rename to ccsrc/include/cppsim/intrin/kernel5.hpp
diff --git a/third_party/cppsim/include/intrin/kernels.hpp b/ccsrc/include/cppsim/intrin/kernels.hpp
similarity index 100%
rename from third_party/cppsim/include/intrin/kernels.hpp
rename to ccsrc/include/cppsim/intrin/kernels.hpp
diff --git a/third_party/cppsim/include/kernelgen.hpp b/ccsrc/include/cppsim/kernelgen.hpp
similarity index 100%
rename from third_party/cppsim/include/kernelgen.hpp
rename to ccsrc/include/cppsim/kernelgen.hpp
diff --git a/third_party/cppsim/include/nointrin/kernel1.hpp b/ccsrc/include/cppsim/nointrin/kernel1.hpp
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernel1.hpp
rename to ccsrc/include/cppsim/nointrin/kernel1.hpp
diff --git a/third_party/cppsim/include/nointrin/kernel2.hpp b/ccsrc/include/cppsim/nointrin/kernel2.hpp
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernel2.hpp
rename to ccsrc/include/cppsim/nointrin/kernel2.hpp
diff --git a/third_party/cppsim/include/nointrin/kernel3.hpp b/ccsrc/include/cppsim/nointrin/kernel3.hpp
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernel3.hpp
rename to ccsrc/include/cppsim/nointrin/kernel3.hpp
diff --git a/third_party/cppsim/include/nointrin/kernel4.hpp b/ccsrc/include/cppsim/nointrin/kernel4.hpp
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernel4.hpp
rename to ccsrc/include/cppsim/nointrin/kernel4.hpp
diff --git a/third_party/cppsim/include/nointrin/kernel5.hpp b/ccsrc/include/cppsim/nointrin/kernel5.hpp
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernel5.hpp
rename to ccsrc/include/cppsim/nointrin/kernel5.hpp
diff --git a/third_party/cppsim/include/nointrin/kernelgen.py b/ccsrc/include/cppsim/nointrin/kernelgen.py
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernelgen.py
rename to ccsrc/include/cppsim/nointrin/kernelgen.py
diff --git a/third_party/cppsim/include/nointrin/kernels.hpp b/ccsrc/include/cppsim/nointrin/kernels.hpp
similarity index 100%
rename from third_party/cppsim/include/nointrin/kernels.hpp
rename to ccsrc/include/cppsim/nointrin/kernels.hpp
diff --git a/third_party/cppsim/include/partitioner.h b/ccsrc/include/cppsim/partitioner.h
similarity index 100%
rename from third_party/cppsim/include/partitioner.h
rename to ccsrc/include/cppsim/partitioner.h
diff --git a/third_party/cppsim/include/schedule.h b/ccsrc/include/cppsim/schedule.h
similarity index 100%
rename from third_party/cppsim/include/schedule.h
rename to ccsrc/include/cppsim/schedule.h
diff --git a/third_party/cppsim/include/simulator.hpp b/ccsrc/include/cppsim/simulator.hpp
similarity index 100%
rename from third_party/cppsim/include/simulator.hpp
rename to ccsrc/include/cppsim/simulator.hpp
diff --git a/third_party/cppsim/include/tempfile.h b/ccsrc/include/cppsim/tempfile.h
similarity index 100%
rename from third_party/cppsim/include/tempfile.h
rename to ccsrc/include/cppsim/tempfile.h
diff --git a/ccsrc/lib/CMakeLists.txt b/ccsrc/lib/CMakeLists.txt
index 9e230d40..fb0e8eed 100644
--- a/ccsrc/lib/CMakeLists.txt
+++ b/ccsrc/lib/CMakeLists.txt
@@ -18,6 +18,7 @@
 
 add_subdirectory(mq_base)
 add_subdirectory(simulator)
+add_subdirectory(cppsim)
 
 # ==============================================================================
 
diff --git a/third_party/cppsim/CMakeLists.txt b/ccsrc/lib/cppsim/CMakeLists.txt
similarity index 84%
rename from third_party/cppsim/CMakeLists.txt
rename to ccsrc/lib/cppsim/CMakeLists.txt
index a9702271..7dcbcaa6 100644
--- a/third_party/cppsim/CMakeLists.txt
+++ b/ccsrc/lib/cppsim/CMakeLists.txt
@@ -5,8 +5,6 @@ project(
   VERSION 1.0.0
   LANGUAGES C CXX)
 
-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/commands)
-
 # Minimum required Python version (used both in this file and in the installed CMake configuration)
 set(CPPSIM_PYTHON_VERSION_MIN 3.7.0)
 
@@ -33,7 +31,8 @@ set(CPPSIM_INSTALL_3RDPARTYDIR "${CPPSIM_INSTALL_LIBDIR}/third_party")
 
 # ==============================================================================
 
-include("${CMAKE_CURRENT_LIST_DIR}/cmake/compiler_has_std_filesystem.cmake")
+# TODO(review): presumably the hard-coded relative path below should be replaced - confirm
+include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/compiler_has_std_filesystem.cmake")
 
 # ==============================================================================
 
@@ -104,19 +103,6 @@ find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Developm
 
 # ------------------------------------------------------------------------------
 
-find_package(pybind11 CONFIG QUIET)
-if(NOT pybind11_FOUND)
-  FetchContent_Declare(
-    pybind11
-    GIT_REPOSITORY https://github.com/pybind/pybind11.git
-    GIT_TAG 68e6fdaa90fc93979e6d5d1e9f788f464593e8f2)
-  FetchContent_MakeAvailable(pybind11)
-else()
-  message(STATUS "Found pybind11 at ${pybind11_DIR}")
-endif()
-
-# ------------------------------------------------------------------------------
-
 if(APPLE)
   find_program(BREW_CMD brew PATHS /usr/local/bin)
   if(BREW_CMD)
@@ -177,6 +163,8 @@ find_package(OpenMP REQUIRED)
 
 # ------------------------------------------------------------------------------
 
+# TODO Shouldn't we take dlfcn-win32 to third_party,
+# or replace it with cross-platform dynalo?
 if(MSVC
    OR MINGW
    OR MSYS)
@@ -212,7 +200,7 @@ include(CMakePackageConfigHelpers)
 
 set(_namespace cppsim::)
 
-configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/cppsimConfig.cmake.in
+configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/../../../cmake/cppsimConfig.cmake.in
                               ${PROJECT_BINARY_DIR}/cppsimConfig.cmake INSTALL_DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
 write_basic_package_version_file(${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake COMPATIBILITY SameMajorVersion)
@@ -220,21 +208,21 @@ write_basic_package_version_file(${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
 install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake ${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake
         DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
-install(DIRECTORY ${PROJECT_SOURCE_DIR}/cmake/commands DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
-install(FILES ${PROJECT_SOURCE_DIR}/cmake/compiler_has_std_filesystem.cmake DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+install(DIRECTORY ${PROJECT_SOURCE_DIR}/../../../cmake/commands DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
+install(FILES ${PROJECT_SOURCE_DIR}/../../../cmake/compiler_has_std_filesystem.cmake DESTINATION ${CPPSIM_INSTALL_CMAKEDIR})
 
 file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE)
 install(FILES ${_headers} DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
-install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/include/cpu ${CMAKE_CURRENT_LIST_DIR}/include/gpu
-                  ${CMAKE_CURRENT_LIST_DIR}/include/intrin ${CMAKE_CURRENT_LIST_DIR}/include/nointrin
+install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/cpu ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/gpu
+                  ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/intrin ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/nointrin
         DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR})
 
 # ==============================================================================
 
-add_library(kernelgen STATIC "src/kernelgen.cpp" "src/compiler.cpp" "src/tempfile.cpp")
+add_library(kernelgen STATIC "kernelgen.cpp" "compiler.cpp" "tempfile.cpp")
 target_compile_features(kernelgen PUBLIC cxx_std_17)
 set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_include_directories(kernelgen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+target_include_directories(kernelgen PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/cppsim>
                                             $<INSTALL_INTERFACE:${CPPSIM_INSTALL_INCLUDEDIR}>)
 
 if(MSVC AND MSVC_ITERATOR_DEBUG)
@@ -244,13 +232,13 @@ endif()
 res_embed(
   TARGET kernelgen
   NAME "nointrin"
-  PATH "${CMAKE_CURRENT_SOURCE_DIR}/include/nointrin/kernelgen.py"
+  PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../include/cppsim/nointrin/kernelgen.py"
   KEYWORD)
 
 target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::module OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS}
                                        ${filesystem_LIBS})
 
-pybind11_add_module(${PROJECT_NAME} MODULE "src/_${PROJECT_NAME}.cpp")
+pybind11_add_module(${PROJECT_NAME} MODULE "_${PROJECT_NAME}.cpp")
 target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen)
 
 # ==============================================================================
@@ -274,9 +262,8 @@ install(
 if(BUILD_TESTING)
   include(kernelgen)
 
-  add_executable(test_nointrin "src/test/test_nointrin.cpp")
+  add_executable(test_nointrin "test/test_nointrin.cpp")
   target_compile_features(test_nointrin PRIVATE cxx_std_17)
-  target_include_directories(test_nointrin PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_nointrin PRIVATE gtest kernelgen Eigen3::Eigen)
 
   kernelgen(
@@ -305,16 +292,14 @@ if(BUILD_TESTING)
     VARIANT nointrin
     COMBINATIONS)
 
-  add_executable(test_popcount "src/test/test_popcount.cpp")
+  add_executable(test_popcount "test/test_popcount.cpp")
   target_compile_features(test_popcount PRIVATE cxx_std_17)
-  target_include_directories(test_popcount PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_popcount PRIVATE Eigen3::Eigen gtest)
 
-  add_executable(test_combinations "src/test/test_combinations.cpp")
+  add_executable(test_combinations "test/test_combinations.cpp")
   target_compile_features(test_combinations PRIVATE cxx_std_17)
-  target_include_directories(test_combinations PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
   target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen OpenMP::OpenMP_CXX)
 
-  add_executable(benchmark "src/benchmark/benchmark.cpp")
+  add_executable(benchmark "benchmark/benchmark.cpp")
   target_link_libraries(benchmark PRIVATE gtest kernelgen)
 endif()
diff --git a/third_party/cppsim/README.md b/ccsrc/lib/cppsim/README.md
similarity index 97%
rename from third_party/cppsim/README.md
rename to ccsrc/lib/cppsim/README.md
index a91d4908..158131f0 100644
--- a/third_party/cppsim/README.md
+++ b/ccsrc/lib/cppsim/README.md
@@ -54,10 +54,6 @@ The proposed generator reproduces the hand-written kernels, and extends support
 ## Testing
 
 ```
-mkdir build
-cd build
-cmake .. -G Ninja
-ninja
 ./test_nointrin
 ./benchmark
 ```
diff --git a/third_party/cppsim/src/_cppsim.cpp b/ccsrc/lib/cppsim/_cppsim.cpp
similarity index 100%
rename from third_party/cppsim/src/_cppsim.cpp
rename to ccsrc/lib/cppsim/_cppsim.cpp
diff --git a/third_party/cppsim/src/benchmark/benchmark.cpp b/ccsrc/lib/cppsim/benchmark/benchmark.cpp
similarity index 100%
rename from third_party/cppsim/src/benchmark/benchmark.cpp
rename to ccsrc/lib/cppsim/benchmark/benchmark.cpp
diff --git a/third_party/cppsim/src/compiler.cpp b/ccsrc/lib/cppsim/compiler.cpp
similarity index 100%
rename from third_party/cppsim/src/compiler.cpp
rename to ccsrc/lib/cppsim/compiler.cpp
diff --git a/third_party/cppsim/src/kernelgen.cpp b/ccsrc/lib/cppsim/kernelgen.cpp
similarity index 100%
rename from third_party/cppsim/src/kernelgen.cpp
rename to ccsrc/lib/cppsim/kernelgen.cpp
diff --git a/third_party/cppsim/src/tempfile.cpp b/ccsrc/lib/cppsim/tempfile.cpp
similarity index 100%
rename from third_party/cppsim/src/tempfile.cpp
rename to ccsrc/lib/cppsim/tempfile.cpp
diff --git a/third_party/cppsim/src/test/test_combinations.cpp b/ccsrc/lib/cppsim/test/test_combinations.cpp
similarity index 100%
rename from third_party/cppsim/src/test/test_combinations.cpp
rename to ccsrc/lib/cppsim/test/test_combinations.cpp
diff --git a/third_party/cppsim/src/test/test_nointrin.cpp b/ccsrc/lib/cppsim/test/test_nointrin.cpp
similarity index 100%
rename from third_party/cppsim/src/test/test_nointrin.cpp
rename to ccsrc/lib/cppsim/test/test_nointrin.cpp
diff --git a/third_party/cppsim/src/test/test_popcount.cpp b/ccsrc/lib/cppsim/test/test_popcount.cpp
similarity index 100%
rename from third_party/cppsim/src/test/test_popcount.cpp
rename to ccsrc/lib/cppsim/test/test_popcount.cpp
diff --git a/third_party/cppsim/cmake/commands/check_code_compiles.cmake b/cmake/commands/check_code_compiles.cmake
similarity index 100%
rename from third_party/cppsim/cmake/commands/check_code_compiles.cmake
rename to cmake/commands/check_code_compiles.cmake
diff --git a/third_party/cppsim/cmake/commands/kernelgen.cmake b/cmake/commands/kernelgen.cmake
similarity index 100%
rename from third_party/cppsim/cmake/commands/kernelgen.cmake
rename to cmake/commands/kernelgen.cmake
diff --git a/third_party/cppsim/cmake/compiler_has_std_filesystem.cmake b/cmake/compiler_has_std_filesystem.cmake
similarity index 100%
rename from third_party/cppsim/cmake/compiler_has_std_filesystem.cmake
rename to cmake/compiler_has_std_filesystem.cmake
diff --git a/third_party/cppsim/cmake/cppsimConfig.cmake.in b/cmake/cppsimConfig.cmake.in
similarity index 100%
rename from third_party/cppsim/cmake/cppsimConfig.cmake.in
rename to cmake/cppsimConfig.cmake.in
diff --git a/cmake/options.cmake b/cmake/options.cmake
index 9d2bdaf8..a89cee63 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -28,7 +28,6 @@ option(ENABLE_PROJECTQ "Enable ProjectQ support" ON)
 option(ENABLE_GITEE "Use Gitee instead of GitHub for checking out third-party dependencies" OFF)
 option(ENABLE_CXX_EXPERIMENTAL "Enable the new (experimental) C++ backend" OFF)
 option(ENABLE_DOCUMENTATION "Enable building of the documentation using Doxygen" OFF)
-option(ENABLE_CPPSIM "Enable the use of cppsim for generating simulator kernels" OFF)
 option(ENABLE_LOGGING "Enable the use of logging in C++" OFF)
 cmake_dependent_option(ENABLE_LOGGING_TRACE_LEVEL "If logging is enabled, log everything down to the TRACE level" OFF
                        "ENABLE_LOGGING" OFF)
diff --git a/docs/source/cmake_reference.rst b/docs/source/cmake_reference.rst
index 44c70cc8..b7ceeb9f 100644
--- a/docs/source/cmake_reference.rst
+++ b/docs/source/cmake_reference.rst
@@ -52,8 +52,6 @@ Descriptions
 +-------------------------------------+-----------------------------------------------------------------------+
 | ``ENABLE_CMAKE_DEBUG``              | Enable verbose output to debug CMake issues                           |
 +-------------------------------------+-----------------------------------------------------------------------+
-| ``ENABLE_CPPSIM``                   | Enable the use of cppsim for generating simulator kernels             |
-+-------------------------------------+-----------------------------------------------------------------------+
 | ``ENABLE_CUDA``                     | Enable the use of CUDA code                                           |
 +-------------------------------------+-----------------------------------------------------------------------+
 | ``ENABLE_CXX_EXPERIMENTAL``         | Enable the building of the (new) experimental C++ backend             |
@@ -139,8 +137,6 @@ Default values
 +-------------------------------------+------------------------------+
 | ``ENABLE_CMAKE_DEBUG``              | OFF                          |
 +-------------------------------------+------------------------------+
-| ``ENABLE_CPPSIM``                   | OFF                          |
-+-------------------------------------+------------------------------+
 | ``ENABLE_CUDA``                     | OFF                          |
 +-------------------------------------+------------------------------+
 | ``ENABLE_CXX_EXPERIMENTAL``         | OFF                          |
diff --git a/scripts/build/default_values.bat b/scripts/build/default_values.bat
index 6ff21dc2..a32063fb 100644
--- a/scripts/build/default_values.bat
+++ b/scripts/build/default_values.bat
@@ -27,7 +27,6 @@ if NOT DEFINED do_update_venv set do_update_venv=0
 if NOT DEFINED dry_run set dry_run=0
 if NOT DEFINED enable_analyzer set enable_analyzer=0
 if NOT DEFINED enable_ccache set enable_ccache=0
-if NOT DEFINED enable_cppsim set enable_cppsim=0
 if NOT DEFINED enable_cxx set enable_cxx=0
 if NOT DEFINED enable_gitee set enable_gitee=0
 if NOT DEFINED enable_gpu set enable_gpu=0
diff --git a/scripts/build/default_values.conf b/scripts/build/default_values.conf
index e27d3efc..700dca0f 100644
--- a/scripts/build/default_values.conf
+++ b/scripts/build/default_values.conf
@@ -22,7 +22,6 @@ do_update_venv = false
 enable_analyzer = false
 enable_gitee = false
 enable_ccache = false
-enable_cppsim = false
 enable_cxx = false
 enable_gpu = false
 enable_projectq = true
diff --git a/scripts/build/parse_common_args.ps1 b/scripts/build/parse_common_args.ps1
index c0367dd5..e9bffa37 100644
--- a/scripts/build/parse_common_args.ps1
+++ b/scripts/build/parse_common_args.ps1
@@ -76,7 +76,6 @@ function Help-Message() {
     Write-Output '  -Config [dir]       Path to INI configuration file with default values for the parameters'
     Write-Output ("                      Defaults to: {0}" -f $config_file)
     Write-Output '                      NB: command line arguments always take precedence over configuration file values'
-    Write-Output '  -CppSim             (experimental) Enable the use of cppsim to generate simulation kernels'
     Write-Output '  -Cxx                (experimental) Enable MindQuantum C++ support'
     Write-Output '  -Debug              Build in debug mode'
     Write-Output '  -DebugCMake         Enable debugging mode for CMake configuration step'
@@ -189,10 +188,6 @@ if (([bool]$CleanVenv)) {
     Set-Value 'do_clean_venv'
 }
 
-if (([bool]$CppSim)) {
-    Set-Value 'enable_cppsim'
-}
-
 if (([bool]$Cxx)) {
     Set-Value 'enable_cxx'
 }
diff --git a/scripts/build/parse_common_args.sh b/scripts/build/parse_common_args.sh
index 905763c0..ab096eca 100755
--- a/scripts/build/parse_common_args.sh
+++ b/scripts/build/parse_common_args.sh
@@ -103,7 +103,6 @@ help_message() {
     echo '  --config=[dir]         Path to INI configuration file with default values for the parameters'
     echo "                         Defaults to: $config_file"
     echo '                         NB: command line arguments always take precedence over configuration file values'
-    echo '  --cppsim               (experimental) Enable the use of cppsim to generate simulation kernels'
     echo '  --cxx                  (experimental) Enable MindQuantum C++ support'
     echo '  --debug                Build in debug mode'
     echo '  --debug-cmake          Enable debugging mode for CMake configuration step'
@@ -228,9 +227,6 @@ while getopts "${getopts_args}" OPT; do
         cuda-arch )         needs_arg;
                             set_var cuda_arch "$(echo "$OPTARG" | tr ',' ';')"
                             ;;
-        cppsim )            no_arg;
-                            set_var enable_cppsim $flag_value
-                            ;;
         cxx )               no_arg;
                             set_var enable_cxx $flag_value
                             ;;
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 587f9ab0..f2744839 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -69,10 +69,7 @@ if(ENABLE_CXX_EXPERIMENTAL)
   include(${CMAKE_CURRENT_LIST_DIR}/tweedledum/tweedledum.cmake)
 
   # cppsim dependencies
-  if(ENABLE_CPPSIM)
-    include(${CMAKE_CURRENT_LIST_DIR}/res_embed/res_embed.cmake)
-    include(${CMAKE_CURRENT_LIST_DIR}/cppsim/cppsim.cmake)
-  endif()
+  include(${CMAKE_CURRENT_LIST_DIR}/res_embed/res_embed.cmake)
 endif()
 
 # ==============================================================================
diff --git a/third_party/cppsim/.cmake-format.yaml b/third_party/cppsim/.cmake-format.yaml
deleted file mode 100644
index e38933b6..00000000
--- a/third_party/cppsim/.cmake-format.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
----
-
-markup:
-  first_comment_is_literal: true
-format:
-  disable: false
-  line_width: 120
-  tab_size: 2
-  use_tabchars: false
-  max_subgroups_hwrap: 2
-  max_pargs_hwrap: 6
-  max_rows_cmdline: 2
-  separate_ctrl_name_with_space: false
-  separate_fn_name_with_space: false
-  dangle_parens: false
-  dangle_align: prefix
-  min_prefix_chars: 4
-  max_prefix_chars: 10
-  max_lines_hwrap: 2
-  line_ending: unix
-  command_case: canonical
-  keyword_case: unchanged
-  enable_sort: true
-  autosort: false
-  require_valid_layout: false
-parse:
-  additional_commands:
-    res_embed:
-      pargs:
-        flags:
-          - KEYWORD
-      kwargs:
-        TARGET: 1
-        NAME: 1
-        PATH: 1
-        DEPENDS: 1+
-    kernelgen:
-      pargs:
-        flags:
-          - COMBINATIONS
-      kwargs:
-        TARGET: 1
-        NQUBITS: 1
-        VARIANT: 1
diff --git a/third_party/cppsim/.gitmodules b/third_party/cppsim/cmake/commands/a
similarity index 100%
rename from third_party/cppsim/.gitmodules
rename to third_party/cppsim/cmake/commands/a
diff --git a/third_party/cppsim/cppsim.cmake b/third_party/cppsim/cppsim.cmake
deleted file mode 100644
index ab9c67bf..00000000
--- a/third_party/cppsim/cppsim.cmake
+++ /dev/null
@@ -1,70 +0,0 @@
-# ==============================================================================
-#
-# Copyright 2022 <Huawei Technologies Co., Ltd>
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ==============================================================================
-
-set(VER 1.0.0)
-set(GIT_TAG "f0c786a99833e73c28378450582d3c425095adb3")
-
-if(ENABLE_GITEE)
-  set(GIT_REPOSITORY "https://gitee.com/dmikushin/cppsim.git")
-else()
-  set(GIT_REPOSITORY "https://github.com/dmikushin/cppsim.git")
-endif()
-
-set(CMAKE_OPTION
-    -DBUILD_TESTING=OFF
-    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    -DEigen3_DIR=${Eigen3_DIR}
-    -DPYTHON_EXECUTABLE=${Python_EXECUTABLE}
-    -DPython3_EXECUTABLE=${Python_EXECUTABLE}
-    -DPython_EXECUTABLE=${Python_EXECUTABLE}
-    -Ddigestpp_DIR=${digestpp_DIR}
-    -Dpybind11_DIR=${pybind11_DIR}
-    -Dres_embed_DIR=${res_embed_DIR})
-
-if(APPLE)
-  foreach(
-    _var
-    OpenMP_C_FLAGS
-    OpenMP_C_INCLUDE_DIR
-    OpenMP_C_LIB_NAMES
-    OpenMP_CXX_FLAGS
-    OpenMP_CXX_INCLUDE_DIR
-    OpenMP_CXX_LIB_NAMES
-    OpenMP_gomp_LIBRARY
-    OpenMP_libomp_LIBRARY
-    OpenMP_pthread_LIBRARY)
-    if(NOT "${${_var}}" STREQUAL "")
-      list(APPEND CMAKE_OPTION -D${_var}=${${_var}})
-    endif()
-  endforeach()
-endif()
-
-if(NOT _Boost_SYSTEM)
-  # Boost was locally built, make sure we use that one
-  list(APPEND CMAKE_OPTION -DBOOST_ROOT=${Boost_DIRPATH} -DBoost_NO_SYSTEM_PATHS:BOOL=ON)
-endif()
-
-mindquantum_add_pkg(
-  cppsim
-  VER ${VER}
-  GIT_REPOSITORY ${GIT_REPOSITORY}
-  GIT_TAG ${GIT_TAG}
-  MD5 "xxxx" # NB: would be required if local server is enabled for downloads
-  CMAKE_OPTION ${CMAKE_OPTION}
-  FORCE_LOCAL_PKG
-  TARGET_ALIAS mindquantum::cppsim cppsim::kernelgen)