diff --git a/build.bat b/build.bat index 88ee055f..c3037e10 100644 --- a/build.bat +++ b/build.bat @@ -372,7 +372,6 @@ call %SCRIPTDIR%\dos\build_cmake_option.bat BUILD_TESTING !enable_tests! call %SCRIPTDIR%\dos\build_cmake_option.bat CLEAN_3RDPARTY_INSTALL_DIR !do_clean_3rdparty! call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_ANALYZER !enable_analyzer! call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CMAKE_DEBUG !cmake_debug_mode! -call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CPPSIM !enable_cppsim! call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CUDA !enable_gpu! call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_CXX_EXPERIMENTAL !enable_cxx! call %SCRIPTDIR%\dos\build_cmake_option.bat ENABLE_GITEE !enable_gitee! @@ -504,7 +503,6 @@ rem ============================================================================ echo /CleanBuildDir Delete build directory before building echo /CleanCache Re-run CMake with a clean CMake cache echo /CleanVenv Delete Python virtualenv before building - echo /CppSim (experimental) Enable the use of cppsim to generate simulation kernels echo /Cxx (experimental) Enable MindQuantum C++ support echo /Debug Build in debug mode echo /Delocate Delocate the binary wheels after build is finished diff --git a/build.ps1 b/build.ps1 index b8012b58..875fd1c4 100644 --- a/build.ps1 +++ b/build.ps1 @@ -24,7 +24,6 @@ Param( [switch]$CleanCache, [switch]$CleanVenv, [ValidateNotNullOrEmpty()][string]$Config, - [switch]$CppSim, [ValidateNotNullOrEmpty()][string]$CudaArch, [switch]$Cxx, [switch]$Debug, @@ -488,9 +487,6 @@ Do not use the CMake registry to find packages .PARAMETER Config Path to INI configuration file with default values for the parameters -.PARAMETER CppSim -(experimental) Enable the use of cppsim to generate simulation kernels - .PARAMETER Cxx (experimental) Enable MindQuantum C++ support diff --git a/build_locally.bat b/build_locally.bat index 81d54636..dc6da764 100644 --- a/build_locally.bat +++ b/build_locally.bat @@ 
-132,11 +132,6 @@ rem ============================================================================ shift & shift & goto :initial ) - if /I "%1" == "/CppSim" ( - set enable_cppsim=1 - shift & goto :initial - ) - if /I "%1" == "/Cxx" ( set enable_cxx=1 shift & goto :initial @@ -348,7 +343,6 @@ call %SCRIPTDIR%\dos\build_locally_cmake_option.bat BUILD_TESTING !enable_tests! call %SCRIPTDIR%\dos\build_locally_cmake_option.bat CLEAN_3RDPARTY_INSTALL_DIR !do_clean_3rdparty! call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_ANALYZER !enable_analyzer! call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CMAKE_DEBUG !cmake_debug_mode! -call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CPPSIM !enable_cppsim! call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CUDA !enable_gpu! call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_CXX_EXPERIMENTAL !enable_cxx! call %SCRIPTDIR%\dos\build_locally_cmake_option.bat ENABLE_DOCUMENTATION !do_docs! @@ -546,7 +540,6 @@ exit /B 0 echo /CleanCache Re-run CMake with a clean CMake cache echo /CleanVenv Delete Python virtualenv before building echo /ConfigureOnly Stop after the CMake configure and generation steps (ie. 
before building MindQuantum) - echo /CppSim (experimental) Enable the use of cppsim to generate simulation kernels echo /Cxx (experimental) Enable MindQuantum C++ support echo /Debug Build in debug mode echo /DebugCMake Enable debugging mode for CMake configuration step diff --git a/build_locally.ps1 b/build_locally.ps1 index 6b0c4f43..4018d497 100644 --- a/build_locally.ps1 +++ b/build_locally.ps1 @@ -27,7 +27,6 @@ Param( [ValidateNotNullOrEmpty()][string]$Config, [Alias("C")][switch]$Configure, [switch]$ConfigureOnly, - [switch]$CppSim, [ValidateNotNullOrEmpty()][string]$CudaArch, [switch]$Cxx, [switch]$Debug, @@ -244,7 +243,6 @@ $cmake_args = @('-DIN_PLACE_BUILD:BOOL=ON' "-DENABLE_ANALYZER:BOOL={0}" -f $CMAKE_BOOL[$enable_analyzer] "-DENABLE_PROJECTQ:BOOL={0}" -f $CMAKE_BOOL[$enable_projectq] "-DENABLE_CMAKE_DEBUG:BOOL={0}" -f $CMAKE_BOOL[$cmake_debug_mode] - "-DENABLE_CPPSIM:BOOL={0}" -f $CMAKE_BOOL[$enable_cppsim] "-DENABLE_CUDA:BOOL={0}" -f $CMAKE_BOOL[$enable_gpu] "-DENABLE_CXX_EXPERIMENTAL:BOOL={0}" -f $CMAKE_BOOL[$enable_cxx] "-DENABLE_DOCUMENTATION:BOOL={0}" -f $CMAKE_BOOL[$do_docs] @@ -439,9 +437,6 @@ Path to INI configuration file with default values for the parameters .PARAMETER ConfigureOnly Stop after the CMake configure and generation steps (ie. 
before building MindQuantum) -.PARAMETER CppSim -(experimental) Enable the use of cppsim to generate simulation kernels - .PARAMETER Cxx (experimental) Enable MindQuantum C++ support diff --git a/build_locally.sh b/build_locally.sh index 69a83bb7..b5e27025 100755 --- a/build_locally.sh +++ b/build_locally.sh @@ -169,7 +169,6 @@ cmake_args=(-DIN_PLACE_BUILD:BOOL=ON -DENABLE_PROJECTQ:BOOL="${CMAKE_BOOL[$enable_projectq]}" -DENABLE_CMAKE_DEBUG:BOOL="${CMAKE_BOOL[$cmake_debug_mode]}" -DENABLE_CUDA:BOOL="${CMAKE_BOOL[$enable_gpu]}" - -DENABLE_CPPSIM:BOOL="${CMAKE_BOOL[$enable_cppsim]}" -DENABLE_CXX_EXPERIMENTAL:BOOL="${CMAKE_BOOL[$enable_cxx]}" -DENABLE_DOCUMENTATION:BOOL="${CMAKE_BOOL[$do_docs]}" -DENABLE_GITEE:BOOL="${CMAKE_BOOL[$enable_gitee]}" diff --git a/ccsrc/include/cppsim/combinations.h b/ccsrc/include/cppsim/combinations.h new file mode 100644 index 00000000..7698fb16 --- /dev/null +++ b/ccsrc/include/cppsim/combinations.h @@ -0,0 +1,185 @@ +#ifndef COMBINATIONS_H +#define COMBINATIONS_H + +#include "gpu_support.h" + +#include +#include +#include +#include +#include + +// Iterate through the combinations using currying approach: https://stackoverflow.com/a/54508163/4063520 +class Combinations +{ +// ***************************************************************************** +// Simple case: no sum constraint +// ***************************************************************************** +private : + + template< + uint32_t n0, uint32_t n1, uint32_t ...n, // max allowed sequence element value + class Callable + > + GPU_SUPPORT + static constexpr void _iterate(Callable&& c) + { + for (uint32_t i = 0; i < n0; i += 2 * n1) + { + auto bind_an_argument = [i, &c](auto... 
args) + { + c(i, args...); + }; + + _iterate(bind_an_argument); + } + } + + template< + uint32_t n0, // max allowed sequence element value + class Callable + > + GPU_SUPPORT + static constexpr void _iterate(Callable&& c) + { + for (uint32_t i = 0; i < n0; i++) + { + c(i); + } + } + + +public : + + template + struct Combination + { + using type = std::array; + }; + + // Iterate through all combinations. + // For each combination, call a user-provided function. + template< + uint32_t ...n, // max allowed sequence element value + class Callable + > + GPU_SUPPORT + static constexpr void iterate(Callable&& c) + { + _iterate(c); + } + + // Tell the length of a combination supplied by the iterator + // configured with the given set of template parameters. + template + static constexpr uint32_t length() + { + return sizeof...(n); + } + + // Tell the number of combinations supplied by the iterator + // configured with the given set of template parameters. + template + static uint32_t _popcount() + { + return n0; + } + + // Tell the number of combinations supplied by the iterator + // configured with the given set of template parameters. + template + static uint32_t _popcount() + { + return (n0 / (2 * n1)) * _popcount(); + } + + // Tell the number of combinations supplied by the iterator + // configured with the given set of template parameters. + template + static uint32_t popcount() + { + if (sizeof...(n) == 0) return 0; + return _popcount(); + } + + // Reverse the order of elements in a combination + // configured with the given set of template parameters. 
+ template + static void reverse(typename Combination::type& c) + { + std::reverse(c.begin(), c.end()); + } + +// ***************************************************************************** +// Simple case: no sum constraint with a user-defined range +// ***************************************************************************** +public : + + template< + uint32_t n0, uint32_t n1, uint32_t ...n, // max allowed sequence element value + class Callable + > + GPU_SUPPORT + static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c) + { + for (uint32_t i = *start; (i < n0) && limit; i += 2 * n1) + { + // Flush starting point to zero, in order for all subsequent iterations + // to start from zero as usual. + *start = 0; + + auto bind_an_argument = [i, &c](auto... args) + { + c(i, args...); + }; + + _iterate(start++, limit, bind_an_argument); + } + } + + template< + uint32_t n0, // max allowed sequence element value + class Callable + > + GPU_SUPPORT + static constexpr void _iterate(uint32_t* start, uint32_t& limit, Callable&& c) + { + for (uint32_t i = *start; (i < n0) && limit; i++) + { + // Flush starting point to zero, in order for all subsequent iterations + // to start from zero as usual. + *start = 0; + + c(i); + limit--; + } + } + +public : + + // Iterate through combinations with specific starting point and duration. + // For each combination, call a user-provided function. + template< + uint32_t ...n, // max allowed sequence element value + class Callable + > + GPU_SUPPORT + static constexpr void iterate(const typename Combination::type& start_, const uint32_t limit_, Callable&& c) + { + auto start = start_; + uint32_t limit = limit_; + _iterate(start.data(), limit, c); + } + + // Tell the number of combinations supplied by the iterator + // configured with the given set of template parameters. + template + static uint32_t popcount(const uint32_t limit) + { + // XXX Actually could be less than limit, if start is closer to the end. 
+ return limit; + } +}; + +#endif // COMBINATIONS_H + diff --git a/ccsrc/include/cppsim/compiler.h b/ccsrc/include/cppsim/compiler.h new file mode 100644 index 00000000..f251bf9d --- /dev/null +++ b/ccsrc/include/cppsim/compiler.h @@ -0,0 +1,19 @@ +#ifndef COMPILER_H +#define COMPILER_H + +#include + +class Compiler +{ +public : + + void* codegen(int nqubits, const std::string& source, std::string& errmsg); + + Compiler(); + +}; + +Compiler& get_compiler(); + +#endif // COMPILER_H + diff --git a/ccsrc/include/cppsim/cppsim_omp.hpp b/ccsrc/include/cppsim/cppsim_omp.hpp new file mode 100644 index 00000000..383290f1 --- /dev/null +++ b/ccsrc/include/cppsim/cppsim_omp.hpp @@ -0,0 +1,18 @@ +#ifndef CPPSIM_OMP_HPP +#define CPPSIM_OMP_HPP + +#include + +#if defined(_OPENMP) +# include +#endif + +namespace omp { +#ifdef _MSC_VER +using idx_t = int64_t; +#else +using idx_t = uint64_t; +#endif // _MSC_VER +} // namespace omp + +#endif /* CPPSIM_OMP_HPP */ diff --git a/ccsrc/include/cppsim/cpu/schedule.h b/ccsrc/include/cppsim/cpu/schedule.h new file mode 100644 index 00000000..5dde71d8 --- /dev/null +++ b/ccsrc/include/cppsim/cpu/schedule.h @@ -0,0 +1,101 @@ +#ifndef SCHEDULE_CPU_H +#define SCHEDULE_CPU_H + +#include "partitioner.h" + +#ifdef _OPENMP +#include +#endif +#include +#include +#include + +namespace cpu { + +template< + class Contexts, + class Callable, + uint32_t ...Args // Underlying combination parameters +> +class Schedule +{ + using Combination = typename Combinations::template Combination::type; + + int nworkers; + uint32_t maxCombinationsPerWorker; + + std::vector starts; + + Callable c; + +public : + + int getWorkersCount() const { return nworkers; } + + const char* getName() const { return "cpu"; } + + Schedule(int nworkers_, Callable c_) : + nworkers(nworkers_), c(c_) + { + // Calculate workers partitions on host, which should be + // fast, as the iterator body is trivial. 
Then we re-use + // this schedule to perform the real iterations with a + // meaningful user-defined iterator body. + Partitioner::template partition( + starts, nworkers, maxCombinationsPerWorker); + } + + void execute(Contexts ctxs) + { + #pragma omp parallel for + for (int iworker = 0; iworker < nworkers; iworker++) + { + auto& ctx = ctxs[iworker]; + uint32_t limit = maxCombinationsPerWorker; + Combinations::template iterate(starts[iworker], limit, [&](auto... args) + { + c(ctx, args...); + }); + } + } +}; + +// We need to know all of the types participating in the used-defined +// combination specialization, in order to estimate the maximum number +// of blocks that could simultaneously fit into the GPU. By using this +// number multipled by the number of SMs, we partition the workload +// most evenly. +template< + class Contexts, + uint32_t ...Args, // Underlying combination parameters + class Callable +> +auto make_schedule(Callable c, int nworkers = 0) +{ + if (nworkers == 0) + { +#ifdef _OPENMP + #pragma omp parallel + { + #pragma omp master + { + nworkers = omp_get_num_threads(); + } + } +#else + nworkers = 1; +#endif + } + + if (nworkers <= 0) nworkers = 1; + + return Schedule< + Contexts, + Callable, + Args...>(nworkers, c); +} + +} // namespace cpu + +#endif // SCHEDULE_CPU_H + diff --git a/ccsrc/include/cppsim/fusion.hpp b/ccsrc/include/cppsim/fusion.hpp new file mode 100644 index 00000000..6539ea91 --- /dev/null +++ b/ccsrc/include/cppsim/fusion.hpp @@ -0,0 +1,167 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GATE_QUEUE_HPP_ +#define GATE_QUEUE_HPP_ + +#include +#include +#include +#include +#include +#include "intrin/alignedallocator.hpp" + +class Item{ +public: + using Index = unsigned; + using IndexVector = std::vector; + using Complex = std::complex; + using Matrix = std::vector>>; + Item(Matrix mat, IndexVector idx) : mat_(mat), idx_(idx) {} + Matrix& get_matrix() { return mat_; } + IndexVector& get_indices() { return idx_; } +private: + Matrix mat_; + IndexVector idx_; +}; + +class Fusion{ +public: + using Index = unsigned; + using IndexSet = std::set; + using IndexVector = std::vector; + using Complex = std::complex; + using Matrix = std::vector>>; + using ItemVector = std::vector; + + unsigned num_qubits() { + return set_.size(); + } + + std::size_t size() const { + return items_.size(); + } + + void insert(Matrix matrix, IndexVector index_list, IndexVector const& ctrl_list = {}){ + for (auto idx : index_list) + set_.emplace(idx); + + handle_controls(matrix, index_list, ctrl_list); + Item item(matrix, index_list); + items_.push_back(item); + } + + void perform_fusion(Matrix& fused_matrix, IndexVector& index_list, IndexVector& ctrl_list){ + for (auto idx : set_) + index_list.push_back(idx); + + std::size_t N = num_qubits(); + fused_matrix = Matrix(1UL<>(1UL< oldcol(1UL<> idx2mat[l])&1UL)<> l)&1UL) != ((i >> idx2mat[l])&1UL)) + locidx ^= (1UL << idx2mat[l]); + res += oldcol[locidx] * item.get_matrix()[local_i][j]; + } + M[i][k] = res; + } + } + } + ctrl_list.reserve(ctrl_set_.size()); + for (auto ctrl : ctrl_set_) + 
ctrl_list.push_back(ctrl); + } + +private: + void add_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& new_ctrls){ + indexList.reserve(indexList.size()+new_ctrls.size()); + indexList.insert(indexList.end(), new_ctrls.begin(), new_ctrls.end()); + + std::size_t F = (1UL << new_ctrls.size()); + Matrix newmatrix(F*matrix.size(), std::vector>(F*matrix.size(), 0.)); + + std::size_t Offset = newmatrix.size()-matrix.size(); + + for (std::size_t i = 0; i < Offset; ++i) + newmatrix[i][i] = 1.; + for (std::size_t i = 0; i < matrix.size(); ++i){ + for (std::size_t j = 0; j < matrix.size(); ++j) + newmatrix[Offset+i][Offset+j] = matrix[i][j]; + } + matrix = std::move(newmatrix); + } + + void handle_controls(Matrix &matrix, IndexVector &indexList, IndexVector const& ctrlList){ + auto unhandled_ctrl = ctrl_set_; // will contain all ctrls that are not part of the new command + // --> need to be removed from the global mask and the controls incorporated into the old + // commands (the ones already in the list). 
+ + for (auto ctrlIdx : ctrlList){ + if (ctrl_set_.count(ctrlIdx) == 0){ // need to either add it to the list or to the command + if (items_.size() > 0){ // add it to the command + add_controls(matrix, indexList, {ctrlIdx}); + set_.insert(ctrlIdx); + } + else // add it to the list + ctrl_set_.emplace(ctrlIdx); + } + else + unhandled_ctrl.erase(ctrlIdx); + } + // remove global controls which are no longer global (because the current command didn't + // have it) + if (unhandled_ctrl.size() > 0){ + IndexVector new_ctrls; + new_ctrls.reserve(unhandled_ctrl.size()); + for (auto idx : unhandled_ctrl){ + new_ctrls.push_back(idx); + ctrl_set_.erase(idx); + set_.insert(idx); + } + for (auto &item : items_) + add_controls(item.get_matrix(), item.get_indices(), new_ctrls); + } + } + + IndexSet set_; + ItemVector items_; + IndexSet ctrl_set_; +}; + +#endif diff --git a/ccsrc/include/cppsim/gpu/schedule.h b/ccsrc/include/cppsim/gpu/schedule.h new file mode 100644 index 00000000..3030ff97 --- /dev/null +++ b/ccsrc/include/cppsim/gpu/schedule.h @@ -0,0 +1,170 @@ +#if defined(__CUDACC__) || defined(__HIPCC__) + +#ifndef SCHEDULE_GPU_H +#define SCHEDULE_GPU_H + +#include "partitioner.h" + +#include +#include +#include +#if defined(__CUDACC__) +#include +#include +#else +#include +#include +#endif + +namespace gpu { + +template< + class Contexts, + class Callable, + class Starts, + uint32_t ...Args // Underlying combination parameters +> +__global__ void kernel(Contexts ctxs, Callable c, + int nworkers, Starts starts, uint32_t maxCombinationsPerWorker) +{ + int iworker = threadIdx.x + blockDim.x * blockIdx.x; + if (iworker >= nworkers) return; + + auto& ctx = ctxs[iworker]; + Combinations::template iterate( + starts[iworker], maxCombinationsPerWorker, [&] __device__ (auto... 
args) + { + c(ctx, args...); + }); +} + +template< + class Contexts, + class Callable, + uint32_t ...Args // Underlying combination parameters +> +class Schedule +{ + using Combination = typename Combinations::template Combination::type; + + int nblocks; + + int nworkers; + uint32_t maxCombinationsPerWorker; + + thrust::device_vector starts; + + Callable c; + + std::string name; + +public : + + int getWorkersCount() const { return nworkers; } + + const char* getName() const { return name.c_str(); } + + Schedule(int nworkers_, int nblocks_, Callable c_) : + nworkers(nworkers_), nblocks(nblocks_), c(c_) + { + // Calculate workers partitions on host, which should be + // fast, as the iterator body is trivial. Then we re-use + // this schedule to perform the real iterations with a + // meaningful user-defined iterator body. + thrust::host_vector startsHost; + Partitioner::template partition( + startsHost, nworkers, maxCombinationsPerWorker); + starts = startsHost; + + // Get the GPU name. +#if defined(__CUDACC__) + cudaDeviceProp props; + ::gpu::checkErrorStatus(cudaGetDeviceProperties(&props, 0)); +#elif defined(__HIPCC__) + hipDeviceProp_t props; + ::gpu::checkErrorStatus(hipGetDeviceProperties(&props, 0)); +#endif + name = props.name; + } + + void execute(Contexts ctxs) + { + auto startsPtr = thrust::raw_pointer_cast(starts.data()); + kernel< + Contexts, + Callable, + Combination*, + Combinations, + Args...><<>>( + ctxs, c, nworkers, startsPtr, maxCombinationsPerWorker); +#if defined(__CUDACC__) + ::gpu::checkErrorStatus(cudaGetLastError()); + ::gpu::checkErrorStatus(cudaDeviceSynchronize()); +#elif defined(__HIPCC__) + ::gpu::checkErrorStatus(hipGetLastError()); + ::gpu::checkErrorStatus(hipDeviceSynchronize()); +#endif + } +}; + +// We need to know all of the types participating in the used-defined +// combination specialization, in order to estimate the maximum number +// of blocks that could simultaneously fit into the GPU. 
By using this +// number multipled by the number of SMs, we partition the workload +// most evenly. +template< + class Contexts, + uint32_t ...Args, // Underlying combination parameters + class Callable +> +auto make_schedule(Callable c, int nworkers = 0) +{ + using Combination = typename Combinations::template Combination::type; + + struct cudaFuncAttributes attrs; + ::gpu::checkErrorStatus(cudaFuncGetAttributes(&attrs, + kernel)); + printf("%d registers per thread\n", attrs.numRegs); + + if (nworkers) + { + // Get the GPU compute grid from the user-specified + // number of workers. + int nblocks = nworkers / ::gpu::nthreadsPerBlock; + if (nworkers % ::gpu::nthreadsPerBlock) nblocks++; + + return Schedule< + Contexts, + Callable, + Args...>(nworkers, nblocks, c); + } + + int nblocks = 0; + const size_t dynamicSMemSize = 0; +#if defined(__CUDACC__) + ::gpu::checkErrorStatus(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &nblocks, kernel, + ::gpu::nthreadsPerBlock, dynamicSMemSize)); + cudaDeviceProp props; + ::gpu::checkErrorStatus(cudaGetDeviceProperties(&props, 0)); +#elif defined(__HIPCC__) + ::gpu::checkErrorStatus(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &nblocks, kernel, + ::gpu::nthreadsPerBlock, dynamicSMemSize)); + hipDeviceProp_t props; + ::gpu::checkErrorStatus(hipGetDeviceProperties(&props, 0)); +#endif + nblocks *= props.multiProcessorCount; + + return Schedule< + Contexts, + Callable, + Args...>(nblocks * ::gpu::nthreadsPerBlock, nblocks, c); +} + +} // namespace gpu + +#endif // SCHEDULE_GPU_H + +#endif // defined(__CUDACC__) || defined(__HIPCC__) + diff --git a/ccsrc/include/cppsim/gpu_support.h b/ccsrc/include/cppsim/gpu_support.h new file mode 100644 index 00000000..b513fc80 --- /dev/null +++ b/ccsrc/include/cppsim/gpu_support.h @@ -0,0 +1,68 @@ +#ifndef GPU_SUPPORT_H +#define GPU_SUPPORT_H + +#ifdef __CUDACC__ +#include +#define GPU_SUPPORT __host__ __device__ +#else +#define GPU_SUPPORT +#endif + +#include +#include + +namespace gpu { + 
+// TODO 128 threads per block should be preferred, +// but also need to respect the upper limit that could +// be introduced by the kernel (e.g. due to the user code, +// which requires a lot of registers). +constexpr const int nthreadsPerBlock = 128; + +#if defined(__CUDACC__) || defined(__HIPCC__) + +#ifdef __CUDACC__ +constexpr const auto gpuSuccess = cudaSuccess; +constexpr const auto GPU_SUCCESS = CUDA_SUCCESS; +#else +constexpr const auto gpuSuccess = hipSuccess; +constexpr const auto GPU_SUCCESS = HIP_SUCCESS; +#endif + +template +void checkErrorStatus(gpuError_t status) +{ + if (status == gpuSuccess) return; + + std::stringstream ss; + ss << "GPU runtime error, errno = "; + ss << status; + ss << " ("; +#if defined(__CUDACC__) + ss << cudaGetErrorString(status); +#elif defined(__HIPCC__) + ss << hipGetErrorString(status); +#endif + ss << ")"; + std::string errorString = ss.str(); + throw std::invalid_argument(errorString); +} + +template +void checkErrorStatusDriver(CUresult status) +{ + if (status == GPU_SUCCESS) return; + + std::stringstream ss; + ss << "GPU driver runtime error, errno = "; + ss << status; + std::string errorString = ss.str(); + throw std::invalid_argument(errorString); +} + +#endif // defined(__CUDACC__) || defined(__HIPCC__) + +} // namespace gpu + +#endif // GPU_SUPPORT_H + diff --git a/ccsrc/include/cppsim/intrin/alignedallocator.hpp b/ccsrc/include/cppsim/intrin/alignedallocator.hpp new file mode 100644 index 00000000..7b7715e2 --- /dev/null +++ b/ccsrc/include/cppsim/intrin/alignedallocator.hpp @@ -0,0 +1,121 @@ +// Copyright (C) 2012 Andreas Hehn . + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef _WIN32 +#include +#else +#include +#endif +#include +#include +#include + +#if __cplusplus < 201103L +#define noexcept +#endif + + +template +class aligned_allocator +{ + public: + typedef T* pointer; + typedef T const* const_pointer; + typedef T& reference; + typedef T const& const_reference; + typedef T value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + template + struct rebind + { + typedef aligned_allocator other; + }; + + aligned_allocator() noexcept {} + aligned_allocator(aligned_allocator const&) noexcept {} + template + aligned_allocator(aligned_allocator const&) noexcept + { + } + + pointer allocate(size_type n) + { + pointer p; + + +#ifdef _WIN32 + p = reinterpret_cast(_aligned_malloc(n * sizeof(T), Alignment)); + if (p == 0) throw std::bad_alloc(); +#else + if (posix_memalign(reinterpret_cast(&p), Alignment, n * sizeof(T))) + throw std::bad_alloc(); +#endif + return p; + } + + void deallocate(pointer p, size_type) noexcept + { +#ifdef _WIN32 + _aligned_free(p); +#else + std::free(p); +#endif + } +#if 0 + // TODO + // class std::allocator >’ has no member named ‘max_size’; did you mean ‘_M_max_size’? + size_type max_size() const noexcept + { + std::allocator a; + return a.max_size(); + } +#endif +#if __cplusplus >= 201103L + template + void construct(C* c, Args&&... 
args) + { + new ((void*)c) C(std::forward(args)...); + } +#else + void construct(pointer p, const_reference t) { new ((void*)p) T(t); } +#endif + + template + void destroy(C* c) + { + c->~C(); + } + + bool operator==(aligned_allocator const&) const noexcept { return true; } + bool operator!=(aligned_allocator const&) const noexcept { return false; } + template + bool operator==(aligned_allocator const&) const noexcept + { + return false; + } + + template + bool operator!=(aligned_allocator const&) const noexcept + { + return true; + } +}; + +#if __cplusplus < 201103L +#undef noexcept +#endif diff --git a/ccsrc/include/cppsim/intrin/cintrin.hpp b/ccsrc/include/cppsim/intrin/cintrin.hpp new file mode 100644 index 00000000..4319ada2 --- /dev/null +++ b/ccsrc/include/cppsim/intrin/cintrin.hpp @@ -0,0 +1,124 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CINTRIN_HPP_ +#define CINTRIN_HPP_ + +#include +#include + +#ifndef _mm256_set_m128d +#define _mm256_set_m128d(hi,lo) _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), (hi), 0x1) +#endif +#ifndef _mm256_storeu2_m128d +#define _mm256_storeu2_m128d(hiaddr,loaddr,a) do { __m256d _a = (a); _mm_storeu_pd((loaddr), _mm256_castpd256_pd128(_a)); _mm_storeu_pd((hiaddr), _mm256_extractf128_pd(_a, 0x1)); } while (0) +#endif +#ifndef _mm256_loadu2_m128d +#define _mm256_loadu2_m128d(hiaddr,loaddr) _mm256_set_m128d(_mm_loadu_pd(hiaddr), _mm_loadu_pd(loaddr)) +#endif + +template +class cintrin; + +template <> +class cintrin{ +public: + using calc_t = double; + using ret_t = cintrin; + + + cintrin() {} + + template + cintrin(U const *p){ + v_ = _mm256_load_pd((calc_t const*)p); + } + + template + cintrin(U const *p1, U const *p2){ + v_ = _mm256_loadu2_m128d((calc_t const*)p2, (calc_t const*)p1); + } + + template + cintrin(U const *p, bool broadcast){ + auto tmp = _mm_load_pd((calc_t const*)p); + v_ = _mm256_broadcast_pd(&tmp); + } + + explicit cintrin(calc_t const& s1){ + v_ = _mm256_set1_pd(s1); + } + + cintrin(__m256d const& v) : v_(v) { } + + std::complex operator[](unsigned i){ + calc_t v[4]; + _mm256_store_pd(v, v_); + return {v[i*2], v[i*2+1]}; + } + + template + void store(U *p) const{ + _mm256_store_pd((calc_t *)p, v_); + } + + template + void store(U *p1, U *p2) const{ + _mm256_storeu2_m128d((calc_t *)p2, (calc_t *)p1, v_); + } + __m256d v_; +}; + +inline cintrin mul(cintrin const& c1, cintrin const& c2, cintrin const& c2tm){ + auto ac_bd = _mm256_mul_pd(c1.v_, c2.v_); + auto multbmadmc = _mm256_mul_pd(c1.v_, c2tm.v_); + return cintrin(_mm256_hsub_pd(ac_bd, multbmadmc)); +} +inline cintrin operator*(cintrin const& c1, cintrin const& c2){ + __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + auto badc = _mm256_permute_pd(c2.v_, 5); + auto bmadmc = _mm256_mul_pd(badc, neg); + return mul(c1, c2, bmadmc); +} +inline cintrin operator+(cintrin const& c1, cintrin 
const& c2){ + return cintrin(_mm256_add_pd(c1.v_, c2.v_)); +} +inline cintrin operator*(cintrin const& c1, double const& d){ + auto d_d = _mm256_set1_pd(d); + return _mm256_mul_pd(c1.v_, d_d); +} +inline cintrin operator*(double const& d, cintrin const& c1){ + return c1*d; +} + + + +inline __m256d mul(__m256d const& c1, __m256d const& c2, __m256d const& c2tm){ + auto ac_bd = _mm256_mul_pd(c1, c2); + auto multbmadmc = _mm256_mul_pd(c1, c2tm); + return _mm256_hsub_pd(ac_bd, multbmadmc); +} +inline __m256d add(__m256d const& c1, __m256d const& c2){ + return _mm256_add_pd(c1, c2); +} +template +inline __m256d load2(U *p){ + auto tmp = _mm_load_pd((double const*)p); + return _mm256_broadcast_pd(&tmp); +} +template +inline __m256d load(U const*p1, U const*p2){ + return _mm256_loadu2_m128d((double const*)p2, (double const*)p1); +} +#endif diff --git a/ccsrc/include/cppsim/intrin/kernel1.hpp b/ccsrc/include/cppsim/intrin/kernel1.hpp new file mode 100644 index 00000000..39c519b6 --- /dev/null +++ b/ccsrc/include/cppsim/intrin/kernel1.hpp @@ -0,0 +1,63 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +template +inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m, M const& mt) +{ + __m256d v[2]; + + v[0] = load2(&psi[I]); + v[1] = load2(&psi[I + d0]); + + _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1]))); + +} + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask) +{ + std::size_t ids_sorted[] = { id0 }; + std::sort(ids_sorted, ids_sorted + 1, std::greater()); + std::size_t n = 1UL << (ids_sorted[0] + 1); + std::size_t d0 = 1UL << id0; + std::size_t dsorted0 = 1UL << ids_sorted[0]; + + __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])}; + __m256d mmt[2]; + + __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); + for (unsigned i = 0; i < 2; ++i){ + auto badc = _mm256_permute_pd(mm[i], 5); + mmt[i] = _mm256_mul_pd(badc, neg); + } + + if (ctrlmask == 0){ + #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; ++i1){ + kernel_core(psi, i0 + i1, d0, mm, mmt); + } + } + } + else{ + #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; ++i1){ + if (((i0 + i1)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1, d0, mm, mmt); + } + } + } +} diff --git a/ccsrc/include/cppsim/intrin/kernel2.hpp b/ccsrc/include/cppsim/intrin/kernel2.hpp new file mode 100644 index 00000000..58cb09cf --- /dev/null +++ b/ccsrc/include/cppsim/intrin/kernel2.hpp @@ -0,0 +1,70 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Apply a 4x4 matrix in place to the four amplitudes
+// psi[I], psi[I + d0], psi[I + d1], psi[I + d0 + d1].
+//   psi    : indexable amplitude storage; every element is accessed as two
+//            contiguous doubles (load2() uses an aligned 128-bit load, so
+//            elements are presumably 16-byte aligned -- TODO confirm)
+//   I      : index of the first amplitude of the group
+//   d0, d1 : index offsets selecting the other three amplitudes
+//   m      : packed matrix; m[0..3] feed the amplitudes stored at I and
+//            I + d0, m[4..7] those stored at I + d1 and I + d0 + d1
+//   mt     : companion of m in the form mul() expects as its third argument
+// All four inputs are loaded before the first store, so the update is safe
+// to perform in place.
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m, M const& mt)
+{
+    __m256d v[4];
+
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    v[2] = load2(&psi[I + d1]);
+    v[3] = load2(&psi[I + d0 + d1]);
+
+    // Sum of v[t] * m[b + t] for t = 0..3, associated from the right exactly
+    // like the spelled-out expression it replaces.
+    auto row_sum = [&](std::size_t b){
+        return add(mul(v[0], m[b], mt[b]), add(mul(v[1], m[b + 1], mt[b + 1]), add(mul(v[2], m[b + 2], mt[b + 2]), mul(v[3], m[b + 3], mt[b + 3]))));
+    };
+
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], row_sum(0));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], row_sum(4));
+
+}
+
+// bit indices id[.] are given from high to low (e.g.
control first for CNOT)
+//
+// Apply the 4x4 matrix m to bits id1 (high) and id0 (low) of psi. When
+// ctrlmask is non-zero only indices i with (i & ctrlmask) == ctrlmask are
+// updated.
+// The loops use "#pragma omp for", so this is presumably called from inside
+// an enclosing OpenMP parallel region -- confirm at the call site.
+// NOTE(review): only indices below 2^(max(id1, id0) + 1) are visited; confirm
+// that the caller covers the remainder of psi when it is larger.
+template <class V, class M>
+void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    // Shift a std::size_t, not 1UL: unsigned long is only 32 bits wide on
+    // 64-bit Windows, which would break for bit indices >= 32.
+    const std::size_t one = 1;
+    std::size_t ids_sorted[] = { id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 2, std::greater<std::size_t>());
+    std::size_t n = one << (ids_sorted[0] + 1);
+    std::size_t d0 = one << id0, d1 = one << id1;
+    std::size_t dsorted0 = one << ids_sorted[0], dsorted1 = one << ids_sorted[1];
+
+    // mm[k]: low half = m[r][c], high half = m[r + 1][c]; row pair (0, 1)
+    // first, then row pair (2, 3).
+    __m256d mm[] = {
+        load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1]), load(&m[0][2], &m[1][2]), load(&m[0][3], &m[1][3]),
+        load(&m[2][0], &m[3][0]), load(&m[2][1], &m[3][1]), load(&m[2][2], &m[3][2]), load(&m[2][3], &m[3][3])};
+    __m256d mmt[8];
+
+    // mmt[i]: swap the two doubles inside each 128-bit half of mm[i], then
+    // negate the second one -- the third operand mul() needs.
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 8; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+
+    // The control test is hoisted out of the loops so the uncontrolled case
+    // pays no per-iteration branch.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
+                    kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
+                    if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
+                        kernel_core(psi, i0 + i1 + i2, d0, d1, mm, mmt);
+                }
+            }
+        }
+    }
+}
diff --git a/ccsrc/include/cppsim/intrin/kernel3.hpp b/ccsrc/include/cppsim/intrin/kernel3.hpp
new file mode 100644
index 00000000..bb248337
--- /dev/null
+++ b/ccsrc/include/cppsim/intrin/kernel3.hpp
@@ -0,0 +1,88 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Apply an 8x8 matrix in place to the eight amplitudes reachable from psi[I]
+// by adding any combination of d0, d1 and d2.
+//   psi        : indexable amplitude storage; every element is accessed as
+//                two contiguous doubles (load2() uses an aligned 128-bit
+//                load, so elements are presumably 16-byte aligned -- TODO
+//                confirm)
+//   I          : index of the first amplitude of the group
+//   d0, d1, d2 : index offsets spanning the group
+//   m          : packed matrix, 32 vectors; m[0..15] multiply the inputs
+//                without the d2 offset, m[16..31] those with it
+//   mt         : companion of m in the form mul() expects as its third
+//                argument
+// All eight inputs are loaded before the first store, so the update is safe
+// to perform in place.
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m, M const& mt)
+{
+    __m256d v[4];
+
+    // Sum of v[t] * m[b + t] for t = 0..3 over the currently loaded v[],
+    // associated from the right exactly like the spelled-out expression it
+    // replaces.
+    auto row_sum = [&](std::size_t b){
+        return add(mul(v[0], m[b], mt[b]), add(mul(v[1], m[b + 1], mt[b + 1]), add(mul(v[2], m[b + 2], mt[b + 2]), mul(v[3], m[b + 3], mt[b + 3]))));
+    };
+
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    v[2] = load2(&psi[I + d1]);
+    v[3] = load2(&psi[I + d0 + d1]);
+
+    __m256d tmp[4];
+
+    tmp[0] = row_sum(0);
+    tmp[1] = row_sum(4);
+    tmp[2] = row_sum(8);
+    tmp[3] = row_sum(12);
+
+    v[0] = load2(&psi[I + d2]);
+    v[1] = load2(&psi[I + d0 + d2]);
+    v[2] = load2(&psi[I + d1 + d2]);
+    v[3] = load2(&psi[I + d0 + d1 + d2]);
+
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], row_sum(16)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], row_sum(20)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], row_sum(24)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], row_sum(28)));
+
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+//
+// Apply the 8x8 matrix m to bits id2, id1, id0 of psi. When ctrlmask is
+// non-zero only indices i with (i & ctrlmask) == ctrlmask are updated.
+// The loops use "#pragma omp for", so this is presumably called from inside
+// an enclosing OpenMP parallel region -- confirm at the call site.
+// NOTE(review): only indices below 2^(largest id + 1) are visited; confirm
+// that the caller covers the remainder of psi when it is larger.
+template <class V, class M>
+void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    // Shift a std::size_t, not 1UL: unsigned long is only 32 bits wide on
+    // 64-bit Windows, which would break for bit indices >= 32.
+    const std::size_t one = 1;
+    std::size_t ids_sorted[] = { id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 3, std::greater<std::size_t>());
+    std::size_t n = one << (ids_sorted[0] + 1);
+    std::size_t d0 = one << id0, d1 = one << id1, d2 = one << id2;
+    std::size_t dsorted0 = one << ids_sorted[0], dsorted1 = one << ids_sorted[1], dsorted2 = one << ids_sorted[2];
+
+    // mm[k]: low half = m[r][c], high half = m[r + 1][c]. Order: 4-column
+    // blocks outermost, then row pairs, then the columns of the block -- the
+    // order in which kernel_core() indexes them.
+    __m256d mm[32];
+    std::size_t k = 0;
+    for (unsigned cb = 0; cb < 8; cb += 4){
+        for (unsigned r = 0; r < 8; r += 2){
+            for (unsigned c = cb; c < cb + 4; ++c){
+                mm[k++] = load(&m[r][c], &m[r + 1][c]);
+            }
+        }
+    }
+    __m256d mmt[32];
+
+    // mmt[i]: swap the two doubles inside each 128-bit half of mm[i], then
+    // negate the second one -- the third operand mul() needs.
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 32; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+
+    // The control test is hoisted out of the loops so the uncontrolled case
+    // pays no per-iteration branch.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){
+                        kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){
+                        if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask)
+                            kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, mm, mmt);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/ccsrc/include/cppsim/intrin/kernel4.hpp b/ccsrc/include/cppsim/intrin/kernel4.hpp
new file mode 100644
index 00000000..1a7e516a
--- /dev/null
+++ b/ccsrc/include/cppsim/intrin/kernel4.hpp
@@ -0,0 +1,128 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Apply a 16x16 matrix in place to the sixteen amplitudes reachable from
+// psi[I] by adding any combination of d0, d1, d2 and d3.
+//   psi            : indexable amplitude storage; every element is accessed
+//                    as two contiguous doubles (load2() uses an aligned
+//                    128-bit load, so elements are presumably 16-byte
+//                    aligned -- TODO confirm)
+//   I              : index of the first amplitude of the group
+//   d0, d1, d2, d3 : index offsets spanning the group
+//   m              : packed matrix, 128 vectors in four groups of 32, one
+//                    group per batch of four inputs (offsets: none, d2, d3,
+//                    d2 + d3)
+//   mt             : companion of m in the form mul() expects as its third
+//                    argument
+// All sixteen inputs are loaded before the first store, so the update is
+// safe to perform in place.
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m, M const& mt)
+{
+    __m256d v[4];
+
+    // Sum of v[t] * m[b + t] for t = 0..3 over the currently loaded v[],
+    // associated from the right exactly like the spelled-out expression it
+    // replaces.
+    auto row_sum = [&](std::size_t b){
+        return add(mul(v[0], m[b], mt[b]), add(mul(v[1], m[b + 1], mt[b + 1]), add(mul(v[2], m[b + 2], mt[b + 2]), mul(v[3], m[b + 3], mt[b + 3]))));
+    };
+
+    v[0] = load2(&psi[I]);
+    v[1] = load2(&psi[I + d0]);
+    v[2] = load2(&psi[I + d1]);
+    v[3] = load2(&psi[I + d0 + d1]);
+
+    __m256d tmp[8];
+
+    tmp[0] = row_sum(0);
+    tmp[1] = row_sum(4);
+    tmp[2] = row_sum(8);
+    tmp[3] = row_sum(12);
+    tmp[4] = row_sum(16);
+    tmp[5] = row_sum(20);
+    tmp[6] = row_sum(24);
+    tmp[7] = row_sum(28);
+
+    v[0] = load2(&psi[I + d2]);
+    v[1] = load2(&psi[I + d0 + d2]);
+    v[2] = load2(&psi[I + d1 + d2]);
+    v[3] = load2(&psi[I + d0 + d1 + d2]);
+
+    tmp[0] = add(tmp[0], row_sum(32));
+    tmp[1] = add(tmp[1], row_sum(36));
+    tmp[2] = add(tmp[2], row_sum(40));
+    tmp[3] = add(tmp[3], row_sum(44));
+    tmp[4] = add(tmp[4], row_sum(48));
+    tmp[5] = add(tmp[5], row_sum(52));
+    tmp[6] = add(tmp[6], row_sum(56));
+    tmp[7] = add(tmp[7], row_sum(60));
+
+    v[0] = load2(&psi[I + d3]);
+    v[1] = load2(&psi[I + d0 + d3]);
+    v[2] = load2(&psi[I + d1 + d3]);
+    v[3] = load2(&psi[I + d0 + d1 + d3]);
+
+    tmp[0] = add(tmp[0], row_sum(64));
+    tmp[1] = add(tmp[1], row_sum(68));
+    tmp[2] = add(tmp[2], row_sum(72));
+    tmp[3] = add(tmp[3], row_sum(76));
+    tmp[4] = add(tmp[4], row_sum(80));
+    tmp[5] = add(tmp[5], row_sum(84));
+    tmp[6] = add(tmp[6], row_sum(88));
+    tmp[7] = add(tmp[7], row_sum(92));
+
+    v[0] = load2(&psi[I + d2 + d3]);
+    v[1] = load2(&psi[I + d0 + d2 + d3]);
+    v[2] = load2(&psi[I + d1 + d2 + d3]);
+    v[3] = load2(&psi[I + d0 + d1 + d2 + d3]);
+
+    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], row_sum(96)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], row_sum(100)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], row_sum(104)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], row_sum(108)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], row_sum(112)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], row_sum(116)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], row_sum(120)));
+    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], row_sum(124)));
+
+}
+
+// bit indices id[.] are given from high to low (e.g.
control first for CNOT)
+//
+// Apply the 16x16 matrix m to bits id3, id2, id1, id0 of psi. When ctrlmask
+// is non-zero only indices i with (i & ctrlmask) == ctrlmask are updated.
+// The loops use "#pragma omp for", so this is presumably called from inside
+// an enclosing OpenMP parallel region -- confirm at the call site.
+// NOTE(review): only indices below 2^(largest id + 1) are visited; confirm
+// that the caller covers the remainder of psi when it is larger.
+template <class V, class M>
+void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    // Shift a std::size_t, not 1UL: unsigned long is only 32 bits wide on
+    // 64-bit Windows, which would break for bit indices >= 32.
+    const std::size_t one = 1;
+    std::size_t ids_sorted[] = { id3, id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 4, std::greater<std::size_t>());
+    std::size_t n = one << (ids_sorted[0] + 1);
+    std::size_t d0 = one << id0, d1 = one << id1, d2 = one << id2, d3 = one << id3;
+    std::size_t dsorted0 = one << ids_sorted[0], dsorted1 = one << ids_sorted[1], dsorted2 = one << ids_sorted[2], dsorted3 = one << ids_sorted[3];
+
+    // mm[k]: low half = m[r][c], high half = m[r + 1][c]. Order: 4-column
+    // blocks outermost, then row pairs, then the columns of the block -- the
+    // order in which kernel_core() indexes them.
+    __m256d mm[128];
+    std::size_t k = 0;
+    for (unsigned cb = 0; cb < 16; cb += 4){
+        for (unsigned r = 0; r < 16; r += 2){
+            for (unsigned c = cb; c < cb + 4; ++c){
+                mm[k++] = load(&m[r][c], &m[r + 1][c]);
+            }
+        }
+    }
+    __m256d mmt[128];
+
+    // mmt[i]: swap the two doubles inside each 128-bit half of mm[i], then
+    // negate the second one -- the third operand mul() needs.
+    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned i = 0; i < 128; ++i){
+        auto badc = _mm256_permute_pd(mm[i], 5);
+        mmt[i] = _mm256_mul_pd(badc, neg);
+    }
+
+    // The control test is hoisted out of the loops so the uncontrolled case
+    // pays no per-iteration branch.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){
+                            kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){
+                            if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask)
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, mm, mmt);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/ccsrc/include/cppsim/intrin/kernel5.hpp b/ccsrc/include/cppsim/intrin/kernel5.hpp
new file mode 100644
index 00000000..25002b26
--- /dev/null
+++ b/ccsrc/include/cppsim/intrin/kernel5.hpp
@@ -0,0 +1,252 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +template +inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m, M const& mt) +{ + __m256d v[4]; + + v[0] = load2(&psi[I]); + v[1] = load2(&psi[I + d0]); + v[2] = load2(&psi[I + d1]); + v[3] = load2(&psi[I + d0 + d1]); + + __m256d tmp[16]; + + tmp[0] = add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3])))); + tmp[1] = add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7])))); + tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11])))); + tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15])))); + tmp[4] = add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19])))); + tmp[5] = add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23])))); + tmp[6] = add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))); + tmp[7] = add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))); + tmp[8] = add(mul(v[0], m[32], mt[32]), add(mul(v[1], m[33], mt[33]), add(mul(v[2], m[34], mt[34]), mul(v[3], m[35], mt[35])))); + tmp[9] = add(mul(v[0], m[36], mt[36]), add(mul(v[1], m[37], mt[37]), add(mul(v[2], m[38], mt[38]), mul(v[3], m[39], mt[39])))); + tmp[10] = add(mul(v[0], m[40], mt[40]), add(mul(v[1], m[41], mt[41]), add(mul(v[2], m[42], mt[42]), mul(v[3], m[43], mt[43])))); + tmp[11] = add(mul(v[0], m[44], mt[44]), add(mul(v[1], m[45], mt[45]), add(mul(v[2], m[46], mt[46]), mul(v[3], m[47], mt[47])))); + tmp[12] = 
add(mul(v[0], m[48], mt[48]), add(mul(v[1], m[49], mt[49]), add(mul(v[2], m[50], mt[50]), mul(v[3], m[51], mt[51])))); + tmp[13] = add(mul(v[0], m[52], mt[52]), add(mul(v[1], m[53], mt[53]), add(mul(v[2], m[54], mt[54]), mul(v[3], m[55], mt[55])))); + tmp[14] = add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59])))); + tmp[15] = add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63])))); + + v[0] = load2(&psi[I + d2]); + v[1] = load2(&psi[I + d0 + d2]); + v[2] = load2(&psi[I + d1 + d2]); + v[3] = load2(&psi[I + d0 + d1 + d2]); + + tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[72], mt[72]), add(mul(v[1], m[73], mt[73]), add(mul(v[2], m[74], mt[74]), mul(v[3], m[75], mt[75]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[76], mt[76]), add(mul(v[1], m[77], mt[77]), add(mul(v[2], m[78], mt[78]), mul(v[3], m[79], mt[79]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[80], mt[80]), add(mul(v[1], m[81], mt[81]), add(mul(v[2], m[82], mt[82]), mul(v[3], m[83], mt[83]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[84], mt[84]), add(mul(v[1], m[85], mt[85]), add(mul(v[2], m[86], mt[86]), mul(v[3], m[87], mt[87]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[88], mt[88]), add(mul(v[1], m[89], mt[89]), add(mul(v[2], m[90], mt[90]), mul(v[3], m[91], mt[91]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[92], mt[92]), add(mul(v[1], m[93], mt[93]), add(mul(v[2], m[94], mt[94]), mul(v[3], m[95], mt[95]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[100], mt[100]), 
add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))); + + v[0] = load2(&psi[I + d3]); + v[1] = load2(&psi[I + d0 + d3]); + v[2] = load2(&psi[I + d1 + d3]); + v[3] = load2(&psi[I + d0 + d1 + d3]); + + tmp[0] = add(tmp[0], add(mul(v[0], m[128], mt[128]), add(mul(v[1], m[129], mt[129]), add(mul(v[2], m[130], mt[130]), mul(v[3], m[131], mt[131]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[132], mt[132]), add(mul(v[1], m[133], mt[133]), add(mul(v[2], m[134], mt[134]), mul(v[3], m[135], mt[135]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[136], mt[136]), add(mul(v[1], m[137], mt[137]), add(mul(v[2], m[138], mt[138]), mul(v[3], m[139], mt[139]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[140], mt[140]), add(mul(v[1], m[141], mt[141]), add(mul(v[2], m[142], mt[142]), mul(v[3], m[143], mt[143]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[144], mt[144]), add(mul(v[1], m[145], mt[145]), add(mul(v[2], m[146], mt[146]), mul(v[3], m[147], mt[147]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[148], mt[148]), add(mul(v[1], m[149], mt[149]), 
add(mul(v[2], m[150], mt[150]), mul(v[3], m[151], mt[151]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[152], mt[152]), add(mul(v[1], m[153], mt[153]), add(mul(v[2], m[154], mt[154]), mul(v[3], m[155], mt[155]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[156], mt[156]), add(mul(v[1], m[157], mt[157]), add(mul(v[2], m[158], mt[158]), mul(v[3], m[159], mt[159]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[160], mt[160]), add(mul(v[1], m[161], mt[161]), add(mul(v[2], m[162], mt[162]), mul(v[3], m[163], mt[163]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[164], mt[164]), add(mul(v[1], m[165], mt[165]), add(mul(v[2], m[166], mt[166]), mul(v[3], m[167], mt[167]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[168], mt[168]), add(mul(v[1], m[169], mt[169]), add(mul(v[2], m[170], mt[170]), mul(v[3], m[171], mt[171]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[172], mt[172]), add(mul(v[1], m[173], mt[173]), add(mul(v[2], m[174], mt[174]), mul(v[3], m[175], mt[175]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[176], mt[176]), add(mul(v[1], m[177], mt[177]), add(mul(v[2], m[178], mt[178]), mul(v[3], m[179], mt[179]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[180], mt[180]), add(mul(v[1], m[181], mt[181]), add(mul(v[2], m[182], mt[182]), mul(v[3], m[183], mt[183]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[184], mt[184]), add(mul(v[1], m[185], mt[185]), add(mul(v[2], m[186], mt[186]), mul(v[3], m[187], mt[187]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[188], mt[188]), add(mul(v[1], m[189], mt[189]), add(mul(v[2], m[190], mt[190]), mul(v[3], m[191], mt[191]))))); + + v[0] = load2(&psi[I + d2 + d3]); + v[1] = load2(&psi[I + d0 + d2 + d3]); + v[2] = load2(&psi[I + d1 + d2 + d3]); + v[3] = load2(&psi[I + d0 + d1 + d2 + d3]); + + tmp[0] = add(tmp[0], add(mul(v[0], m[192], mt[192]), add(mul(v[1], m[193], mt[193]), add(mul(v[2], m[194], mt[194]), mul(v[3], m[195], mt[195]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[196], mt[196]), add(mul(v[1], m[197], mt[197]), 
add(mul(v[2], m[198], mt[198]), mul(v[3], m[199], mt[199]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[200], mt[200]), add(mul(v[1], m[201], mt[201]), add(mul(v[2], m[202], mt[202]), mul(v[3], m[203], mt[203]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[204], mt[204]), add(mul(v[1], m[205], mt[205]), add(mul(v[2], m[206], mt[206]), mul(v[3], m[207], mt[207]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[208], mt[208]), add(mul(v[1], m[209], mt[209]), add(mul(v[2], m[210], mt[210]), mul(v[3], m[211], mt[211]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[212], mt[212]), add(mul(v[1], m[213], mt[213]), add(mul(v[2], m[214], mt[214]), mul(v[3], m[215], mt[215]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[216], mt[216]), add(mul(v[1], m[217], mt[217]), add(mul(v[2], m[218], mt[218]), mul(v[3], m[219], mt[219]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[220], mt[220]), add(mul(v[1], m[221], mt[221]), add(mul(v[2], m[222], mt[222]), mul(v[3], m[223], mt[223]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[224], mt[224]), add(mul(v[1], m[225], mt[225]), add(mul(v[2], m[226], mt[226]), mul(v[3], m[227], mt[227]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[228], mt[228]), add(mul(v[1], m[229], mt[229]), add(mul(v[2], m[230], mt[230]), mul(v[3], m[231], mt[231]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[232], mt[232]), add(mul(v[1], m[233], mt[233]), add(mul(v[2], m[234], mt[234]), mul(v[3], m[235], mt[235]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[236], mt[236]), add(mul(v[1], m[237], mt[237]), add(mul(v[2], m[238], mt[238]), mul(v[3], m[239], mt[239]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[240], mt[240]), add(mul(v[1], m[241], mt[241]), add(mul(v[2], m[242], mt[242]), mul(v[3], m[243], mt[243]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[244], mt[244]), add(mul(v[1], m[245], mt[245]), add(mul(v[2], m[246], mt[246]), mul(v[3], m[247], mt[247]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[248], mt[248]), add(mul(v[1], m[249], mt[249]), add(mul(v[2], m[250], 
mt[250]), mul(v[3], m[251], mt[251]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[252], mt[252]), add(mul(v[1], m[253], mt[253]), add(mul(v[2], m[254], mt[254]), mul(v[3], m[255], mt[255]))))); + + v[0] = load2(&psi[I + d4]); + v[1] = load2(&psi[I + d0 + d4]); + v[2] = load2(&psi[I + d1 + d4]); + v[3] = load2(&psi[I + d0 + d1 + d4]); + + tmp[0] = add(tmp[0], add(mul(v[0], m[256], mt[256]), add(mul(v[1], m[257], mt[257]), add(mul(v[2], m[258], mt[258]), mul(v[3], m[259], mt[259]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[260], mt[260]), add(mul(v[1], m[261], mt[261]), add(mul(v[2], m[262], mt[262]), mul(v[3], m[263], mt[263]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[264], mt[264]), add(mul(v[1], m[265], mt[265]), add(mul(v[2], m[266], mt[266]), mul(v[3], m[267], mt[267]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[268], mt[268]), add(mul(v[1], m[269], mt[269]), add(mul(v[2], m[270], mt[270]), mul(v[3], m[271], mt[271]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[272], mt[272]), add(mul(v[1], m[273], mt[273]), add(mul(v[2], m[274], mt[274]), mul(v[3], m[275], mt[275]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[276], mt[276]), add(mul(v[1], m[277], mt[277]), add(mul(v[2], m[278], mt[278]), mul(v[3], m[279], mt[279]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[280], mt[280]), add(mul(v[1], m[281], mt[281]), add(mul(v[2], m[282], mt[282]), mul(v[3], m[283], mt[283]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[284], mt[284]), add(mul(v[1], m[285], mt[285]), add(mul(v[2], m[286], mt[286]), mul(v[3], m[287], mt[287]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[288], mt[288]), add(mul(v[1], m[289], mt[289]), add(mul(v[2], m[290], mt[290]), mul(v[3], m[291], mt[291]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[292], mt[292]), add(mul(v[1], m[293], mt[293]), add(mul(v[2], m[294], mt[294]), mul(v[3], m[295], mt[295]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[296], mt[296]), add(mul(v[1], m[297], mt[297]), add(mul(v[2], m[298], mt[298]), mul(v[3], m[299], 
mt[299]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[300], mt[300]), add(mul(v[1], m[301], mt[301]), add(mul(v[2], m[302], mt[302]), mul(v[3], m[303], mt[303]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[304], mt[304]), add(mul(v[1], m[305], mt[305]), add(mul(v[2], m[306], mt[306]), mul(v[3], m[307], mt[307]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[308], mt[308]), add(mul(v[1], m[309], mt[309]), add(mul(v[2], m[310], mt[310]), mul(v[3], m[311], mt[311]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[312], mt[312]), add(mul(v[1], m[313], mt[313]), add(mul(v[2], m[314], mt[314]), mul(v[3], m[315], mt[315]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[316], mt[316]), add(mul(v[1], m[317], mt[317]), add(mul(v[2], m[318], mt[318]), mul(v[3], m[319], mt[319]))))); + + v[0] = load2(&psi[I + d2 + d4]); + v[1] = load2(&psi[I + d0 + d2 + d4]); + v[2] = load2(&psi[I + d1 + d2 + d4]); + v[3] = load2(&psi[I + d0 + d1 + d2 + d4]); + + tmp[0] = add(tmp[0], add(mul(v[0], m[320], mt[320]), add(mul(v[1], m[321], mt[321]), add(mul(v[2], m[322], mt[322]), mul(v[3], m[323], mt[323]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[324], mt[324]), add(mul(v[1], m[325], mt[325]), add(mul(v[2], m[326], mt[326]), mul(v[3], m[327], mt[327]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[328], mt[328]), add(mul(v[1], m[329], mt[329]), add(mul(v[2], m[330], mt[330]), mul(v[3], m[331], mt[331]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[332], mt[332]), add(mul(v[1], m[333], mt[333]), add(mul(v[2], m[334], mt[334]), mul(v[3], m[335], mt[335]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[336], mt[336]), add(mul(v[1], m[337], mt[337]), add(mul(v[2], m[338], mt[338]), mul(v[3], m[339], mt[339]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[340], mt[340]), add(mul(v[1], m[341], mt[341]), add(mul(v[2], m[342], mt[342]), mul(v[3], m[343], mt[343]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[344], mt[344]), add(mul(v[1], m[345], mt[345]), add(mul(v[2], m[346], mt[346]), mul(v[3], m[347], mt[347]))))); 
+ tmp[7] = add(tmp[7], add(mul(v[0], m[348], mt[348]), add(mul(v[1], m[349], mt[349]), add(mul(v[2], m[350], mt[350]), mul(v[3], m[351], mt[351]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[352], mt[352]), add(mul(v[1], m[353], mt[353]), add(mul(v[2], m[354], mt[354]), mul(v[3], m[355], mt[355]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[356], mt[356]), add(mul(v[1], m[357], mt[357]), add(mul(v[2], m[358], mt[358]), mul(v[3], m[359], mt[359]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[360], mt[360]), add(mul(v[1], m[361], mt[361]), add(mul(v[2], m[362], mt[362]), mul(v[3], m[363], mt[363]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[364], mt[364]), add(mul(v[1], m[365], mt[365]), add(mul(v[2], m[366], mt[366]), mul(v[3], m[367], mt[367]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[368], mt[368]), add(mul(v[1], m[369], mt[369]), add(mul(v[2], m[370], mt[370]), mul(v[3], m[371], mt[371]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[372], mt[372]), add(mul(v[1], m[373], mt[373]), add(mul(v[2], m[374], mt[374]), mul(v[3], m[375], mt[375]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[376], mt[376]), add(mul(v[1], m[377], mt[377]), add(mul(v[2], m[378], mt[378]), mul(v[3], m[379], mt[379]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[380], mt[380]), add(mul(v[1], m[381], mt[381]), add(mul(v[2], m[382], mt[382]), mul(v[3], m[383], mt[383]))))); + + v[0] = load2(&psi[I + d3 + d4]); + v[1] = load2(&psi[I + d0 + d3 + d4]); + v[2] = load2(&psi[I + d1 + d3 + d4]); + v[3] = load2(&psi[I + d0 + d1 + d3 + d4]); + + tmp[0] = add(tmp[0], add(mul(v[0], m[384], mt[384]), add(mul(v[1], m[385], mt[385]), add(mul(v[2], m[386], mt[386]), mul(v[3], m[387], mt[387]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[388], mt[388]), add(mul(v[1], m[389], mt[389]), add(mul(v[2], m[390], mt[390]), mul(v[3], m[391], mt[391]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[392], mt[392]), add(mul(v[1], m[393], mt[393]), add(mul(v[2], m[394], mt[394]), mul(v[3], m[395], mt[395]))))); + tmp[3] = 
add(tmp[3], add(mul(v[0], m[396], mt[396]), add(mul(v[1], m[397], mt[397]), add(mul(v[2], m[398], mt[398]), mul(v[3], m[399], mt[399]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[400], mt[400]), add(mul(v[1], m[401], mt[401]), add(mul(v[2], m[402], mt[402]), mul(v[3], m[403], mt[403]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[404], mt[404]), add(mul(v[1], m[405], mt[405]), add(mul(v[2], m[406], mt[406]), mul(v[3], m[407], mt[407]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[408], mt[408]), add(mul(v[1], m[409], mt[409]), add(mul(v[2], m[410], mt[410]), mul(v[3], m[411], mt[411]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[412], mt[412]), add(mul(v[1], m[413], mt[413]), add(mul(v[2], m[414], mt[414]), mul(v[3], m[415], mt[415]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[416], mt[416]), add(mul(v[1], m[417], mt[417]), add(mul(v[2], m[418], mt[418]), mul(v[3], m[419], mt[419]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[420], mt[420]), add(mul(v[1], m[421], mt[421]), add(mul(v[2], m[422], mt[422]), mul(v[3], m[423], mt[423]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[424], mt[424]), add(mul(v[1], m[425], mt[425]), add(mul(v[2], m[426], mt[426]), mul(v[3], m[427], mt[427]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[428], mt[428]), add(mul(v[1], m[429], mt[429]), add(mul(v[2], m[430], mt[430]), mul(v[3], m[431], mt[431]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[432], mt[432]), add(mul(v[1], m[433], mt[433]), add(mul(v[2], m[434], mt[434]), mul(v[3], m[435], mt[435]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[436], mt[436]), add(mul(v[1], m[437], mt[437]), add(mul(v[2], m[438], mt[438]), mul(v[3], m[439], mt[439]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[440], mt[440]), add(mul(v[1], m[441], mt[441]), add(mul(v[2], m[442], mt[442]), mul(v[3], m[443], mt[443]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[444], mt[444]), add(mul(v[1], m[445], mt[445]), add(mul(v[2], m[446], mt[446]), mul(v[3], m[447], mt[447]))))); + + v[0] = load2(&psi[I + d2 + d3 + 
d4]); + v[1] = load2(&psi[I + d0 + d2 + d3 + d4]); + v[2] = load2(&psi[I + d1 + d2 + d3 + d4]); + v[3] = load2(&psi[I + d0 + d1 + d2 + d3 + d4]); + + _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d4], (double*)&psi[I + d4], add(tmp[8], 
add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507])))))); + _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511])))))); + +} + +// bit indices id[.] are given from high to low (e.g. 
+// control first for CNOT)
+//
+// Applies the 32x32 matrix `m` to the five qubits id4..id0 of `psi` by calling
+// the AVX kernel_core overload once per base index.
+//
+//   psi       state vector; kernel_core reads and overwrites psi[I + ...]
+//   id4..id0  bit positions of the target qubits; converted to the strides
+//             d4..d0 = 2^id that kernel_core adds to the base index
+//   m         gate matrix, read as m[row][col] for row, col in [0, 32)
+//   ctrlmask  0 to visit every base index; otherwise only base indices I with
+//             (I & ctrlmask) == ctrlmask are passed to kernel_core
+//
+// The `omp for` directives are orphaned work-sharing constructs, so they only
+// distribute iterations when the caller is already inside an `omp parallel`
+// region.
+//
+// NOTE(review): n equals 2 * dsorted0, which is also the step of the outermost
+// loop, so that loop runs for i0 == 0 only and no index at or above
+// 2^(largest id + 1) is visited. Confirm this is what callers expect.
+// NOTE(review): the template parameter list and the std::greater argument were
+// missing from this copy of the patch; they are filled in here from the names
+// used in the signature and the element type of ids_sorted. Check upstream.
+template <class V, class M>
+void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    // Shift a std::size_t instead of the literal 1UL: unsigned long is only
+    // 32 bits wide on LLP64 targets (64-bit Windows), where shifting it by 32
+    // or more is undefined.
+    const std::size_t one = 1;
+
+    std::size_t ids_sorted[] = { id4, id3, id2, id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 5, std::greater<std::size_t>());
+    const std::size_t n = one << (ids_sorted[0] + 1);
+    const std::size_t d0 = one << id0, d1 = one << id1, d2 = one << id2, d3 = one << id3, d4 = one << id4;
+    const std::size_t dsorted0 = one << ids_sorted[0], dsorted1 = one << ids_sorted[1], dsorted2 = one << ids_sorted[2], dsorted3 = one << ids_sorted[3], dsorted4 = one << ids_sorted[4];
+
+    // mm[64 * c + 4 * r + j] = load(&m[2r][4c + j], &m[2r + 1][4c + j]):
+    // eight groups of four columns (c), sixteen row pairs (r), four columns
+    // per group (j). This is the same order as the 512-entry initializer list
+    // it replaces. load() is defined elsewhere.
+    //
+    // mmt[k] is mm[k] with the two doubles of every 128-bit half swapped
+    // (_mm256_permute_pd with immediate 5) and the second of each swapped pair
+    // negated; kernel_core's mul(v, m, mt) consumes both arrays.
+    __m256d mm[512];
+    __m256d mmt[512];
+    const __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
+    for (unsigned c = 0; c < 8; ++c){
+        for (unsigned r = 0; r < 16; ++r){
+            for (unsigned j = 0; j < 4; ++j){
+                const unsigned k = 64 * c + 4 * r + j;
+                mm[k] = load(&m[2 * r][4 * c + j], &m[2 * r + 1][4 * c + j]);
+                mmt[k] = _mm256_mul_pd(_mm256_permute_pd(mm[k], 5), neg);
+            }
+        }
+    }
+
+    // Each loop level skips the half of its range in which the corresponding
+    // target bit is set, so every base index has all five target bits clear.
+    // The two nests differ only in the control-mask test; keeping them apart
+    // leaves the uncontrolled path free of the extra branch.
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){
+                                kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){
+                    for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){
+                        for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){
+                            for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){
+                                if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask)
+                                    kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, mm, mmt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/ccsrc/include/cppsim/intrin/kernels.hpp b/ccsrc/include/cppsim/intrin/kernels.hpp
new file mode 100644
index 00000000..0b4f8540
--- /dev/null
+++ b/ccsrc/include/cppsim/intrin/kernels.hpp
@@ -0,0 +1,36 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// NOTE(review): the next six #include directives have lost their header names
+// in this copy of the patch (anything between angle brackets was stripped).
+// Restore them from upstream before using this hunk.
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cintrin.hpp"
+#include "alignedallocator.hpp"
+
+#include "cppsim_omp.hpp"
+
+// Value passed to `collapse(...)` by the k-qubit kernel: k + 1 nested loops.
+#define LOOP_COLLAPSE1 2
+#define LOOP_COLLAPSE2 3
+#define LOOP_COLLAPSE3 4
+#define LOOP_COLLAPSE4 5
+#define LOOP_COLLAPSE5 6
+
+#include "kernel1.hpp"
+#include "kernel2.hpp"
+#include "kernel3.hpp"
+#include "kernel4.hpp"
+#include "kernel5.hpp"
diff --git a/ccsrc/include/cppsim/kernelgen.hpp b/ccsrc/include/cppsim/kernelgen.hpp
new file mode 100644
index 00000000..12511736
--- /dev/null
+++ b/ccsrc/include/cppsim/kernelgen.hpp
@@ -0,0 +1,49 @@
+#ifndef KERNELGEN_HPP
+#define KERNELGEN_HPP
+
+#include "compiler.h"
+
+// NOTE(review): two header names were stripped here as well; the code below
+// uses std::string and std::cerr, so presumably <string> and <iostream>.
+#include
+#include
+
+// Produces kernel source text for a given number of qubits. Only the
+// declaration is visible here; generate() and the constructor are defined
+// elsewhere.
+class KernelGen
+{
+    std::string nointrin;
+
+public :
+
+    // Returns the source code of a kernel for `nqubits` qubits; `ids`
+    // optionally points at the qubit ids.
+    std::string generate(int nqubits, unsigned* ids = nullptr);
+
+    KernelGen();
+};
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+//
+// Generates kernel source for the qubits in `ids`, hands it to the external
+// compiler, and calls the resulting function on `psi`, `ids`, `m` and
+// `ctrlmask`. If the compiler returns a null handle, the error message is
+// written to std::cerr and the process exits with status -1.
+//
+// NOTE(review): the template parameter list (V, Id and M are used below) and
+// the target types of both reinterpret_cast expressions were stripped from
+// this copy; the kernel_t typedef suggests void* and const int*. Confirm
+// against upstream.
+template
+void kernelgen(V &psi, Id& ids, M const& m, std::size_t ctrlmask)
+{
+    // One generator instance is shared by all calls of this instantiation.
+    static KernelGen g;
+
+    const auto nqubits = ids.size();
+
+    // Generate the kernel source code.
+    auto source = g.generate(nqubits, &ids[0]);
+
+    // Compile the source code using external compiler.
+    std::string errmsg;
+    void* handle = get_compiler().codegen(nqubits, source, errmsg);
+    if (!handle)
+    {
+        std::cerr << "Kernel generation has failed, aborting:" << std::endl;
+        std::cerr << errmsg;
+        exit(-1);
+    }
+
+    // Call the generated kernel.
+    typedef void (*kernel_t)(void* /*psi*/, unsigned int* /*ids*/, const int* /*m*/, size_t /*ctrlmask*/);
+    auto kernel = (kernel_t)handle;
+    // Every thread of the parallel region executes the call below.
+    #pragma omp parallel
+    kernel(reinterpret_cast(&psi[0]), &ids[0], reinterpret_cast(&m[0][0]), ctrlmask);
+}
+
+#endif // KERNELGEN_HPP
+
diff --git a/ccsrc/include/cppsim/nointrin/kernel1.hpp b/ccsrc/include/cppsim/nointrin/kernel1.hpp
new file mode 100644
index 00000000..173ab44c
--- /dev/null
+++ b/ccsrc/include/cppsim/nointrin/kernel1.hpp
@@ -0,0 +1,58 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define LOOP_COLLAPSE1 2
+
+// Replaces the pair psi[I], psi[I + d0] with the product of the 2x2 matrix `m`
+// and that pair, combined through the add()/mul() helpers (defined elsewhere).
+// Both inputs are copied into `v` first because each output needs both.
+//
+// NOTE(review): the template parameter list and the template arguments of
+// std::array appear to have been stripped from this copy.
+template
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m)
+{
+    std::array v =
+    {
+        psi[I],
+        psi[I + d0]
+    };
+
+    psi[I] = (add(mul(v[0], m[0][0]), mul(v[1], m[0][1])));
+    psi[I + d0] = (add(mul(v[0], m[1][0]), mul(v[1], m[1][1])));
+
+}
+
+// bit indices id[.] are given from high to low (e.g.
+// control first for CNOT)
+//
+// Applies the 2x2 matrix `m` to qubit id0 of `psi` by calling
+// kernel_core(psi, I, d0, m) for every base index I = i0 + i1.
+//
+//   psi       state vector; kernel_core reads and overwrites psi[I], psi[I + d0]
+//   id0       bit position of the target qubit (stride d0 = 2^id0)
+//   m         gate matrix, forwarded unchanged to kernel_core
+//   ctrlmask  0 to visit every base index; otherwise only base indices I with
+//             (I & ctrlmask) == ctrlmask are passed to kernel_core
+//
+// The `omp for` directives are orphaned work-sharing constructs, so they only
+// distribute iterations when the caller is already inside an `omp parallel`
+// region.
+//
+// NOTE(review): n equals 2 * d0, which is also the step of the outer loop, so
+// that loop runs for i0 == 0 only and no index at or above 2^(id0 + 1) is
+// visited. Confirm this is what callers expect.
+// NOTE(review): the template parameter list was missing from this copy of the
+// patch and is filled in from the names used in the signature.
+template <class V, class M>
+void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    // Shift a std::size_t instead of the literal 1UL: unsigned long is only
+    // 32 bits wide on LLP64 targets (64-bit Windows), where shifting it by 32
+    // or more is undefined.
+    const std::size_t one = 1;
+
+    // With a single target qubit there is nothing to sort: the largest id is
+    // id0 itself, so the "sorted" stride is d0.
+    const std::size_t top = id0;
+    const std::size_t n = one << (top + 1);
+    const std::size_t d0 = one << id0;
+
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * d0){
+            for (omp::idx_t i1 = 0; i1 < d0; ++i1){
+                kernel_core(psi, i0 + i1, d0, m);
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * d0){
+            for (omp::idx_t i1 = 0; i1 < d0; ++i1){
+                if (((i0 + i1)&ctrlmask) == ctrlmask)
+                    kernel_core(psi, i0 + i1, d0, m);
+            }
+        }
+    }
+}
diff --git a/ccsrc/include/cppsim/nointrin/kernel2.hpp b/ccsrc/include/cppsim/nointrin/kernel2.hpp
new file mode 100644
index 00000000..dfd6e067
--- /dev/null
+++ b/ccsrc/include/cppsim/nointrin/kernel2.hpp
@@ -0,0 +1,66 @@
+// Copyright 2017 ProjectQ-Framework (www.projectq.ch)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define LOOP_COLLAPSE2 3
+
+// Replaces the four entries psi[I], psi[I + d0], psi[I + d1], psi[I + d0 + d1]
+// with the product of the 4x4 matrix `m` and those entries, combined through
+// the add()/mul() helpers (defined elsewhere). Row r of `m` produces the entry
+// at offset 0, d0, d1, d0 + d1 for r = 0, 1, 2, 3.
+//
+// NOTE(review): this copy of the patch had lost the template parameter list
+// and the template arguments of the std::array that held the inputs. The
+// inputs are kept in plain `auto` locals here, which gives them the value
+// type of psi[I]; confirm that matches the upstream element type.
+template <class V, class M>
+inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, M const& m)
+{
+    // Snapshot all four inputs before writing: every output needs all of them.
+    auto v0 = psi[I];
+    auto v1 = psi[I + d0];
+    auto v2 = psi[I + d1];
+    auto v3 = psi[I + d0 + d1];
+
+    psi[I] = add(mul(v0, m[0][0]), add(mul(v1, m[0][1]), add(mul(v2, m[0][2]), mul(v3, m[0][3]))));
+    psi[I + d0] = add(mul(v0, m[1][0]), add(mul(v1, m[1][1]), add(mul(v2, m[1][2]), mul(v3, m[1][3]))));
+    psi[I + d1] = add(mul(v0, m[2][0]), add(mul(v1, m[2][1]), add(mul(v2, m[2][2]), mul(v3, m[2][3]))));
+    psi[I + d0 + d1] = add(mul(v0, m[3][0]), add(mul(v1, m[3][1]), add(mul(v2, m[3][2]), mul(v3, m[3][3]))));
+}
+
+// bit indices id[.] are given from high to low (e.g. control first for CNOT)
+//
+// Applies the 4x4 matrix `m` to qubits id1, id0 of `psi` by calling
+// kernel_core(psi, I, d0, d1, m) for every base index I = i0 + i1 + i2.
+//
+//   psi       state vector; kernel_core reads and overwrites four entries per call
+//   id1, id0  bit positions of the target qubits (strides d1 = 2^id1, d0 = 2^id0)
+//   m         gate matrix, forwarded unchanged to kernel_core
+//   ctrlmask  0 to visit every base index; otherwise only base indices I with
+//             (I & ctrlmask) == ctrlmask are passed to kernel_core
+//
+// The `omp for` directives are orphaned work-sharing constructs, so they only
+// distribute iterations when the caller is already inside an `omp parallel`
+// region.
+//
+// NOTE(review): n equals 2 * dsorted0, which is also the step of the outermost
+// loop, so that loop runs for i0 == 0 only and no index at or above
+// 2^(largest id + 1) is visited. Confirm this is what callers expect.
+// NOTE(review): the template parameter list and the std::greater argument were
+// missing from this copy of the patch and are filled in from context.
+template <class V, class M>
+void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
+{
+    // Shift a std::size_t instead of the literal 1UL: unsigned long is only
+    // 32 bits wide on LLP64 targets (64-bit Windows), where shifting it by 32
+    // or more is undefined.
+    const std::size_t one = 1;
+
+    // The loop nest walks the bits from the highest target qubit down, so it
+    // needs the ids in descending order; d0/d1 keep the caller's order because
+    // that order selects the matrix rows and columns in kernel_core.
+    std::size_t ids_sorted[] = { id1, id0 };
+    std::sort(ids_sorted, ids_sorted + 2, std::greater<std::size_t>());
+    const std::size_t n = one << (ids_sorted[0] + 1);
+    const std::size_t d0 = one << id0, d1 = one << id1;
+    const std::size_t dsorted0 = one << ids_sorted[0], dsorted1 = one << ids_sorted[1];
+
+    if (ctrlmask == 0){
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
+                    kernel_core(psi, i0 + i1 + i2, d0, d1, m);
+                }
+            }
+        }
+    }
+    else{
+        #pragma omp for collapse(LOOP_COLLAPSE2) schedule(static)
+        for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){
+            for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){
+                for (omp::idx_t i2 = 0; i2 < dsorted1; ++i2){
+                    if (((i0 + i1 + i2)&ctrlmask) == ctrlmask)
+                        kernel_core(psi, i0 + i1 + i2, d0, d1, m);
+                }
+            }
+        }
+    }
+}
diff --git a/ccsrc/include/cppsim/nointrin/kernel3.hpp b/ccsrc/include/cppsim/nointrin/kernel3.hpp
new file mode 100644
index 00000000..4b767e96
--- /dev/null
+++ b/ccsrc/include/cppsim/nointrin/kernel3.hpp
@@ -0,0
+1,91 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define LOOP_COLLAPSE3 4 + +template +inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, M const& m) +{ + std::array v = + { + psi[I], + psi[I + d0], + psi[I + d1], + psi[I + d0 + d1] + }; + + std::array tmp = + { + add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))), + add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))), + add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3])))), + add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3])))), + add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3])))), + add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3])))), + add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3])))), + add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3])))) + }; + + v[0] = psi[I + d2]; + v[1] = psi[I + d0 + d2]; + v[2] = psi[I + d1 + d2]; + v[3] = psi[I + d0 + d1 + d2]; + + psi[I] = (add(tmp[0], add(mul(v[0], m[0][4]), add(mul(v[1], m[0][5]), add(mul(v[2], m[0][6]), mul(v[3], m[0][7])))))); + psi[I + d0] = (add(tmp[1], add(mul(v[0], m[1][4]), add(mul(v[1], m[1][5]), add(mul(v[2], 
m[1][6]), mul(v[3], m[1][7])))))); + psi[I + d1] = (add(tmp[2], add(mul(v[0], m[2][4]), add(mul(v[1], m[2][5]), add(mul(v[2], m[2][6]), mul(v[3], m[2][7])))))); + psi[I + d0 + d1] = (add(tmp[3], add(mul(v[0], m[3][4]), add(mul(v[1], m[3][5]), add(mul(v[2], m[3][6]), mul(v[3], m[3][7])))))); + psi[I + d2] = (add(tmp[4], add(mul(v[0], m[4][4]), add(mul(v[1], m[4][5]), add(mul(v[2], m[4][6]), mul(v[3], m[4][7])))))); + psi[I + d0 + d2] = (add(tmp[5], add(mul(v[0], m[5][4]), add(mul(v[1], m[5][5]), add(mul(v[2], m[5][6]), mul(v[3], m[5][7])))))); + psi[I + d1 + d2] = (add(tmp[6], add(mul(v[0], m[6][4]), add(mul(v[1], m[6][5]), add(mul(v[2], m[6][6]), mul(v[3], m[6][7])))))); + psi[I + d0 + d1 + d2] = (add(tmp[7], add(mul(v[0], m[7][4]), add(mul(v[1], m[7][5]), add(mul(v[2], m[7][6]), mul(v[3], m[7][7])))))); + +} + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +{ + std::size_t ids_sorted[] = { id2, id1, id0 }; + std::sort(ids_sorted, ids_sorted + 3, std::greater()); + std::size_t n = 1UL << (ids_sorted[0] + 1); + std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2; + std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2]; + + if (ctrlmask == 0){ + #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){ + for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){ + for (omp::idx_t i3 = 0; i3 < dsorted2; ++i3){ + kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m); + } + } + } + } + } + else{ + #pragma omp for collapse(LOOP_COLLAPSE3) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){ + for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){ + for (omp::idx_t i3 = 0; 
i3 < dsorted2; ++i3){ + if (((i0 + i1 + i2 + i3)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3, d0, d1, d2, m); + } + } + } + } + } +} diff --git a/ccsrc/include/cppsim/nointrin/kernel4.hpp b/ccsrc/include/cppsim/nointrin/kernel4.hpp new file mode 100644 index 00000000..fe31469b --- /dev/null +++ b/ccsrc/include/cppsim/nointrin/kernel4.hpp @@ -0,0 +1,155 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define LOOP_COLLAPSE4 5 + +template +inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, M const& m) +{ + std::array v = + { + psi[I], + psi[I + d0], + psi[I + d1], + psi[I + d0 + d1] + }; + + std::array tmp = + { + add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))), + add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))), + add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3])))), + add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3])))), + add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3])))), + add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3])))), + add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3])))), + add(mul(v[0], m[7][0]), add(mul(v[1], 
m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3])))), + add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3])))), + add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3])))), + add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3])))), + add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3])))), + add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3])))), + add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3])))), + add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3])))), + add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3])))) + }; + + v[0] = psi[I + d2]; + v[1] = psi[I + d0 + d2]; + v[2] = psi[I + d1 + d2]; + v[3] = psi[I + d0 + d1 + d2]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][4]), add(mul(v[1], m[0][5]), add(mul(v[2], m[0][6]), mul(v[3], m[0][7]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][4]), add(mul(v[1], m[1][5]), add(mul(v[2], m[1][6]), mul(v[3], m[1][7]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][4]), add(mul(v[1], m[2][5]), add(mul(v[2], m[2][6]), mul(v[3], m[2][7]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][4]), add(mul(v[1], m[3][5]), add(mul(v[2], m[3][6]), mul(v[3], m[3][7]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][4]), add(mul(v[1], m[4][5]), add(mul(v[2], m[4][6]), mul(v[3], m[4][7]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][4]), add(mul(v[1], m[5][5]), add(mul(v[2], m[5][6]), mul(v[3], m[5][7]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][4]), add(mul(v[1], m[6][5]), add(mul(v[2], m[6][6]), mul(v[3], m[6][7]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][4]), add(mul(v[1], m[7][5]), add(mul(v[2], m[7][6]), mul(v[3], m[7][7]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][4]), add(mul(v[1], m[8][5]), 
add(mul(v[2], m[8][6]), mul(v[3], m[8][7]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][4]), add(mul(v[1], m[9][5]), add(mul(v[2], m[9][6]), mul(v[3], m[9][7]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][4]), add(mul(v[1], m[10][5]), add(mul(v[2], m[10][6]), mul(v[3], m[10][7]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][4]), add(mul(v[1], m[11][5]), add(mul(v[2], m[11][6]), mul(v[3], m[11][7]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][4]), add(mul(v[1], m[12][5]), add(mul(v[2], m[12][6]), mul(v[3], m[12][7]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][4]), add(mul(v[1], m[13][5]), add(mul(v[2], m[13][6]), mul(v[3], m[13][7]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][4]), add(mul(v[1], m[14][5]), add(mul(v[2], m[14][6]), mul(v[3], m[14][7]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][4]), add(mul(v[1], m[15][5]), add(mul(v[2], m[15][6]), mul(v[3], m[15][7]))))); + + v[0] = psi[I + d3]; + v[1] = psi[I + d0 + d3]; + v[2] = psi[I + d1 + d3]; + v[3] = psi[I + d0 + d1 + d3]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][8]), add(mul(v[1], m[0][9]), add(mul(v[2], m[0][10]), mul(v[3], m[0][11]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][8]), add(mul(v[1], m[1][9]), add(mul(v[2], m[1][10]), mul(v[3], m[1][11]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][8]), add(mul(v[1], m[2][9]), add(mul(v[2], m[2][10]), mul(v[3], m[2][11]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][8]), add(mul(v[1], m[3][9]), add(mul(v[2], m[3][10]), mul(v[3], m[3][11]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][8]), add(mul(v[1], m[4][9]), add(mul(v[2], m[4][10]), mul(v[3], m[4][11]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][8]), add(mul(v[1], m[5][9]), add(mul(v[2], m[5][10]), mul(v[3], m[5][11]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][8]), add(mul(v[1], m[6][9]), add(mul(v[2], m[6][10]), mul(v[3], m[6][11]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][8]), add(mul(v[1], m[7][9]), add(mul(v[2], m[7][10]), mul(v[3], m[7][11]))))); + 
tmp[8] = add(tmp[8], add(mul(v[0], m[8][8]), add(mul(v[1], m[8][9]), add(mul(v[2], m[8][10]), mul(v[3], m[8][11]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][8]), add(mul(v[1], m[9][9]), add(mul(v[2], m[9][10]), mul(v[3], m[9][11]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][8]), add(mul(v[1], m[10][9]), add(mul(v[2], m[10][10]), mul(v[3], m[10][11]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][8]), add(mul(v[1], m[11][9]), add(mul(v[2], m[11][10]), mul(v[3], m[11][11]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][8]), add(mul(v[1], m[12][9]), add(mul(v[2], m[12][10]), mul(v[3], m[12][11]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][8]), add(mul(v[1], m[13][9]), add(mul(v[2], m[13][10]), mul(v[3], m[13][11]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][8]), add(mul(v[1], m[14][9]), add(mul(v[2], m[14][10]), mul(v[3], m[14][11]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][8]), add(mul(v[1], m[15][9]), add(mul(v[2], m[15][10]), mul(v[3], m[15][11]))))); + + v[0] = psi[I + d2 + d3]; + v[1] = psi[I + d0 + d2 + d3]; + v[2] = psi[I + d1 + d2 + d3]; + v[3] = psi[I + d0 + d1 + d2 + d3]; + + psi[I] = (add(tmp[0], add(mul(v[0], m[0][12]), add(mul(v[1], m[0][13]), add(mul(v[2], m[0][14]), mul(v[3], m[0][15])))))); + psi[I + d0] = (add(tmp[1], add(mul(v[0], m[1][12]), add(mul(v[1], m[1][13]), add(mul(v[2], m[1][14]), mul(v[3], m[1][15])))))); + psi[I + d1] = (add(tmp[2], add(mul(v[0], m[2][12]), add(mul(v[1], m[2][13]), add(mul(v[2], m[2][14]), mul(v[3], m[2][15])))))); + psi[I + d0 + d1] = (add(tmp[3], add(mul(v[0], m[3][12]), add(mul(v[1], m[3][13]), add(mul(v[2], m[3][14]), mul(v[3], m[3][15])))))); + psi[I + d2] = (add(tmp[4], add(mul(v[0], m[4][12]), add(mul(v[1], m[4][13]), add(mul(v[2], m[4][14]), mul(v[3], m[4][15])))))); + psi[I + d0 + d2] = (add(tmp[5], add(mul(v[0], m[5][12]), add(mul(v[1], m[5][13]), add(mul(v[2], m[5][14]), mul(v[3], m[5][15])))))); + psi[I + d1 + d2] = (add(tmp[6], add(mul(v[0], m[6][12]), add(mul(v[1], 
m[6][13]), add(mul(v[2], m[6][14]), mul(v[3], m[6][15])))))); + psi[I + d0 + d1 + d2] = (add(tmp[7], add(mul(v[0], m[7][12]), add(mul(v[1], m[7][13]), add(mul(v[2], m[7][14]), mul(v[3], m[7][15])))))); + psi[I + d3] = (add(tmp[8], add(mul(v[0], m[8][12]), add(mul(v[1], m[8][13]), add(mul(v[2], m[8][14]), mul(v[3], m[8][15])))))); + psi[I + d0 + d3] = (add(tmp[9], add(mul(v[0], m[9][12]), add(mul(v[1], m[9][13]), add(mul(v[2], m[9][14]), mul(v[3], m[9][15])))))); + psi[I + d1 + d3] = (add(tmp[10], add(mul(v[0], m[10][12]), add(mul(v[1], m[10][13]), add(mul(v[2], m[10][14]), mul(v[3], m[10][15])))))); + psi[I + d0 + d1 + d3] = (add(tmp[11], add(mul(v[0], m[11][12]), add(mul(v[1], m[11][13]), add(mul(v[2], m[11][14]), mul(v[3], m[11][15])))))); + psi[I + d2 + d3] = (add(tmp[12], add(mul(v[0], m[12][12]), add(mul(v[1], m[12][13]), add(mul(v[2], m[12][14]), mul(v[3], m[12][15])))))); + psi[I + d0 + d2 + d3] = (add(tmp[13], add(mul(v[0], m[13][12]), add(mul(v[1], m[13][13]), add(mul(v[2], m[13][14]), mul(v[3], m[13][15])))))); + psi[I + d1 + d2 + d3] = (add(tmp[14], add(mul(v[0], m[14][12]), add(mul(v[1], m[14][13]), add(mul(v[2], m[14][14]), mul(v[3], m[14][15])))))); + psi[I + d0 + d1 + d2 + d3] = (add(tmp[15], add(mul(v[0], m[15][12]), add(mul(v[1], m[15][13]), add(mul(v[2], m[15][14]), mul(v[3], m[15][15])))))); + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +{ + std::size_t ids_sorted[] = { id3, id2, id1, id0 }; + std::sort(ids_sorted, ids_sorted + 4, std::greater()); + std::size_t n = 1UL << (ids_sorted[0] + 1); + std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3; + std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2], dsorted3 = 1UL << ids_sorted[3]; + + if (ctrlmask == 0){ + #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){ + for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){ + for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){ + for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m); + } + } + } + } + } + } + else{ + #pragma omp for collapse(LOOP_COLLAPSE4) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){ + for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){ + for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){ + for (omp::idx_t i4 = 0; i4 < dsorted3; ++i4){ + if (((i0 + i1 + i2 + i3 + i4)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4, d0, d1, d2, d3, m); + } + } + } + } + } + } +} diff --git a/ccsrc/include/cppsim/nointrin/kernel5.hpp b/ccsrc/include/cppsim/nointrin/kernel5.hpp new file mode 100644 index 00000000..7a8d0dc5 --- /dev/null +++ b/ccsrc/include/cppsim/nointrin/kernel5.hpp @@ -0,0 +1,375 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define LOOP_COLLAPSE5 6 + +template +inline void kernel_core(V &psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, M const& m) +{ + std::array v = + { + psi[I], + psi[I + d0], + psi[I + d1], + psi[I + d0 + d1] + }; + + std::array tmp = + { + add(mul(v[0], m[0][0]), add(mul(v[1], m[0][1]), add(mul(v[2], m[0][2]), mul(v[3], m[0][3])))), + add(mul(v[0], m[1][0]), add(mul(v[1], m[1][1]), add(mul(v[2], m[1][2]), mul(v[3], m[1][3])))), + add(mul(v[0], m[2][0]), add(mul(v[1], m[2][1]), add(mul(v[2], m[2][2]), mul(v[3], m[2][3])))), + add(mul(v[0], m[3][0]), add(mul(v[1], m[3][1]), add(mul(v[2], m[3][2]), mul(v[3], m[3][3])))), + add(mul(v[0], m[4][0]), add(mul(v[1], m[4][1]), add(mul(v[2], m[4][2]), mul(v[3], m[4][3])))), + add(mul(v[0], m[5][0]), add(mul(v[1], m[5][1]), add(mul(v[2], m[5][2]), mul(v[3], m[5][3])))), + add(mul(v[0], m[6][0]), add(mul(v[1], m[6][1]), add(mul(v[2], m[6][2]), mul(v[3], m[6][3])))), + add(mul(v[0], m[7][0]), add(mul(v[1], m[7][1]), add(mul(v[2], m[7][2]), mul(v[3], m[7][3])))), + add(mul(v[0], m[8][0]), add(mul(v[1], m[8][1]), add(mul(v[2], m[8][2]), mul(v[3], m[8][3])))), + add(mul(v[0], m[9][0]), add(mul(v[1], m[9][1]), add(mul(v[2], m[9][2]), mul(v[3], m[9][3])))), + add(mul(v[0], m[10][0]), add(mul(v[1], m[10][1]), add(mul(v[2], m[10][2]), mul(v[3], m[10][3])))), + add(mul(v[0], m[11][0]), add(mul(v[1], m[11][1]), add(mul(v[2], m[11][2]), mul(v[3], m[11][3])))), + add(mul(v[0], m[12][0]), add(mul(v[1], m[12][1]), add(mul(v[2], m[12][2]), mul(v[3], m[12][3])))), + 
add(mul(v[0], m[13][0]), add(mul(v[1], m[13][1]), add(mul(v[2], m[13][2]), mul(v[3], m[13][3])))), + add(mul(v[0], m[14][0]), add(mul(v[1], m[14][1]), add(mul(v[2], m[14][2]), mul(v[3], m[14][3])))), + add(mul(v[0], m[15][0]), add(mul(v[1], m[15][1]), add(mul(v[2], m[15][2]), mul(v[3], m[15][3])))), + add(mul(v[0], m[16][0]), add(mul(v[1], m[16][1]), add(mul(v[2], m[16][2]), mul(v[3], m[16][3])))), + add(mul(v[0], m[17][0]), add(mul(v[1], m[17][1]), add(mul(v[2], m[17][2]), mul(v[3], m[17][3])))), + add(mul(v[0], m[18][0]), add(mul(v[1], m[18][1]), add(mul(v[2], m[18][2]), mul(v[3], m[18][3])))), + add(mul(v[0], m[19][0]), add(mul(v[1], m[19][1]), add(mul(v[2], m[19][2]), mul(v[3], m[19][3])))), + add(mul(v[0], m[20][0]), add(mul(v[1], m[20][1]), add(mul(v[2], m[20][2]), mul(v[3], m[20][3])))), + add(mul(v[0], m[21][0]), add(mul(v[1], m[21][1]), add(mul(v[2], m[21][2]), mul(v[3], m[21][3])))), + add(mul(v[0], m[22][0]), add(mul(v[1], m[22][1]), add(mul(v[2], m[22][2]), mul(v[3], m[22][3])))), + add(mul(v[0], m[23][0]), add(mul(v[1], m[23][1]), add(mul(v[2], m[23][2]), mul(v[3], m[23][3])))), + add(mul(v[0], m[24][0]), add(mul(v[1], m[24][1]), add(mul(v[2], m[24][2]), mul(v[3], m[24][3])))), + add(mul(v[0], m[25][0]), add(mul(v[1], m[25][1]), add(mul(v[2], m[25][2]), mul(v[3], m[25][3])))), + add(mul(v[0], m[26][0]), add(mul(v[1], m[26][1]), add(mul(v[2], m[26][2]), mul(v[3], m[26][3])))), + add(mul(v[0], m[27][0]), add(mul(v[1], m[27][1]), add(mul(v[2], m[27][2]), mul(v[3], m[27][3])))), + add(mul(v[0], m[28][0]), add(mul(v[1], m[28][1]), add(mul(v[2], m[28][2]), mul(v[3], m[28][3])))), + add(mul(v[0], m[29][0]), add(mul(v[1], m[29][1]), add(mul(v[2], m[29][2]), mul(v[3], m[29][3])))), + add(mul(v[0], m[30][0]), add(mul(v[1], m[30][1]), add(mul(v[2], m[30][2]), mul(v[3], m[30][3])))), + add(mul(v[0], m[31][0]), add(mul(v[1], m[31][1]), add(mul(v[2], m[31][2]), mul(v[3], m[31][3])))) + }; + + v[0] = psi[I + d2]; + v[1] = psi[I + d0 + d2]; + v[2] = psi[I + d1 + d2]; 
+ v[3] = psi[I + d0 + d1 + d2]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][4]), add(mul(v[1], m[0][5]), add(mul(v[2], m[0][6]), mul(v[3], m[0][7]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][4]), add(mul(v[1], m[1][5]), add(mul(v[2], m[1][6]), mul(v[3], m[1][7]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][4]), add(mul(v[1], m[2][5]), add(mul(v[2], m[2][6]), mul(v[3], m[2][7]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][4]), add(mul(v[1], m[3][5]), add(mul(v[2], m[3][6]), mul(v[3], m[3][7]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][4]), add(mul(v[1], m[4][5]), add(mul(v[2], m[4][6]), mul(v[3], m[4][7]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][4]), add(mul(v[1], m[5][5]), add(mul(v[2], m[5][6]), mul(v[3], m[5][7]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][4]), add(mul(v[1], m[6][5]), add(mul(v[2], m[6][6]), mul(v[3], m[6][7]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][4]), add(mul(v[1], m[7][5]), add(mul(v[2], m[7][6]), mul(v[3], m[7][7]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][4]), add(mul(v[1], m[8][5]), add(mul(v[2], m[8][6]), mul(v[3], m[8][7]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][4]), add(mul(v[1], m[9][5]), add(mul(v[2], m[9][6]), mul(v[3], m[9][7]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][4]), add(mul(v[1], m[10][5]), add(mul(v[2], m[10][6]), mul(v[3], m[10][7]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][4]), add(mul(v[1], m[11][5]), add(mul(v[2], m[11][6]), mul(v[3], m[11][7]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][4]), add(mul(v[1], m[12][5]), add(mul(v[2], m[12][6]), mul(v[3], m[12][7]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][4]), add(mul(v[1], m[13][5]), add(mul(v[2], m[13][6]), mul(v[3], m[13][7]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][4]), add(mul(v[1], m[14][5]), add(mul(v[2], m[14][6]), mul(v[3], m[14][7]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][4]), add(mul(v[1], m[15][5]), add(mul(v[2], m[15][6]), mul(v[3], m[15][7]))))); + tmp[16] = add(tmp[16], 
add(mul(v[0], m[16][4]), add(mul(v[1], m[16][5]), add(mul(v[2], m[16][6]), mul(v[3], m[16][7]))))); + tmp[17] = add(tmp[17], add(mul(v[0], m[17][4]), add(mul(v[1], m[17][5]), add(mul(v[2], m[17][6]), mul(v[3], m[17][7]))))); + tmp[18] = add(tmp[18], add(mul(v[0], m[18][4]), add(mul(v[1], m[18][5]), add(mul(v[2], m[18][6]), mul(v[3], m[18][7]))))); + tmp[19] = add(tmp[19], add(mul(v[0], m[19][4]), add(mul(v[1], m[19][5]), add(mul(v[2], m[19][6]), mul(v[3], m[19][7]))))); + tmp[20] = add(tmp[20], add(mul(v[0], m[20][4]), add(mul(v[1], m[20][5]), add(mul(v[2], m[20][6]), mul(v[3], m[20][7]))))); + tmp[21] = add(tmp[21], add(mul(v[0], m[21][4]), add(mul(v[1], m[21][5]), add(mul(v[2], m[21][6]), mul(v[3], m[21][7]))))); + tmp[22] = add(tmp[22], add(mul(v[0], m[22][4]), add(mul(v[1], m[22][5]), add(mul(v[2], m[22][6]), mul(v[3], m[22][7]))))); + tmp[23] = add(tmp[23], add(mul(v[0], m[23][4]), add(mul(v[1], m[23][5]), add(mul(v[2], m[23][6]), mul(v[3], m[23][7]))))); + tmp[24] = add(tmp[24], add(mul(v[0], m[24][4]), add(mul(v[1], m[24][5]), add(mul(v[2], m[24][6]), mul(v[3], m[24][7]))))); + tmp[25] = add(tmp[25], add(mul(v[0], m[25][4]), add(mul(v[1], m[25][5]), add(mul(v[2], m[25][6]), mul(v[3], m[25][7]))))); + tmp[26] = add(tmp[26], add(mul(v[0], m[26][4]), add(mul(v[1], m[26][5]), add(mul(v[2], m[26][6]), mul(v[3], m[26][7]))))); + tmp[27] = add(tmp[27], add(mul(v[0], m[27][4]), add(mul(v[1], m[27][5]), add(mul(v[2], m[27][6]), mul(v[3], m[27][7]))))); + tmp[28] = add(tmp[28], add(mul(v[0], m[28][4]), add(mul(v[1], m[28][5]), add(mul(v[2], m[28][6]), mul(v[3], m[28][7]))))); + tmp[29] = add(tmp[29], add(mul(v[0], m[29][4]), add(mul(v[1], m[29][5]), add(mul(v[2], m[29][6]), mul(v[3], m[29][7]))))); + tmp[30] = add(tmp[30], add(mul(v[0], m[30][4]), add(mul(v[1], m[30][5]), add(mul(v[2], m[30][6]), mul(v[3], m[30][7]))))); + tmp[31] = add(tmp[31], add(mul(v[0], m[31][4]), add(mul(v[1], m[31][5]), add(mul(v[2], m[31][6]), mul(v[3], m[31][7]))))); + + v[0] = psi[I + d3]; 
+ v[1] = psi[I + d0 + d3]; + v[2] = psi[I + d1 + d3]; + v[3] = psi[I + d0 + d1 + d3]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][8]), add(mul(v[1], m[0][9]), add(mul(v[2], m[0][10]), mul(v[3], m[0][11]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][8]), add(mul(v[1], m[1][9]), add(mul(v[2], m[1][10]), mul(v[3], m[1][11]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][8]), add(mul(v[1], m[2][9]), add(mul(v[2], m[2][10]), mul(v[3], m[2][11]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][8]), add(mul(v[1], m[3][9]), add(mul(v[2], m[3][10]), mul(v[3], m[3][11]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][8]), add(mul(v[1], m[4][9]), add(mul(v[2], m[4][10]), mul(v[3], m[4][11]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][8]), add(mul(v[1], m[5][9]), add(mul(v[2], m[5][10]), mul(v[3], m[5][11]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][8]), add(mul(v[1], m[6][9]), add(mul(v[2], m[6][10]), mul(v[3], m[6][11]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][8]), add(mul(v[1], m[7][9]), add(mul(v[2], m[7][10]), mul(v[3], m[7][11]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][8]), add(mul(v[1], m[8][9]), add(mul(v[2], m[8][10]), mul(v[3], m[8][11]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][8]), add(mul(v[1], m[9][9]), add(mul(v[2], m[9][10]), mul(v[3], m[9][11]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][8]), add(mul(v[1], m[10][9]), add(mul(v[2], m[10][10]), mul(v[3], m[10][11]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][8]), add(mul(v[1], m[11][9]), add(mul(v[2], m[11][10]), mul(v[3], m[11][11]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][8]), add(mul(v[1], m[12][9]), add(mul(v[2], m[12][10]), mul(v[3], m[12][11]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][8]), add(mul(v[1], m[13][9]), add(mul(v[2], m[13][10]), mul(v[3], m[13][11]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][8]), add(mul(v[1], m[14][9]), add(mul(v[2], m[14][10]), mul(v[3], m[14][11]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][8]), add(mul(v[1], 
m[15][9]), add(mul(v[2], m[15][10]), mul(v[3], m[15][11]))))); + tmp[16] = add(tmp[16], add(mul(v[0], m[16][8]), add(mul(v[1], m[16][9]), add(mul(v[2], m[16][10]), mul(v[3], m[16][11]))))); + tmp[17] = add(tmp[17], add(mul(v[0], m[17][8]), add(mul(v[1], m[17][9]), add(mul(v[2], m[17][10]), mul(v[3], m[17][11]))))); + tmp[18] = add(tmp[18], add(mul(v[0], m[18][8]), add(mul(v[1], m[18][9]), add(mul(v[2], m[18][10]), mul(v[3], m[18][11]))))); + tmp[19] = add(tmp[19], add(mul(v[0], m[19][8]), add(mul(v[1], m[19][9]), add(mul(v[2], m[19][10]), mul(v[3], m[19][11]))))); + tmp[20] = add(tmp[20], add(mul(v[0], m[20][8]), add(mul(v[1], m[20][9]), add(mul(v[2], m[20][10]), mul(v[3], m[20][11]))))); + tmp[21] = add(tmp[21], add(mul(v[0], m[21][8]), add(mul(v[1], m[21][9]), add(mul(v[2], m[21][10]), mul(v[3], m[21][11]))))); + tmp[22] = add(tmp[22], add(mul(v[0], m[22][8]), add(mul(v[1], m[22][9]), add(mul(v[2], m[22][10]), mul(v[3], m[22][11]))))); + tmp[23] = add(tmp[23], add(mul(v[0], m[23][8]), add(mul(v[1], m[23][9]), add(mul(v[2], m[23][10]), mul(v[3], m[23][11]))))); + tmp[24] = add(tmp[24], add(mul(v[0], m[24][8]), add(mul(v[1], m[24][9]), add(mul(v[2], m[24][10]), mul(v[3], m[24][11]))))); + tmp[25] = add(tmp[25], add(mul(v[0], m[25][8]), add(mul(v[1], m[25][9]), add(mul(v[2], m[25][10]), mul(v[3], m[25][11]))))); + tmp[26] = add(tmp[26], add(mul(v[0], m[26][8]), add(mul(v[1], m[26][9]), add(mul(v[2], m[26][10]), mul(v[3], m[26][11]))))); + tmp[27] = add(tmp[27], add(mul(v[0], m[27][8]), add(mul(v[1], m[27][9]), add(mul(v[2], m[27][10]), mul(v[3], m[27][11]))))); + tmp[28] = add(tmp[28], add(mul(v[0], m[28][8]), add(mul(v[1], m[28][9]), add(mul(v[2], m[28][10]), mul(v[3], m[28][11]))))); + tmp[29] = add(tmp[29], add(mul(v[0], m[29][8]), add(mul(v[1], m[29][9]), add(mul(v[2], m[29][10]), mul(v[3], m[29][11]))))); + tmp[30] = add(tmp[30], add(mul(v[0], m[30][8]), add(mul(v[1], m[30][9]), add(mul(v[2], m[30][10]), mul(v[3], m[30][11]))))); + tmp[31] = add(tmp[31], 
add(mul(v[0], m[31][8]), add(mul(v[1], m[31][9]), add(mul(v[2], m[31][10]), mul(v[3], m[31][11]))))); + + v[0] = psi[I + d2 + d3]; + v[1] = psi[I + d0 + d2 + d3]; + v[2] = psi[I + d1 + d2 + d3]; + v[3] = psi[I + d0 + d1 + d2 + d3]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][12]), add(mul(v[1], m[0][13]), add(mul(v[2], m[0][14]), mul(v[3], m[0][15]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][12]), add(mul(v[1], m[1][13]), add(mul(v[2], m[1][14]), mul(v[3], m[1][15]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][12]), add(mul(v[1], m[2][13]), add(mul(v[2], m[2][14]), mul(v[3], m[2][15]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][12]), add(mul(v[1], m[3][13]), add(mul(v[2], m[3][14]), mul(v[3], m[3][15]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][12]), add(mul(v[1], m[4][13]), add(mul(v[2], m[4][14]), mul(v[3], m[4][15]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][12]), add(mul(v[1], m[5][13]), add(mul(v[2], m[5][14]), mul(v[3], m[5][15]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][12]), add(mul(v[1], m[6][13]), add(mul(v[2], m[6][14]), mul(v[3], m[6][15]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][12]), add(mul(v[1], m[7][13]), add(mul(v[2], m[7][14]), mul(v[3], m[7][15]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][12]), add(mul(v[1], m[8][13]), add(mul(v[2], m[8][14]), mul(v[3], m[8][15]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][12]), add(mul(v[1], m[9][13]), add(mul(v[2], m[9][14]), mul(v[3], m[9][15]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][12]), add(mul(v[1], m[10][13]), add(mul(v[2], m[10][14]), mul(v[3], m[10][15]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][12]), add(mul(v[1], m[11][13]), add(mul(v[2], m[11][14]), mul(v[3], m[11][15]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][12]), add(mul(v[1], m[12][13]), add(mul(v[2], m[12][14]), mul(v[3], m[12][15]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][12]), add(mul(v[1], m[13][13]), add(mul(v[2], m[13][14]), mul(v[3], m[13][15]))))); + tmp[14] = 
add(tmp[14], add(mul(v[0], m[14][12]), add(mul(v[1], m[14][13]), add(mul(v[2], m[14][14]), mul(v[3], m[14][15]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][12]), add(mul(v[1], m[15][13]), add(mul(v[2], m[15][14]), mul(v[3], m[15][15]))))); + tmp[16] = add(tmp[16], add(mul(v[0], m[16][12]), add(mul(v[1], m[16][13]), add(mul(v[2], m[16][14]), mul(v[3], m[16][15]))))); + tmp[17] = add(tmp[17], add(mul(v[0], m[17][12]), add(mul(v[1], m[17][13]), add(mul(v[2], m[17][14]), mul(v[3], m[17][15]))))); + tmp[18] = add(tmp[18], add(mul(v[0], m[18][12]), add(mul(v[1], m[18][13]), add(mul(v[2], m[18][14]), mul(v[3], m[18][15]))))); + tmp[19] = add(tmp[19], add(mul(v[0], m[19][12]), add(mul(v[1], m[19][13]), add(mul(v[2], m[19][14]), mul(v[3], m[19][15]))))); + tmp[20] = add(tmp[20], add(mul(v[0], m[20][12]), add(mul(v[1], m[20][13]), add(mul(v[2], m[20][14]), mul(v[3], m[20][15]))))); + tmp[21] = add(tmp[21], add(mul(v[0], m[21][12]), add(mul(v[1], m[21][13]), add(mul(v[2], m[21][14]), mul(v[3], m[21][15]))))); + tmp[22] = add(tmp[22], add(mul(v[0], m[22][12]), add(mul(v[1], m[22][13]), add(mul(v[2], m[22][14]), mul(v[3], m[22][15]))))); + tmp[23] = add(tmp[23], add(mul(v[0], m[23][12]), add(mul(v[1], m[23][13]), add(mul(v[2], m[23][14]), mul(v[3], m[23][15]))))); + tmp[24] = add(tmp[24], add(mul(v[0], m[24][12]), add(mul(v[1], m[24][13]), add(mul(v[2], m[24][14]), mul(v[3], m[24][15]))))); + tmp[25] = add(tmp[25], add(mul(v[0], m[25][12]), add(mul(v[1], m[25][13]), add(mul(v[2], m[25][14]), mul(v[3], m[25][15]))))); + tmp[26] = add(tmp[26], add(mul(v[0], m[26][12]), add(mul(v[1], m[26][13]), add(mul(v[2], m[26][14]), mul(v[3], m[26][15]))))); + tmp[27] = add(tmp[27], add(mul(v[0], m[27][12]), add(mul(v[1], m[27][13]), add(mul(v[2], m[27][14]), mul(v[3], m[27][15]))))); + tmp[28] = add(tmp[28], add(mul(v[0], m[28][12]), add(mul(v[1], m[28][13]), add(mul(v[2], m[28][14]), mul(v[3], m[28][15]))))); + tmp[29] = add(tmp[29], add(mul(v[0], m[29][12]), add(mul(v[1], m[29][13]), 
add(mul(v[2], m[29][14]), mul(v[3], m[29][15]))))); + tmp[30] = add(tmp[30], add(mul(v[0], m[30][12]), add(mul(v[1], m[30][13]), add(mul(v[2], m[30][14]), mul(v[3], m[30][15]))))); + tmp[31] = add(tmp[31], add(mul(v[0], m[31][12]), add(mul(v[1], m[31][13]), add(mul(v[2], m[31][14]), mul(v[3], m[31][15]))))); + + v[0] = psi[I + d4]; + v[1] = psi[I + d0 + d4]; + v[2] = psi[I + d1 + d4]; + v[3] = psi[I + d0 + d1 + d4]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][16]), add(mul(v[1], m[0][17]), add(mul(v[2], m[0][18]), mul(v[3], m[0][19]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][16]), add(mul(v[1], m[1][17]), add(mul(v[2], m[1][18]), mul(v[3], m[1][19]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][16]), add(mul(v[1], m[2][17]), add(mul(v[2], m[2][18]), mul(v[3], m[2][19]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][16]), add(mul(v[1], m[3][17]), add(mul(v[2], m[3][18]), mul(v[3], m[3][19]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][16]), add(mul(v[1], m[4][17]), add(mul(v[2], m[4][18]), mul(v[3], m[4][19]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][16]), add(mul(v[1], m[5][17]), add(mul(v[2], m[5][18]), mul(v[3], m[5][19]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][16]), add(mul(v[1], m[6][17]), add(mul(v[2], m[6][18]), mul(v[3], m[6][19]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][16]), add(mul(v[1], m[7][17]), add(mul(v[2], m[7][18]), mul(v[3], m[7][19]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][16]), add(mul(v[1], m[8][17]), add(mul(v[2], m[8][18]), mul(v[3], m[8][19]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][16]), add(mul(v[1], m[9][17]), add(mul(v[2], m[9][18]), mul(v[3], m[9][19]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][16]), add(mul(v[1], m[10][17]), add(mul(v[2], m[10][18]), mul(v[3], m[10][19]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][16]), add(mul(v[1], m[11][17]), add(mul(v[2], m[11][18]), mul(v[3], m[11][19]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][16]), add(mul(v[1], m[12][17]), 
add(mul(v[2], m[12][18]), mul(v[3], m[12][19]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][16]), add(mul(v[1], m[13][17]), add(mul(v[2], m[13][18]), mul(v[3], m[13][19]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][16]), add(mul(v[1], m[14][17]), add(mul(v[2], m[14][18]), mul(v[3], m[14][19]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][16]), add(mul(v[1], m[15][17]), add(mul(v[2], m[15][18]), mul(v[3], m[15][19]))))); + tmp[16] = add(tmp[16], add(mul(v[0], m[16][16]), add(mul(v[1], m[16][17]), add(mul(v[2], m[16][18]), mul(v[3], m[16][19]))))); + tmp[17] = add(tmp[17], add(mul(v[0], m[17][16]), add(mul(v[1], m[17][17]), add(mul(v[2], m[17][18]), mul(v[3], m[17][19]))))); + tmp[18] = add(tmp[18], add(mul(v[0], m[18][16]), add(mul(v[1], m[18][17]), add(mul(v[2], m[18][18]), mul(v[3], m[18][19]))))); + tmp[19] = add(tmp[19], add(mul(v[0], m[19][16]), add(mul(v[1], m[19][17]), add(mul(v[2], m[19][18]), mul(v[3], m[19][19]))))); + tmp[20] = add(tmp[20], add(mul(v[0], m[20][16]), add(mul(v[1], m[20][17]), add(mul(v[2], m[20][18]), mul(v[3], m[20][19]))))); + tmp[21] = add(tmp[21], add(mul(v[0], m[21][16]), add(mul(v[1], m[21][17]), add(mul(v[2], m[21][18]), mul(v[3], m[21][19]))))); + tmp[22] = add(tmp[22], add(mul(v[0], m[22][16]), add(mul(v[1], m[22][17]), add(mul(v[2], m[22][18]), mul(v[3], m[22][19]))))); + tmp[23] = add(tmp[23], add(mul(v[0], m[23][16]), add(mul(v[1], m[23][17]), add(mul(v[2], m[23][18]), mul(v[3], m[23][19]))))); + tmp[24] = add(tmp[24], add(mul(v[0], m[24][16]), add(mul(v[1], m[24][17]), add(mul(v[2], m[24][18]), mul(v[3], m[24][19]))))); + tmp[25] = add(tmp[25], add(mul(v[0], m[25][16]), add(mul(v[1], m[25][17]), add(mul(v[2], m[25][18]), mul(v[3], m[25][19]))))); + tmp[26] = add(tmp[26], add(mul(v[0], m[26][16]), add(mul(v[1], m[26][17]), add(mul(v[2], m[26][18]), mul(v[3], m[26][19]))))); + tmp[27] = add(tmp[27], add(mul(v[0], m[27][16]), add(mul(v[1], m[27][17]), add(mul(v[2], m[27][18]), mul(v[3], m[27][19]))))); + tmp[28] = 
add(tmp[28], add(mul(v[0], m[28][16]), add(mul(v[1], m[28][17]), add(mul(v[2], m[28][18]), mul(v[3], m[28][19]))))); + tmp[29] = add(tmp[29], add(mul(v[0], m[29][16]), add(mul(v[1], m[29][17]), add(mul(v[2], m[29][18]), mul(v[3], m[29][19]))))); + tmp[30] = add(tmp[30], add(mul(v[0], m[30][16]), add(mul(v[1], m[30][17]), add(mul(v[2], m[30][18]), mul(v[3], m[30][19]))))); + tmp[31] = add(tmp[31], add(mul(v[0], m[31][16]), add(mul(v[1], m[31][17]), add(mul(v[2], m[31][18]), mul(v[3], m[31][19]))))); + + v[0] = psi[I + d2 + d4]; + v[1] = psi[I + d0 + d2 + d4]; + v[2] = psi[I + d1 + d2 + d4]; + v[3] = psi[I + d0 + d1 + d2 + d4]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][20]), add(mul(v[1], m[0][21]), add(mul(v[2], m[0][22]), mul(v[3], m[0][23]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][20]), add(mul(v[1], m[1][21]), add(mul(v[2], m[1][22]), mul(v[3], m[1][23]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][20]), add(mul(v[1], m[2][21]), add(mul(v[2], m[2][22]), mul(v[3], m[2][23]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][20]), add(mul(v[1], m[3][21]), add(mul(v[2], m[3][22]), mul(v[3], m[3][23]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][20]), add(mul(v[1], m[4][21]), add(mul(v[2], m[4][22]), mul(v[3], m[4][23]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][20]), add(mul(v[1], m[5][21]), add(mul(v[2], m[5][22]), mul(v[3], m[5][23]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][20]), add(mul(v[1], m[6][21]), add(mul(v[2], m[6][22]), mul(v[3], m[6][23]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][20]), add(mul(v[1], m[7][21]), add(mul(v[2], m[7][22]), mul(v[3], m[7][23]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][20]), add(mul(v[1], m[8][21]), add(mul(v[2], m[8][22]), mul(v[3], m[8][23]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][20]), add(mul(v[1], m[9][21]), add(mul(v[2], m[9][22]), mul(v[3], m[9][23]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][20]), add(mul(v[1], m[10][21]), add(mul(v[2], m[10][22]), mul(v[3], m[10][23]))))); + 
tmp[11] = add(tmp[11], add(mul(v[0], m[11][20]), add(mul(v[1], m[11][21]), add(mul(v[2], m[11][22]), mul(v[3], m[11][23]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][20]), add(mul(v[1], m[12][21]), add(mul(v[2], m[12][22]), mul(v[3], m[12][23]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][20]), add(mul(v[1], m[13][21]), add(mul(v[2], m[13][22]), mul(v[3], m[13][23]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][20]), add(mul(v[1], m[14][21]), add(mul(v[2], m[14][22]), mul(v[3], m[14][23]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][20]), add(mul(v[1], m[15][21]), add(mul(v[2], m[15][22]), mul(v[3], m[15][23]))))); + tmp[16] = add(tmp[16], add(mul(v[0], m[16][20]), add(mul(v[1], m[16][21]), add(mul(v[2], m[16][22]), mul(v[3], m[16][23]))))); + tmp[17] = add(tmp[17], add(mul(v[0], m[17][20]), add(mul(v[1], m[17][21]), add(mul(v[2], m[17][22]), mul(v[3], m[17][23]))))); + tmp[18] = add(tmp[18], add(mul(v[0], m[18][20]), add(mul(v[1], m[18][21]), add(mul(v[2], m[18][22]), mul(v[3], m[18][23]))))); + tmp[19] = add(tmp[19], add(mul(v[0], m[19][20]), add(mul(v[1], m[19][21]), add(mul(v[2], m[19][22]), mul(v[3], m[19][23]))))); + tmp[20] = add(tmp[20], add(mul(v[0], m[20][20]), add(mul(v[1], m[20][21]), add(mul(v[2], m[20][22]), mul(v[3], m[20][23]))))); + tmp[21] = add(tmp[21], add(mul(v[0], m[21][20]), add(mul(v[1], m[21][21]), add(mul(v[2], m[21][22]), mul(v[3], m[21][23]))))); + tmp[22] = add(tmp[22], add(mul(v[0], m[22][20]), add(mul(v[1], m[22][21]), add(mul(v[2], m[22][22]), mul(v[3], m[22][23]))))); + tmp[23] = add(tmp[23], add(mul(v[0], m[23][20]), add(mul(v[1], m[23][21]), add(mul(v[2], m[23][22]), mul(v[3], m[23][23]))))); + tmp[24] = add(tmp[24], add(mul(v[0], m[24][20]), add(mul(v[1], m[24][21]), add(mul(v[2], m[24][22]), mul(v[3], m[24][23]))))); + tmp[25] = add(tmp[25], add(mul(v[0], m[25][20]), add(mul(v[1], m[25][21]), add(mul(v[2], m[25][22]), mul(v[3], m[25][23]))))); + tmp[26] = add(tmp[26], add(mul(v[0], m[26][20]), add(mul(v[1], 
m[26][21]), add(mul(v[2], m[26][22]), mul(v[3], m[26][23]))))); + tmp[27] = add(tmp[27], add(mul(v[0], m[27][20]), add(mul(v[1], m[27][21]), add(mul(v[2], m[27][22]), mul(v[3], m[27][23]))))); + tmp[28] = add(tmp[28], add(mul(v[0], m[28][20]), add(mul(v[1], m[28][21]), add(mul(v[2], m[28][22]), mul(v[3], m[28][23]))))); + tmp[29] = add(tmp[29], add(mul(v[0], m[29][20]), add(mul(v[1], m[29][21]), add(mul(v[2], m[29][22]), mul(v[3], m[29][23]))))); + tmp[30] = add(tmp[30], add(mul(v[0], m[30][20]), add(mul(v[1], m[30][21]), add(mul(v[2], m[30][22]), mul(v[3], m[30][23]))))); + tmp[31] = add(tmp[31], add(mul(v[0], m[31][20]), add(mul(v[1], m[31][21]), add(mul(v[2], m[31][22]), mul(v[3], m[31][23]))))); + + v[0] = psi[I + d3 + d4]; + v[1] = psi[I + d0 + d3 + d4]; + v[2] = psi[I + d1 + d3 + d4]; + v[3] = psi[I + d0 + d1 + d3 + d4]; + + tmp[0] = add(tmp[0], add(mul(v[0], m[0][24]), add(mul(v[1], m[0][25]), add(mul(v[2], m[0][26]), mul(v[3], m[0][27]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[1][24]), add(mul(v[1], m[1][25]), add(mul(v[2], m[1][26]), mul(v[3], m[1][27]))))); + tmp[2] = add(tmp[2], add(mul(v[0], m[2][24]), add(mul(v[1], m[2][25]), add(mul(v[2], m[2][26]), mul(v[3], m[2][27]))))); + tmp[3] = add(tmp[3], add(mul(v[0], m[3][24]), add(mul(v[1], m[3][25]), add(mul(v[2], m[3][26]), mul(v[3], m[3][27]))))); + tmp[4] = add(tmp[4], add(mul(v[0], m[4][24]), add(mul(v[1], m[4][25]), add(mul(v[2], m[4][26]), mul(v[3], m[4][27]))))); + tmp[5] = add(tmp[5], add(mul(v[0], m[5][24]), add(mul(v[1], m[5][25]), add(mul(v[2], m[5][26]), mul(v[3], m[5][27]))))); + tmp[6] = add(tmp[6], add(mul(v[0], m[6][24]), add(mul(v[1], m[6][25]), add(mul(v[2], m[6][26]), mul(v[3], m[6][27]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[7][24]), add(mul(v[1], m[7][25]), add(mul(v[2], m[7][26]), mul(v[3], m[7][27]))))); + tmp[8] = add(tmp[8], add(mul(v[0], m[8][24]), add(mul(v[1], m[8][25]), add(mul(v[2], m[8][26]), mul(v[3], m[8][27]))))); + tmp[9] = add(tmp[9], add(mul(v[0], m[9][24]), 
add(mul(v[1], m[9][25]), add(mul(v[2], m[9][26]), mul(v[3], m[9][27]))))); + tmp[10] = add(tmp[10], add(mul(v[0], m[10][24]), add(mul(v[1], m[10][25]), add(mul(v[2], m[10][26]), mul(v[3], m[10][27]))))); + tmp[11] = add(tmp[11], add(mul(v[0], m[11][24]), add(mul(v[1], m[11][25]), add(mul(v[2], m[11][26]), mul(v[3], m[11][27]))))); + tmp[12] = add(tmp[12], add(mul(v[0], m[12][24]), add(mul(v[1], m[12][25]), add(mul(v[2], m[12][26]), mul(v[3], m[12][27]))))); + tmp[13] = add(tmp[13], add(mul(v[0], m[13][24]), add(mul(v[1], m[13][25]), add(mul(v[2], m[13][26]), mul(v[3], m[13][27]))))); + tmp[14] = add(tmp[14], add(mul(v[0], m[14][24]), add(mul(v[1], m[14][25]), add(mul(v[2], m[14][26]), mul(v[3], m[14][27]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[15][24]), add(mul(v[1], m[15][25]), add(mul(v[2], m[15][26]), mul(v[3], m[15][27]))))); + tmp[16] = add(tmp[16], add(mul(v[0], m[16][24]), add(mul(v[1], m[16][25]), add(mul(v[2], m[16][26]), mul(v[3], m[16][27]))))); + tmp[17] = add(tmp[17], add(mul(v[0], m[17][24]), add(mul(v[1], m[17][25]), add(mul(v[2], m[17][26]), mul(v[3], m[17][27]))))); + tmp[18] = add(tmp[18], add(mul(v[0], m[18][24]), add(mul(v[1], m[18][25]), add(mul(v[2], m[18][26]), mul(v[3], m[18][27]))))); + tmp[19] = add(tmp[19], add(mul(v[0], m[19][24]), add(mul(v[1], m[19][25]), add(mul(v[2], m[19][26]), mul(v[3], m[19][27]))))); + tmp[20] = add(tmp[20], add(mul(v[0], m[20][24]), add(mul(v[1], m[20][25]), add(mul(v[2], m[20][26]), mul(v[3], m[20][27]))))); + tmp[21] = add(tmp[21], add(mul(v[0], m[21][24]), add(mul(v[1], m[21][25]), add(mul(v[2], m[21][26]), mul(v[3], m[21][27]))))); + tmp[22] = add(tmp[22], add(mul(v[0], m[22][24]), add(mul(v[1], m[22][25]), add(mul(v[2], m[22][26]), mul(v[3], m[22][27]))))); + tmp[23] = add(tmp[23], add(mul(v[0], m[23][24]), add(mul(v[1], m[23][25]), add(mul(v[2], m[23][26]), mul(v[3], m[23][27]))))); + tmp[24] = add(tmp[24], add(mul(v[0], m[24][24]), add(mul(v[1], m[24][25]), add(mul(v[2], m[24][26]), mul(v[3], 
m[24][27]))))); + tmp[25] = add(tmp[25], add(mul(v[0], m[25][24]), add(mul(v[1], m[25][25]), add(mul(v[2], m[25][26]), mul(v[3], m[25][27]))))); + tmp[26] = add(tmp[26], add(mul(v[0], m[26][24]), add(mul(v[1], m[26][25]), add(mul(v[2], m[26][26]), mul(v[3], m[26][27]))))); + tmp[27] = add(tmp[27], add(mul(v[0], m[27][24]), add(mul(v[1], m[27][25]), add(mul(v[2], m[27][26]), mul(v[3], m[27][27]))))); + tmp[28] = add(tmp[28], add(mul(v[0], m[28][24]), add(mul(v[1], m[28][25]), add(mul(v[2], m[28][26]), mul(v[3], m[28][27]))))); + tmp[29] = add(tmp[29], add(mul(v[0], m[29][24]), add(mul(v[1], m[29][25]), add(mul(v[2], m[29][26]), mul(v[3], m[29][27]))))); + tmp[30] = add(tmp[30], add(mul(v[0], m[30][24]), add(mul(v[1], m[30][25]), add(mul(v[2], m[30][26]), mul(v[3], m[30][27]))))); + tmp[31] = add(tmp[31], add(mul(v[0], m[31][24]), add(mul(v[1], m[31][25]), add(mul(v[2], m[31][26]), mul(v[3], m[31][27]))))); + + v[0] = psi[I + d2 + d3 + d4]; + v[1] = psi[I + d0 + d2 + d3 + d4]; + v[2] = psi[I + d1 + d2 + d3 + d4]; + v[3] = psi[I + d0 + d1 + d2 + d3 + d4]; + + psi[I] = (add(tmp[0], add(mul(v[0], m[0][28]), add(mul(v[1], m[0][29]), add(mul(v[2], m[0][30]), mul(v[3], m[0][31])))))); + psi[I + d0] = (add(tmp[1], add(mul(v[0], m[1][28]), add(mul(v[1], m[1][29]), add(mul(v[2], m[1][30]), mul(v[3], m[1][31])))))); + psi[I + d1] = (add(tmp[2], add(mul(v[0], m[2][28]), add(mul(v[1], m[2][29]), add(mul(v[2], m[2][30]), mul(v[3], m[2][31])))))); + psi[I + d0 + d1] = (add(tmp[3], add(mul(v[0], m[3][28]), add(mul(v[1], m[3][29]), add(mul(v[2], m[3][30]), mul(v[3], m[3][31])))))); + psi[I + d2] = (add(tmp[4], add(mul(v[0], m[4][28]), add(mul(v[1], m[4][29]), add(mul(v[2], m[4][30]), mul(v[3], m[4][31])))))); + psi[I + d0 + d2] = (add(tmp[5], add(mul(v[0], m[5][28]), add(mul(v[1], m[5][29]), add(mul(v[2], m[5][30]), mul(v[3], m[5][31])))))); + psi[I + d1 + d2] = (add(tmp[6], add(mul(v[0], m[6][28]), add(mul(v[1], m[6][29]), add(mul(v[2], m[6][30]), mul(v[3], m[6][31])))))); + psi[I 
+ d0 + d1 + d2] = (add(tmp[7], add(mul(v[0], m[7][28]), add(mul(v[1], m[7][29]), add(mul(v[2], m[7][30]), mul(v[3], m[7][31])))))); + psi[I + d3] = (add(tmp[8], add(mul(v[0], m[8][28]), add(mul(v[1], m[8][29]), add(mul(v[2], m[8][30]), mul(v[3], m[8][31])))))); + psi[I + d0 + d3] = (add(tmp[9], add(mul(v[0], m[9][28]), add(mul(v[1], m[9][29]), add(mul(v[2], m[9][30]), mul(v[3], m[9][31])))))); + psi[I + d1 + d3] = (add(tmp[10], add(mul(v[0], m[10][28]), add(mul(v[1], m[10][29]), add(mul(v[2], m[10][30]), mul(v[3], m[10][31])))))); + psi[I + d0 + d1 + d3] = (add(tmp[11], add(mul(v[0], m[11][28]), add(mul(v[1], m[11][29]), add(mul(v[2], m[11][30]), mul(v[3], m[11][31])))))); + psi[I + d2 + d3] = (add(tmp[12], add(mul(v[0], m[12][28]), add(mul(v[1], m[12][29]), add(mul(v[2], m[12][30]), mul(v[3], m[12][31])))))); + psi[I + d0 + d2 + d3] = (add(tmp[13], add(mul(v[0], m[13][28]), add(mul(v[1], m[13][29]), add(mul(v[2], m[13][30]), mul(v[3], m[13][31])))))); + psi[I + d1 + d2 + d3] = (add(tmp[14], add(mul(v[0], m[14][28]), add(mul(v[1], m[14][29]), add(mul(v[2], m[14][30]), mul(v[3], m[14][31])))))); + psi[I + d0 + d1 + d2 + d3] = (add(tmp[15], add(mul(v[0], m[15][28]), add(mul(v[1], m[15][29]), add(mul(v[2], m[15][30]), mul(v[3], m[15][31])))))); + psi[I + d4] = (add(tmp[16], add(mul(v[0], m[16][28]), add(mul(v[1], m[16][29]), add(mul(v[2], m[16][30]), mul(v[3], m[16][31])))))); + psi[I + d0 + d4] = (add(tmp[17], add(mul(v[0], m[17][28]), add(mul(v[1], m[17][29]), add(mul(v[2], m[17][30]), mul(v[3], m[17][31])))))); + psi[I + d1 + d4] = (add(tmp[18], add(mul(v[0], m[18][28]), add(mul(v[1], m[18][29]), add(mul(v[2], m[18][30]), mul(v[3], m[18][31])))))); + psi[I + d0 + d1 + d4] = (add(tmp[19], add(mul(v[0], m[19][28]), add(mul(v[1], m[19][29]), add(mul(v[2], m[19][30]), mul(v[3], m[19][31])))))); + psi[I + d2 + d4] = (add(tmp[20], add(mul(v[0], m[20][28]), add(mul(v[1], m[20][29]), add(mul(v[2], m[20][30]), mul(v[3], m[20][31])))))); + psi[I + d0 + d2 + d4] = 
(add(tmp[21], add(mul(v[0], m[21][28]), add(mul(v[1], m[21][29]), add(mul(v[2], m[21][30]), mul(v[3], m[21][31])))))); + psi[I + d1 + d2 + d4] = (add(tmp[22], add(mul(v[0], m[22][28]), add(mul(v[1], m[22][29]), add(mul(v[2], m[22][30]), mul(v[3], m[22][31])))))); + psi[I + d0 + d1 + d2 + d4] = (add(tmp[23], add(mul(v[0], m[23][28]), add(mul(v[1], m[23][29]), add(mul(v[2], m[23][30]), mul(v[3], m[23][31])))))); + psi[I + d3 + d4] = (add(tmp[24], add(mul(v[0], m[24][28]), add(mul(v[1], m[24][29]), add(mul(v[2], m[24][30]), mul(v[3], m[24][31])))))); + psi[I + d0 + d3 + d4] = (add(tmp[25], add(mul(v[0], m[25][28]), add(mul(v[1], m[25][29]), add(mul(v[2], m[25][30]), mul(v[3], m[25][31])))))); + psi[I + d1 + d3 + d4] = (add(tmp[26], add(mul(v[0], m[26][28]), add(mul(v[1], m[26][29]), add(mul(v[2], m[26][30]), mul(v[3], m[26][31])))))); + psi[I + d0 + d1 + d3 + d4] = (add(tmp[27], add(mul(v[0], m[27][28]), add(mul(v[1], m[27][29]), add(mul(v[2], m[27][30]), mul(v[3], m[27][31])))))); + psi[I + d2 + d3 + d4] = (add(tmp[28], add(mul(v[0], m[28][28]), add(mul(v[1], m[28][29]), add(mul(v[2], m[28][30]), mul(v[3], m[28][31])))))); + psi[I + d0 + d2 + d3 + d4] = (add(tmp[29], add(mul(v[0], m[29][28]), add(mul(v[1], m[29][29]), add(mul(v[2], m[29][30]), mul(v[3], m[29][31])))))); + psi[I + d1 + d2 + d3 + d4] = (add(tmp[30], add(mul(v[0], m[30][28]), add(mul(v[1], m[30][29]), add(mul(v[2], m[30][30]), mul(v[3], m[30][31])))))); + psi[I + d0 + d1 + d2 + d3 + d4] = (add(tmp[31], add(mul(v[0], m[31][28]), add(mul(v[1], m[31][29]), add(mul(v[2], m[31][30]), mul(v[3], m[31][31])))))); + +} + +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) +{ + std::size_t ids_sorted[] = { id4, id3, id2, id1, id0 }; + std::sort(ids_sorted, ids_sorted + 5, std::greater()); + std::size_t n = 1UL << (ids_sorted[0] + 1); + std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4; + std::size_t dsorted0 = 1UL << ids_sorted[0], dsorted1 = 1UL << ids_sorted[1], dsorted2 = 1UL << ids_sorted[2], dsorted3 = 1UL << ids_sorted[3], dsorted4 = 1UL << ids_sorted[4]; + + if (ctrlmask == 0){ + #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){ + for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){ + for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){ + for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){ + for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m); + } + } + } + } + } + } + } + else{ + #pragma omp for collapse(LOOP_COLLAPSE5) schedule(static) + for (omp::idx_t i0 = 0; i0 < n; i0 += 2 * dsorted0){ + for (omp::idx_t i1 = 0; i1 < dsorted0; i1 += 2 * dsorted1){ + for (omp::idx_t i2 = 0; i2 < dsorted1; i2 += 2 * dsorted2){ + for (omp::idx_t i3 = 0; i3 < dsorted2; i3 += 2 * dsorted3){ + for (omp::idx_t i4 = 0; i4 < dsorted3; i4 += 2 * dsorted4){ + for (omp::idx_t i5 = 0; i5 < dsorted4; ++i5){ + if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m); + } + } + } + } + } + } + } +} diff --git a/ccsrc/include/cppsim/nointrin/kernelgen.py b/ccsrc/include/cppsim/nointrin/kernelgen.py new file mode 100644 index 00000000..0eeb7214 --- /dev/null +++ b/ccsrc/include/cppsim/nointrin/kernelgen.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +import argparse +import 
itertools +import os + +def kernelgen(nqubits, ids=None, m=None, matvec=True, combinations=False): + # If m matrix is given explicitly, do not use matrix-vector form + # of operation, because we expect many terms to be removed, due to + # multiplication by zero m element. + if not (m is None): + matvec = False + + # All combinations of qubits, excluding dupes, e.g. for nqubits = 2: + # 0 0 + # 1 0 + # 0 1 + # 1 1 + combs = list(itertools.product([0, 1], repeat=nqubits)) + + # Pretty-print the indexed PSI array values. + strcombs = [] + for j in range(0, len(combs)): + comb = tuple(reversed(combs[j])) + strcomb = 'psi[I'.format(j) + for i in range(0, nqubits): + if comb[i] != 0: + strcomb += " + d{}".format(i) + strcomb += ']'; + strcombs.append(strcomb) + + left = '_' + right = '' + if matvec: + left = '[' + right = ']' + + # Pretty-print the right hand sides (recursively). + if m is None: + def rhs(n, j, i): + if i < n - 1: + return f'add(mul(v{left}{i}{right}, M({j}, {i})), ' + rhs(n, j, i + 1) + ''.join(')') + else: + return f'mul(v{left}{i}{right}, M({j}, {i})' + else: + def rhs(n, j, i): + if i < n - 1: + if m(j, i) != 0: + return f'add(mul(v{left}{i}{right}, M({j}, {i})), ' + rhs(n, j, i + 1) + ''.join(')') + else: + return rhs(n, j, i + 1) + else: + if m(j, i) != 0: + return f'mul(v{left}{i}{right}, M({j}, {i})' + else: + return '0' + + strrhs = [] + for j in range(0, len(strcombs)): + strrhs.append(rhs(len(strcombs), j, 0)) + + ids_sorted = [] + if ids != None: + ids_sorted = sorted(ids, reverse = True) + + # Some string constants clash with the {} syntax of print(), so we + # substitute them as constants. 
+ newline = "\n" + + kernel = \ +""" +{include} +{include} +{include} +{include} +{eigen} +{combinations} + +{define} add(a, b) (a + b) +{define} mul(a, b) (a * b) + +{define} M(j, i) (m[j * {n} + i]) + +template<{d_template}class T> +inline void kernel_core(T* psi, std::size_t I{d_var}, const T* m) +{{ + {v} + {matvec} + {psi_assign} +}} + +{undef} add +{undef} mul +{undef} M +""".format( \ + include = "#include", \ + define = "#define", \ + undef = "#undef", \ + nqubits = nqubits, \ + eigen = "{define} EIGEN_DEFAULT_DENSE_INDEX_TYPE int{newline}{define} EIGEN_VECTORIZE{newline}{include} ".format(define="#define", newline=newline, include = "#include") if matvec else '', \ + combinations = "#include \"combinations.h\"" if combinations else '', \ + n = len(strcombs), + d_template = ''.join('std::size_t d{}, '.format(i) for i in range (0, nqubits)) if ids != None else '', \ + d_var = ''.join(', std::size_t d{}'.format(i) for i in range (0, nqubits)) if ids == None else '', \ + v = f"const std::array v = {{{newline}" + ''.join('{}{},{}'.format(' ' * 8, strcombs[i], newline) for i in range(0, len(strcombs))) + "{}}};{}".format(' ' * 4, newline) if matvec else ''.join('const auto v_{} = {};{}{}'.format(i, strcombs[i], newline, ' ' * 4) for i in range(0, len(strcombs))), \ + matvec = "const auto result = Eigen::Map>(m) * Eigen::Map>(v.data());{newline}".format(n = len(strcombs), newline = newline) if matvec else '', \ + psi_assign = ''.join('{} = {};{} '.format(strcombs[i], strrhs[i], newline) for i in range(0, len(strcombs))) if not matvec else ''.join('{} = result[{}];{} '.format(strcombs[i], i, newline) for i in range(0, len(strcombs)))) + \ +""" +// bit indices id[.] are given from high to low (e.g. 
control first for CNOT) +template +void kernel(T* psi, {id_var}const T* m, std::size_t ctrlmask) +{{ + {ids_sorted} + {sort} + {constexpr}std::size_t {n}; + {constexpr}std::size_t {d}; + {constexpr}std::size_t {dsorted}; + + if (ctrlmask == 0){{ + {pragma} omp for collapse({collapse}) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * {dsorted_0}){{ +{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < {dsorted_last}; ++i{nqubits}){{ + {offset_1}kernel_core{d_template}(psi, {i}, {d_args}m); + {offset}}} + }} + }} + else{{ + {pragma} omp for collapse({collapse}) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * {dsorted_0}){{ +{for_loops}{offset_2}for (std::size_t i{nqubits} = 0; i{nqubits} < {dsorted_last}; ++i{nqubits}){{ + {offset_1}if ((({i})&ctrlmask) == ctrlmask) + {offset_2}kernel_core{d_template}(psi, {i}, {d_args}m); + {offset}}} + }} + }} +}} + +""".format( \ + pragma = "#pragma", \ + nqubits = nqubits, \ + id_var = ''.join('unsigned id{}, '.format(nqubits - i - 1) for i in range (0, nqubits)) if ids == None else '', \ + constexpr = 'constexpr ' if ids != None else '', + d = f"d0 = 1UL << {'id0' if ids == None else ids[0]}{''.join(', d{} = 1UL << {}'.format(i, 'id{}'.format(i) if ids == None else ids[i]) for i in range (1, nqubits))}", \ + d_args = ''.join('d{}, '.format(i) for i in range (0, nqubits)) if ids == None else '', \ + n = 'n = 1UL << (ids_sorted[0] + 1)' if ids == None else f'n = 1UL << {ids_sorted[0] + 1}', \ + ids_sorted = (f"{'constexpr ' if ids != None else ''}std::size_t ids_sorted[] = {{ id{nqubits - 1}" + ''.join(', id{}'.format(nqubits - i - 1) for i in range (1, nqubits)) + f" }};") if ids == None else '', \ + sort = f'std::sort(ids_sorted, ids_sorted + {nqubits}, std::greater());' if ids == None else '', \ + dsorted = f"dsorted0 = 1UL << {'ids_sorted[0]' if ids == None else ids_sorted[0]}{''.join(', dsorted{} = 1UL << {}'.format(i, 'ids_sorted[{}]'.format(i) if ids == None else ids_sorted[i]) for i 
in range (1, nqubits))}", \ + dsorted_0 = "dsorted0", \ + dsorted_last = f"dsorted{nqubits - 1}", \ + collapse = f"{nqubits + 1}", \ + offset = ''.join(' '.format(i) for i in range (0, nqubits)), \ + offset_1 = ''.join(' '.format(i) for i in range (0, nqubits + 1)), \ + offset_2 = ''.join(' '.format(i) for i in range (0, nqubits + 2)), \ + d_template = ('') if ids != None else '', \ + i = 'i0' + ''.join(' + i{}'.format(i) for i in range (1, nqubits + 1)), \ + for_loops = ''.join('{}for (std::size_t i{} = 0; i{} < dsorted{}; i{} += 2 * dsorted{}){}'.format(''.join(' ' for j in range(0, i + 2)), i, i, i - 1, i, i, newline) for i in range (1, nqubits))) + + return kernel + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Generate Haener-Steiger quantum kernels in the form used in ProjectQ simulator') + parser.add_argument('nqubits', type=int, help='the number of qubits to generate the kernel for') + parser.add_argument('output', type=str, help='output file name') + parser.add_argument('-matvec', '--matvec', nargs='?', type=bool, default=True, help='kernel core implementation: as a matrix-vector operation (True, default), or as an explicit formula (False)') + parser.add_argument('-combinations', '--combinations', nargs='?', type=bool, default=True, help='multithreading implementation: combinations partitioner (True, default), or OpenMP collapse (False, only for CPU with small number of cores)') + args = parser.parse_args() + + nqubits = int(args.nqubits) + output = args.output + matvec = args.matvec + combinations = args.combinations + + try: + os.makedirs(os.path.dirname(output)) + except: + pass + with open(output, "w") as o: + o.write(kernelgen(nqubits, matvec=matvec, combinations=combinations)) + diff --git a/ccsrc/include/cppsim/nointrin/kernels.hpp b/ccsrc/include/cppsim/nointrin/kernels.hpp new file mode 100644 index 00000000..81abe524 --- /dev/null +++ b/ccsrc/include/cppsim/nointrin/kernels.hpp @@ -0,0 +1,34 @@ +// Copyright 2017 
ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include // size_t +#include + +#define add(a, b) (a + b) +#define mul(a, b) (a * b) + +#include "cppsim_omp.hpp" + +#include "kernel1.hpp" +#include "kernel2.hpp" +#include "kernel3.hpp" +#include "kernel4.hpp" +#include "kernel5.hpp" + +#undef add +#undef mul + diff --git a/ccsrc/include/cppsim/partitioner.h b/ccsrc/include/cppsim/partitioner.h new file mode 100644 index 00000000..20e5d111 --- /dev/null +++ b/ccsrc/include/cppsim/partitioner.h @@ -0,0 +1,54 @@ +#ifndef COMBINATIONS_PARTITIONER_H +#define COMBINATIONS_PARTITIONER_H + +#include "combinations.h" + +#include + +class Partitioner +{ +public : + + template< + uint32_t ...Args, // Underlying combination parameters + class Starts + > + static void partition(Starts& starts, int& nworkers, uint32_t& maxCombinationsPerWorker) + { + using Combination = typename Combinations::template Combination::type; + + uint32_t totalNumberOfCombinations = Combinations::template popcount(); + maxCombinationsPerWorker = totalNumberOfCombinations / nworkers; + if (totalNumberOfCombinations % nworkers) maxCombinationsPerWorker++; + + // Record starting points for workers' cooperative processing. + starts.reserve(nworkers); + uint32_t i = maxCombinationsPerWorker; + Combinations::template iterate([&](auto... 
args) + { + if (i < maxCombinationsPerWorker) + { + i++; + return; + } + + // Combinations::iterate uses reversed order of starting point indices. + // We revert it here, in order to make the Combinations::iterate + // code more generic. + Combination start { args... }; + Combinations::template reverse(start); + starts.push_back(start); + i = 1; + }); + + // Re-evaluate the number of workers, as their number could be eventually + // smaller than the initially proposed number of workers. + nworkers = starts.size(); + + printf("%u iterations in total, %u workers, %u iterations per worker\n", + totalNumberOfCombinations, nworkers, maxCombinationsPerWorker); + } +}; + +#endif // COMBINATIONS_PARTITIONER_H + diff --git a/ccsrc/include/cppsim/schedule.h b/ccsrc/include/cppsim/schedule.h new file mode 100644 index 00000000..141f2461 --- /dev/null +++ b/ccsrc/include/cppsim/schedule.h @@ -0,0 +1,75 @@ +#ifndef COMBINATIONS_SCHEDULE_H +#define COMBINATIONS_SCHEDULE_H + +#include "combinations.h" + +#if defined(__CUDACC__) || defined(__HIPCC__) +#include "gpu/schedule.h" +#endif +#include "cpu/schedule.h" + +enum BackendPreference +{ + BackendNoPreference = 0, + BackendPreferCPU = 1, + BackendPreferDiscreteGPU = 2, + BackendPreferIntegratedGPU = 3 +}; + +template< + BackendPreference backend = BackendNoPreference> +class Schedule +{ +public : + + // Iterate through combinations with specific starting point and duration. + // For each combination, call a user-provided function. + template< + class Contexts, + uint32_t ...Args, // Underlying combination parameters + class Callable + > + static auto schedule(Callable c, int nworkers = 0) + { + if constexpr (backend == BackendNoPreference) + { + // Prioritize GPU execution, if supported. 
+#if defined(__CUDACC__) || defined(__HIPCC__) + return gpu::make_schedule< + Contexts, + Args...>(c, nworkers); +#else + return cpu::make_schedule< + Contexts, + Args...>(c, nworkers); +#endif + } +#if defined(__CUDACC__) || defined(__HIPCC__) + else if constexpr (backend == BackendPreferDiscreteGPU) + { + return gpu::make_schedule< + Contexts, + Args...>(c, nworkers); + } +#endif + else if constexpr (backend == BackendPreferCPU) + { + return cpu::make_schedule< + Contexts, + Args...>(c, nworkers); + } + else + { + throw std::invalid_argument("Unsupported backend"); + } + } + + template + static void iterate(Contexts& ctxs, Schedule& schedule) + { + schedule.execute(ctxs); + } +}; + +#endif // COMBINATIONS_SCHEDULE_H + diff --git a/ccsrc/include/cppsim/simulator.hpp b/ccsrc/include/cppsim/simulator.hpp new file mode 100644 index 00000000..f3c9df71 --- /dev/null +++ b/ccsrc/include/cppsim/simulator.hpp @@ -0,0 +1,585 @@ +// Copyright 2017 ProjectQ-Framework (www.projectq.ch) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef SIMULATOR_HPP_ +#define SIMULATOR_HPP_ + +#include +#include + +#if defined(NOINTRIN) || !defined(INTRIN) +#include "nointrin/kernels.hpp" +#else +#include "intrin/kernels.hpp" +#endif + +#include "cppsim_omp.hpp" +#include "intrin/alignedallocator.hpp" +#include "fusion.hpp" +#include "kernelgen.hpp" + +#include +#include +#include +#include +#include +#include + + +class Simulator{ +public: + using calc_type = double; + using complex_type = std::complex; + using StateVector = std::vector>; + using Map = std::map; + using RndEngine = std::mt19937; + using Term = std::vector>; + using TermsDict = std::vector>; + using ComplexTermsDict = std::vector>; + + Simulator(unsigned seed = 1) : N_(0), vec_(1,0.), fusion_qubits_min_(4), + fusion_qubits_max_(5), rnd_eng_(seed) { + vec_[0]=1.; // all-zero initial state + std::uniform_real_distribution dist(0., 1.); + rng_ = std::bind(dist, std::ref(rnd_eng_)); + } + + void allocate_qubit(unsigned id){ + if (map_.count(id) == 0){ + map_[id] = N_++; + StateVector newvec; // avoid large memory allocations + if( tmpBuff1_.capacity() >= (1UL << N_) ) + std::swap(newvec, tmpBuff1_); + newvec.resize(1UL << N_); +#pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < newvec.size(); ++i) + newvec[i] = (i < vec_.size())?vec_[i]:0.; + std::swap(vec_, newvec); + // recycle large memory + std::swap(tmpBuff1_, newvec); + if( tmpBuff1_.capacity() < tmpBuff2_.capacity() ) + std::swap(tmpBuff1_, tmpBuff2_); + } + else + throw(std::runtime_error( + "AllocateQubit: ID already exists. 
Qubit IDs should be unique.")); + } + + bool get_classical_value(unsigned id, calc_type tol = 1.e-12){ + run(); + unsigned pos = map_[id]; + std::size_t delta = (1UL << pos); + + for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ + for (std::size_t j = 0; j < delta; ++j){ + if (std::norm(vec_[i+j]) > tol) + return false; + if (std::norm(vec_[i+j+delta]) > tol) + return true; + } + } + assert(false); // this will never happen + return false; // suppress 'control reaches end of non-void...' + } + + bool is_classical(unsigned id, calc_type tol = 1.e-12){ + run(); + unsigned pos = map_[id]; + std::size_t delta = (1UL << pos); + + short up = 0, down = 0; + #pragma omp parallel for schedule(static) reduction(|:up,down) + for (omp::idx_t i = 0; i < vec_.size(); i += 2*delta){ + for (omp::idx_t j = 0; j < delta; ++j){ + up = up | ((std::norm(vec_[i+j]) > tol)&1); + down = down | ((std::norm(vec_[i+j+delta]) > tol)&1); + } + } + + return 1 == (up^down); + } + + void collapse_vector(unsigned id, bool value = false, bool shrink = false){ + run(); + unsigned pos = map_[id]; + std::size_t delta = (1UL << pos); + + if (!shrink){ + #pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); i += 2*delta){ + for (std::size_t j = 0; j < delta; ++j) + vec_[i+j+static_cast(!value)*delta] = 0.; + } + } + else{ + StateVector newvec; // avoid costly memory reallocations + if( tmpBuff1_.capacity() >= (1UL << (N_-1)) ) + std::swap(tmpBuff1_, newvec); + newvec.resize((1UL << (N_-1))); + #pragma omp parallel for schedule(static) if(0) + for (omp::idx_t i = 0; i < vec_.size(); i += 2*delta) + std::copy_n(&vec_[i + static_cast(value)*delta], + delta, &newvec[i/2]); + std::swap(vec_, newvec); + std::swap(tmpBuff1_, newvec); + if( tmpBuff1_.capacity() < tmpBuff2_.capacity() ) + std::swap(tmpBuff1_, tmpBuff2_); + + for (auto& p : map_){ + if (p.second > pos) + p.second--; + } + map_.erase(id); + N_--; + } + } + + void measure_qubits(std::vector const& ids, 
std::vector &res){ + run(); + + std::vector positions(ids.size()); + for (unsigned i = 0; i < ids.size(); ++i) + positions[i] = map_[ids[i]]; + + calc_type P = 0.; + calc_type rnd = rng_(); + + // pick entry at random with probability |entry|^2 + std::size_t pick = 0; + while (P < rnd && pick < vec_.size()) + P += std::norm(vec_[pick++]); + + pick--; + // determine result vector (boolean values for each qubit) + // and create mask to detect bad entries (i.e., entries that don't agree with measurement) + res = std::vector(ids.size()); + std::size_t mask = 0; + std::size_t val = 0; + for (unsigned i = 0; i < ids.size(); ++i){ + bool r = ((pick >> positions[i]) & 1) == 1; + res[i] = r; + mask |= (1UL << positions[i]); + val |= (static_cast(r&1) << positions[i]); + } + // set bad entries to 0 + calc_type N = 0.; + #pragma omp parallel for reduction(+:N) schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i){ + if ((i & mask) != val) + vec_[i] = 0.; + else + N += std::norm(vec_[i]); + } + // re-normalize + N = 1./std::sqrt(N); + #pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i) + vec_[i] *= N; + } + + std::vector measure_qubits_return(std::vector const& ids){ + std::vector ret; + measure_qubits(ids, ret); + return ret; + } + + void deallocate_qubit(unsigned id){ + run(); + assert(map_.count(id) == 1); + if (!is_classical(id)) + throw(std::runtime_error("Error: Qubit has not been measured / uncomputed! 
There is most likely a bug in your code.")); + + bool value = get_classical_value(id); + collapse_vector(id, value, true); + } + + template + void apply_controlled_gate(M const& m, const std::vector& ids, + const std::vector& ctrl){ + auto fused_gates = fused_gates_; + fused_gates.insert(m, ids, ctrl); + + if (fused_gates.num_qubits() >= fusion_qubits_min_ + && fused_gates.num_qubits() <= fusion_qubits_max_){ + fused_gates_ = fused_gates; + run(); + } + else if (fused_gates.num_qubits() > fusion_qubits_max_ + || (fused_gates.num_qubits() - ids.size()) > fused_gates_.num_qubits()){ + run(); + fused_gates_.insert(m, ids, ctrl); + } + else + fused_gates_ = fused_gates; + } + + template + void emulate_math(F const& f, QuReg quregs, const std::vector& ctrl, + bool parallelize = false){ + run(); + auto ctrlmask = get_control_mask(ctrl); + + for (unsigned i = 0; i < quregs.size(); ++i) + for (unsigned j = 0; j < quregs[i].size(); ++j) + quregs[i][j] = map_[quregs[i][j]]; + + StateVector newvec; // avoid costly memory reallocations + if( tmpBuff1_.capacity() >= vec_.size() ) + std::swap(newvec, tmpBuff1_); + newvec.resize(vec_.size()); +#pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); i++) + newvec[i] = 0; + +//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5 + { + std::vector res(quregs.size()); + //#pragma omp for schedule(static) + for (std::size_t i = 0; i < vec_.size(); ++i){ + if ((ctrlmask&i) == ctrlmask){ + for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){ + res[qr_i] = 0; + for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i) + res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i; + } + f(res); + auto new_i = i; + for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){ + for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){ + if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1))) + new_i ^= (1UL << quregs[qr_i][qb_i]); + } + } + newvec[new_i] += vec_[i]; 
+ } + else + newvec[i] += vec_[i]; + } + } + std::swap(vec_, newvec); + std::swap(tmpBuff1_, newvec); + } + + // faster version without calling python + template + inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector& ctrl) + { + emulate_math([a](std::vector &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true); + } + + // faster version without calling python + template + inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector& ctrl) + { + emulate_math([a,N](std::vector &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true); + } + + // faster version without calling python + template + inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector& ctrl) + { + emulate_math([a,N](std::vector &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true); + } + + calc_type get_expectation_value(TermsDict const& td, std::vector const& ids){ + run(); + calc_type expectation = 0.; + + StateVector current_state; // avoid costly memory reallocations + if( tmpBuff1_.capacity() >= vec_.size() ) + std::swap(tmpBuff1_, current_state); + current_state.resize(vec_.size()); +#pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i) + current_state[i] = vec_[i]; + + for (auto const& term : td){ + auto const& coefficient = term.second; + apply_term(term.first, ids, {}); + calc_type delta = 0.; + #pragma omp parallel for reduction(+:delta) schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i){ + auto const a1 = std::real(current_state[i]); + auto const b1 = -std::imag(current_state[i]); + auto const a2 = std::real(vec_[i]); + auto const b2 = std::imag(vec_[i]); + delta += a1 * a2 - b1 * b2; + // reset vec_ + vec_[i] = current_state[i]; + } + expectation += coefficient * delta; + } + std::swap(current_state, tmpBuff1_); + return expectation; + } + + void apply_qubit_operator(ComplexTermsDict const& td, std::vector const& ids){ 
+ run(); + StateVector new_state, current_state; // avoid costly memory reallocations + if( tmpBuff1_.capacity() >= vec_.size() ) + std::swap(tmpBuff1_, new_state); + if( tmpBuff2_.capacity() >= vec_.size() ) + std::swap(tmpBuff2_, current_state); + new_state.resize(vec_.size()); + current_state.resize(vec_.size()); +#pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i){ + new_state[i] = 0; + current_state[i] = vec_[i]; + } + for (auto const& term : td){ + auto const& coefficient = term.second; + apply_term(term.first, ids, {}); + #pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i){ + new_state[i] += coefficient * vec_[i]; + vec_[i] = current_state[i]; + } + } + std::swap(vec_, new_state); + std::swap(tmpBuff1_, new_state); + std::swap(tmpBuff2_, current_state); + } + + calc_type get_probability(std::vector const& bit_string, + std::vector const& ids){ + run(); + if (!check_ids(ids)) + throw(std::runtime_error("get_probability(): Unknown qubit id. Please make sure you have called eng.flush().")); + std::size_t mask = 0, bit_str = 0; + for (unsigned i = 0; i < ids.size(); ++i){ + mask |= 1UL << map_[ids[i]]; + bit_str |= (bit_string[i]?1UL:0UL) << map_[ids[i]]; + } + calc_type probability = 0.; + #pragma omp parallel for reduction(+:probability) schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i) + if ((i & mask) == bit_str) + probability += std::norm(vec_[i]); + return probability; + } + + complex_type const& get_amplitude(std::vector const& bit_string, + std::vector const& ids){ + run(); + std::size_t chk = 0; + std::size_t index = 0; + for (unsigned i = 0; i < ids.size(); ++i){ + if (map_.count(ids[i]) == 0) + break; + chk |= 1UL << map_[ids[i]]; + index |= (bit_string[i]?1UL:0UL) << map_[ids[i]]; + } + if (chk + 1 != vec_.size()) + throw(std::runtime_error("The second argument to get_amplitude() must be a permutation of all allocated qubits. 
Please make sure you have called eng.flush().")); + return vec_[index]; + } + + void emulate_time_evolution(TermsDict const& tdict, calc_type const& time, + std::vector const& ids, + std::vector const& ctrl){ + run(); + complex_type I(0., 1.); + calc_type tr = 0., op_nrm = 0.; + TermsDict td; + for (unsigned i = 0; i < tdict.size(); ++i){ + if (tdict[i].first.size() == 0) + tr += tdict[i].second; + else{ + td.push_back(tdict[i]); + op_nrm += std::abs(tdict[i].second); + } + } + unsigned s = std::abs(time) * op_nrm + 1.; + complex_type correction = std::exp(-time * I * tr / (double)s); + auto output_state = vec_; + auto ctrlmask = get_control_mask(ctrl); + for (unsigned i = 0; i < s; ++i){ + calc_type nrm_change = 1.; + for (unsigned k = 0; nrm_change > 1.e-12; ++k){ + auto coeff = (-time * I) / double(s * (k + 1)); + auto current_state = vec_; + auto update = StateVector(vec_.size(), 0.); + for (auto const& tup : td){ + apply_term(tup.first, ids, {}); + #pragma omp parallel for schedule(static) + for (omp::idx_t j = 0; j < vec_.size(); ++j){ + update[j] += vec_[j] * tup.second; + vec_[j] = current_state[j]; + } + } + nrm_change = 0.; + #pragma omp parallel for reduction(+:nrm_change) schedule(static) + for (omp::idx_t j = 0; j < vec_.size(); ++j){ + update[j] *= coeff; + vec_[j] = update[j]; + if ((j & ctrlmask) == ctrlmask){ + output_state[j] += update[j]; + nrm_change += std::norm(update[j]); + } + } + nrm_change = std::sqrt(nrm_change); + } + #pragma omp parallel for schedule(static) + for (omp::idx_t j = 0; j < vec_.size(); ++j){ + if ((j & ctrlmask) == ctrlmask) + output_state[j] *= correction; + vec_[j] = output_state[j]; + } + } + } + + void set_wavefunction(StateVector const& wavefunction, std::vector const& ordering){ + run(); + // make sure there are 2^n amplitudes for n qubits + assert(wavefunction.size() == (1UL << ordering.size())); + // check that all qubits have been allocated previously + if (map_.size() != ordering.size() || !check_ids(ordering)) + 
throw(std::runtime_error("set_wavefunction(): Invalid mapping provided. Please make sure all qubits have been allocated previously (call eng.flush()).")); + + // set mapping and wavefunction + for (unsigned i = 0; i < ordering.size(); ++i) + map_[ordering[i]] = i; + #pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < wavefunction.size(); ++i) + vec_[i] = wavefunction[i]; + } + + void collapse_wavefunction(std::vector const& ids, std::vector const& values){ + run(); + if (ids.size() != values.size()) + throw(std::length_error("collapse_wavefunction(): ids and values size mismatch")); + if (!check_ids(ids)) + throw(std::runtime_error("collapse_wavefunction(): Unknown qubit id(s) provided. Try calling eng.flush() before invoking this function.")); + std::size_t mask = 0, val = 0; + for (unsigned i = 0; i < ids.size(); ++i){ + mask |= (1UL << map_[ids[i]]); + val |= ((values[i]?1UL:0UL) << map_[ids[i]]); + } + // set bad entries to 0 and compute probability of outcome to renormalize + calc_type N = 0.; + #pragma omp parallel for reduction(+:N) schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i){ + if ((i & mask) == val) + N += std::norm(vec_[i]); + } + if (N < 1.e-12) + throw(std::runtime_error("collapse_wavefunction(): Invalid collapse! 
Probability is ~0.")); + // re-normalize (if possible) + N = 1./std::sqrt(N); + #pragma omp parallel for schedule(static) + for (omp::idx_t i = 0; i < vec_.size(); ++i){ + if ((i & mask) != val) + vec_[i] = 0.; + else + vec_[i] *= N; + } + } + + void run(){ + if (fused_gates_.size() < 1) + return; + + Fusion::Matrix m; + Fusion::IndexVector ids, ctrls; + + fused_gates_.perform_fusion(m, ids, ctrls); + + for (auto& id : ids) + id = map_[id]; + + auto ctrlmask = get_control_mask(ctrls); + + switch (ids.size()){ + case 1: + #pragma omp parallel + kernel(vec_, ids[0], m, ctrlmask); + break; + case 2: + #pragma omp parallel + kernel(vec_, ids[1], ids[0], m, ctrlmask); + break; + case 3: + #pragma omp parallel + kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask); + break; + case 4: + #pragma omp parallel + kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask); + break; + case 5: + #pragma omp parallel + kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask); + break; + default: + // Use embedded generator to generate larger gates in runtime + kernelgen(vec_, ids, m, ctrlmask); + break; + } + + fused_gates_ = Fusion(); + } + + std::tuple cheat(){ + run(); + return make_tuple(map_, std::ref(vec_)); + } + + ~Simulator(){ + } + +private: + void apply_term(Term const& term, std::vector const& ids, + std::vector const& ctrl){ + complex_type I(0., 1.); + Fusion::Matrix X = {{0., 1.}, {1., 0.}}; + Fusion::Matrix Y = {{0., -I}, {I, 0.}}; + Fusion::Matrix Z = {{1., 0.}, {0., -1.}}; + std::vector gates = {X, Y, Z}; + for (auto const& local_op : term){ + unsigned id = ids[local_op.first]; + apply_controlled_gate(gates[local_op.second - 'X'], {id}, ctrl); + } + run(); + } + std::size_t get_control_mask(std::vector const& ctrls){ + std::size_t ctrlmask = 0; + for (auto c : ctrls) + ctrlmask |= (1UL << map_[c]); + return ctrlmask; + } + + bool check_ids(std::vector const& ids){ + for (auto id : ids) + if (!map_.count(id)) + return false; + return true; + } + + unsigned 
N_; // #qubits + StateVector vec_; + Map map_; + Fusion fused_gates_; + unsigned fusion_qubits_min_, fusion_qubits_max_; + RndEngine rnd_eng_; + std::function rng_; + + // large array buffers to avoid costly reallocations + static StateVector tmpBuff1_, tmpBuff2_; +}; + +Simulator::StateVector Simulator::tmpBuff1_; +Simulator::StateVector Simulator::tmpBuff2_; + +#endif diff --git a/ccsrc/include/cppsim/tempfile.h b/ccsrc/include/cppsim/tempfile.h new file mode 100644 index 00000000..b6422463 --- /dev/null +++ b/ccsrc/include/cppsim/tempfile.h @@ -0,0 +1,39 @@ +#ifndef TEMP_FILE_H +#define TEMP_FILE_H + +#include + +#if __has_include() +# include +#endif + +#if __has_include() && __cpp_lib_filesystem >= 201703 +#include +#include +namespace ec_ns = std; +namespace fs = std::filesystem; +#else +#include +#include +namespace ec_ns = boost::system; +namespace fs = boost::filesystem; +#endif + + +class TempFile +{ + ec_ns::error_code ec; + + std::string filename; + +public : + + const std::string& string(std::error_code& ec) const; + + TempFile(const std::string& mask); + + ~TempFile(); +}; + +#endif // TEMP_FILE_H + diff --git a/ccsrc/lib/CMakeLists.txt b/ccsrc/lib/CMakeLists.txt index 9e230d40..fb0e8eed 100644 --- a/ccsrc/lib/CMakeLists.txt +++ b/ccsrc/lib/CMakeLists.txt @@ -18,6 +18,7 @@ add_subdirectory(mq_base) add_subdirectory(simulator) +add_subdirectory(cppsim) # ============================================================================== diff --git a/ccsrc/lib/cppsim/CMakeLists.txt b/ccsrc/lib/cppsim/CMakeLists.txt new file mode 100644 index 00000000..7dcbcaa6 --- /dev/null +++ b/ccsrc/lib/cppsim/CMakeLists.txt @@ -0,0 +1,305 @@ +cmake_minimum_required(VERSION 3.20 FATAL_ERROR) + +project( + cppsim + VERSION 1.0.0 + LANGUAGES C CXX) + +# Minimum required Python version (used both in this file and in the installed CMake configuration) +set(CPPSIM_PYTHON_VERSION_MIN 3.7.0) + +option(BUILD_TESTING "Build the test suite?" 
OFF) + +if(NOT DEFINED ITERATOR_DEBUG_VALUE) + set(ITERATOR_DEBUG_VALUE 0) +endif() +option(MSVC_ITERATOR_DEBUG "Define _ITERATOR_DEBUG_LEVEL (defaults to 0, can be set using ITERATOR_DEBUG_VALUE)" OFF) + +# ============================================================================== + +include(GNUInstallDirs) + +set(CPPSIM_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}") +set(CPPSIM_INSTALL_SBINDIR "${CMAKE_INSTALL_SBINDIR}") +set(CPPSIM_INSTALL_SYSCONFDIR "${CMAKE_INSTALL_SYSCONFDIR}") +set(CPPSIM_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/cppsim") +set(CPPSIM_INSTALL_DATADIR "${CMAKE_INSTALL_DATADIR}/cppsim") +set(CPPSIM_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}/cppsim") +set(CPPSIM_INSTALL_DOCDIR "${CMAKE_INSTALL_DATADIR}/doc/cppsim") +set(CPPSIM_INSTALL_CMAKEDIR "${CPPSIM_INSTALL_DATADIR}/cmake") +set(CPPSIM_INSTALL_3RDPARTYDIR "${CPPSIM_INSTALL_LIBDIR}/third_party") + +# ============================================================================== + +# TODO +include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/compiler_has_std_filesystem.cmake") + +# ============================================================================== + +include(FetchContent) +set(FETCHCONTENT_QUIET OFF) + +if(BUILD_TESTING) + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 25cc5777a17820a6339204a3552aa1dd5e428669) + FetchContent_Declare( + hipThrust + GIT_REPOSITORY https://github.com/dmikushin/Thrust.git + GIT_TAG 9a12c1259805ed0a3a5fe9bdeb098a872deb936b) + FetchContent_MakeAvailable(hipThrust googletest) + + if(NOT DEFINED Eigen3_DIR) + FetchContent_Declare( + Eigen3 + GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git + GIT_TAG 3147391d946bb4b6c68edd901f2add6ac1f31f8c) + FetchContent_MakeAvailable(googletest Eigen3) + else() + find_package(Eigen3 CONFIG REQUIRED) + endif() +endif() + +# ------------------------------------------------------------------------------ +# In the case of digestpp, the repository only consists of a 
list of files/directories that we need both our build +# process to find as well as future users of an installed copy of cppsim. +# +# We therefore provide a target that we use for our current build and also install its files into a sub-directory of the +# installation prefix. We then define the digestpp::digestpp target directly within cppsimConfig.cmake + +FetchContent_Declare( + digestpp + GIT_REPOSITORY https://github.com/kerukuro/digestpp.git + GIT_TAG 4ec4106677e652a90716ad929d657a622089ef16) +FetchContent_MakeAvailable(digestpp) + +add_library(digestpp::digestpp INTERFACE IMPORTED) +target_include_directories(digestpp::digestpp INTERFACE $) + +install( + DIRECTORY ${digestpp_SOURCE_DIR} + DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}/third_party + PATTERN docs EXCLUDE + PATTERN .git EXCLUDE) + +# ------------------------------------------------------------------------------ + +find_package(res_embed QUIET CONFIG) +if(NOT res_embed_FOUND) + FetchContent_Declare( + res_embed + GIT_REPOSITORY https://github.com/dmikushin/res_embed.git + GIT_TAG 93b5711070086dea53c3b535018ff34e68479242) + FetchContent_MakeAvailable(res_embed) +else() + message(STATUS "Found res_embed at ${res_embed_DIR}") +endif() +include(ResEmbed) + +# ------------------------------------------------------------------------------ + +find_package(Python ${CPPSIM_PYTHON_VERSION_MIN} COMPONENTS Interpreter Development.Module) + +# ------------------------------------------------------------------------------ + +if(APPLE) + find_program(BREW_CMD brew PATHS /usr/local/bin) + if(BREW_CMD) + # Homebrew installs libomp in ${LIBOMP_PREFIX}/lib and the headers in ${LIBOMP_PREFIX}/include + execute_process(COMMAND ${BREW_CMD} --prefix libomp OUTPUT_VARIABLE LIBOMP_PREFIX) + string(STRIP ${LIBOMP_PREFIX} LIBOMP_PREFIX) + + find_library( + LIBOMP_LIB omp gomp libomp + HINTS ${LIBOMP_PREFIX} + PATH_SUFFIXES lib + NO_DEFAULT_PATH) + if(LIBOMP_LIB) + get_filename_component(LIBOMP_DIR ${LIBOMP_LIB} DIRECTORY) + 
list(APPEND CMAKE_LIBRARY_PATH ${LIBOMP_DIR}) + endif() + + find_path( + LIBOMP_INC omp.h + HINTS ${LIBOMP_PREFIX} + PATH_SUFFIXES include + NO_DEFAULT_PATH) + if(LIBOMP_INC) + list(APPEND CMAKE_INCLUDE_PATH ${LIBOMP_INC}) + else() + message(WARNING "Unable to locate omp.h, the code might not compile properly.\n" + "You might want to try installing the `libomp` Homebrew formula: brew install libomp") + endif() + else() + set(_macports_install_prefix "/opt/local") + # MacPorts install libomp in ${_macports_install_prefix}/lib/libomp and the headers in + # ${_macports_install_prefix}/include/libomp + find_library( + LIBOMP_LIB omp gomp libomp + PATHS "${_macports_install_prefix}/lib" + PATH_SUFFIXES libomp + NO_DEFAULT_PATH) + if(LIBOMP_LIB) + get_filename_component(LIBOMP_DIR ${LIBOMP_LIB} DIRECTORY) + list(APPEND CMAKE_LIBRARY_PATH ${LIBOMP_DIR}) + endif() + + find_path( + LIBOMP_INC omp.h + PATHS "${_macports_install_prefix}/include" + PATH_SUFFIXES libomp + NO_DEFAULT_PATH) + if(LIBOMP_INC) + list(APPEND CMAKE_INCLUDE_PATH ${LIBOMP_INC}) + else() + message(WARNING "Unable to locate omp.h, the code might not compile properly.\n" + "You might want to try installing the `libomp` MacPorts port: sudo port install libomp") + endif() + endif() +endif() + +find_package(OpenMP REQUIRED) + +# ------------------------------------------------------------------------------ + +# TODO Shouldn't we take dlfcn-win32 to third_party, +# or replace it with cross-platform dynalo? 
+if(MSVC + OR MINGW + OR MSYS) + find_package(dlfcn-win32 CONFIG QUIET) + if(NOT dlfcn-win32_FOUND) + set(_build_shared_libs ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS OFF) + FetchContent_Declare( + dlfcn-win32 + GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git + GIT_TAG 9d0ef119d9fcb9139f831adc224857b791c81140) + FetchContent_MakeAvailable(dlfcn-win32) + set(BUILD_SHARED_LIBS ${_build_shared_libs}) + target_include_directories(dl PUBLIC $) + list(APPEND CMAKE_DL_LIBS dl) + else() + message(STATUS "Found dlfcn-win32 at ${dlfcn-win32_DIR}") + list(APPEND CMAKE_DL_LIBS dlfcn-win32::dl) + endif() +endif() + +# ------------------------------------------------------------------------------ + +set(filesystem_LIBS) +if(NOT CPPSIM_HAS_STD_FILESYSTEM) + find_package(Boost REQUIRED COMPONENTS filesystem) + set(filesystem_LIBS Boost::filesystem) +endif() + +# ============================================================================== + +include(CMakePackageConfigHelpers) + +set(_namespace cppsim::) + +configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/../../../cmake/cppsimConfig.cmake.in + ${PROJECT_BINARY_DIR}/cppsimConfig.cmake INSTALL_DESTINATION ${CPPSIM_INSTALL_CMAKEDIR}) + +write_basic_package_version_file(${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake COMPATIBILITY SameMajorVersion) + +install(FILES ${PROJECT_BINARY_DIR}/cppsimConfig.cmake ${PROJECT_BINARY_DIR}/cppsimConfigVersion.cmake + DESTINATION ${CPPSIM_INSTALL_CMAKEDIR}) + +install(DIRECTORY ${PROJECT_SOURCE_DIR}/../../../cmake/commands DESTINATION ${CPPSIM_INSTALL_CMAKEDIR}) +install(FILES ${PROJECT_SOURCE_DIR}/../../../cmake/compiler_has_std_filesystem.cmake DESTINATION ${CPPSIM_INSTALL_CMAKEDIR}) + +file(GLOB _headers ${CMAKE_CURRENT_LIST_DIR}/include/*.h ${CMAKE_CURRENT_LIST_DIR}/include/*.hpp LIST_DIRECTORIES FALSE) +install(FILES ${_headers} DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}) +install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/cpu 
${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/gpu + ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/intrin ${CMAKE_CURRENT_LIST_DIR}/../../include/cppsim/nointrin + DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR}) + +# ============================================================================== + +add_library(kernelgen STATIC "kernelgen.cpp" "compiler.cpp" "tempfile.cpp") +target_compile_features(kernelgen PUBLIC cxx_std_17) +set_property(TARGET kernelgen PROPERTY POSITION_INDEPENDENT_CODE ON) +target_include_directories(kernelgen PUBLIC $ + $) + +if(MSVC AND MSVC_ITERATOR_DEBUG) + target_compile_definitions(kernelgen PUBLIC _ITERATOR_DEBUG=${ITERATOR_DEBUG_VALUE}) +endif() + +res_embed( + TARGET kernelgen + NAME "nointrin" + PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../include/cppsim/nointrin/kernelgen.py" + KEYWORD) + +target_link_libraries(kernelgen PUBLIC digestpp::digestpp pybind11::module OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS} + ${filesystem_LIBS}) + +pybind11_add_module(${PROJECT_NAME} MODULE "_${PROJECT_NAME}.cpp") +target_link_libraries(${PROJECT_NAME} PRIVATE kernelgen) + +# ============================================================================== + +install( + TARGETS kernelgen kernelgen_nointrin + EXPORT cppsimTargets + PRIVATE_HEADER DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR} + PUBLIC_HEADER DESTINATION ${CPPSIM_INSTALL_INCLUDEDIR} + ARCHIVE DESTINATION ${CPPSIM_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CPPSIM_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CPPSIM_INSTALL_BINDIR}) + +install( + EXPORT cppsimTargets + NAMESPACE ${_namespace} + DESTINATION ${CPPSIM_INSTALL_CMAKEDIR}) + +# ============================================================================== + +if(BUILD_TESTING) + include(kernelgen) + + add_executable(test_nointrin "test/test_nointrin.cpp") + target_compile_features(test_nointrin PRIVATE cxx_std_17) + target_link_libraries(test_nointrin PRIVATE gtest kernelgen Eigen3::Eigen) + + kernelgen( + TARGET test_nointrin + NQUBITS 1 + VARIANT nointrin + 
+ COMBINATIONS) + kernelgen( + TARGET test_nointrin + NQUBITS 2 + VARIANT nointrin + COMBINATIONS) + kernelgen( + TARGET test_nointrin + NQUBITS 3 + VARIANT nointrin + COMBINATIONS) + kernelgen( + TARGET test_nointrin + NQUBITS 4 + VARIANT nointrin + COMBINATIONS) + kernelgen( + TARGET test_nointrin + NQUBITS 5 + VARIANT nointrin + COMBINATIONS) + + add_executable(test_popcount "test/test_popcount.cpp") + target_compile_features(test_popcount PRIVATE cxx_std_17) + target_link_libraries(test_popcount PRIVATE Eigen3::Eigen gtest) + + add_executable(test_combinations "test/test_combinations.cpp") + target_compile_features(test_combinations PRIVATE cxx_std_17) + target_link_libraries(test_combinations PRIVATE gtest Eigen3::Eigen OpenMP::OpenMP_CXX) + + add_executable(benchmark "benchmark/benchmark.cpp") + target_link_libraries(benchmark PRIVATE gtest kernelgen) +endif() diff --git a/ccsrc/lib/cppsim/README.md b/ccsrc/lib/cppsim/README.md new file mode 100644 index 00000000..158131f0 --- /dev/null +++ b/ccsrc/lib/cppsim/README.md @@ -0,0 +1,60 @@ +# _cppsim + +This is a standalone simulator backend for the ProjectQ framework, extended to +generate Haener-Steiger quantum kernels for an arbitrary number of qubits. + +## Description + +The original code provides handwritten kernels of up to 5 qubits in the following form: + +```c++ +template +inline void kernel_core(V &psi, std::size_t I, std::size_t d0, M const& m) +{ + std::complex v[2]; + v[0] = psi[I]; + v[1] = psi[I + d0]; + + psi[I] = (add(mul(v[0], m[0][0]), mul(v[1], m[0][1]))); + psi[I + d0] = (add(mul(v[0], m[1][0]), mul(v[1], m[1][1]))); + +} + +// bit indices id[.] are given from high to low (e.g.
control first for CNOT) +template +void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask) +{ + std::size_t n = psi.size(); + std::size_t d0 = 1UL << id0; + std::size_t dsorted[] = {d0 }; + std::sort(dsorted, dsorted + 1, std::greater()); + + if (ctrlmask == 0){ + #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ + kernel_core(psi, i0 + i1, d0, m); + } + } + } + else{ + #pragma omp for collapse(LOOP_COLLAPSE1) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; ++i1){ + if (((i0 + i1)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1, d0, m); + } + } + } +} +``` + +The proposed generator reproduces the handwritten kernels and extends support to an unlimited number of qubits. + +## Testing + +``` +./test_nointrin +./benchmark +``` + diff --git a/ccsrc/lib/cppsim/_cppsim.cpp b/ccsrc/lib/cppsim/_cppsim.cpp new file mode 100644 index 00000000..8f17e0c0 --- /dev/null +++ b/ccsrc/lib/cppsim/_cppsim.cpp @@ -0,0 +1,66 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cppsim_omp.hpp" +#include "simulator.hpp" + +namespace py = pybind11; + +using c_type = std::complex; +using ArrayType = std::vector>; +using MatrixType = std::vector; +using QuRegs = std::vector>; + +template +void emulate_math_wrapper(Simulator &sim, py::function const& pyfunc, QR const& qr, std::vector const& ctrls){ + auto f = [&](std::vector& x) { + pybind11::gil_scoped_acquire acquire; + x = pyfunc(x).cast>(); + }; + pybind11::gil_scoped_release release; + sim.emulate_math(f, qr, ctrls); +} + +PYBIND11_MODULE(_cppsim, m) +{ + py::class_(m, "Simulator") + .def(py::init()) + .def("allocate_qubit", &Simulator::allocate_qubit) + .def("deallocate_qubit", &Simulator::deallocate_qubit) + .def("get_classical_value", &Simulator::get_classical_value) + .def("is_classical", &Simulator::is_classical) + .def("measure_qubits", &Simulator::measure_qubits_return) + .def("apply_controlled_gate", &Simulator::apply_controlled_gate) + .def("emulate_math", &emulate_math_wrapper) + .def("emulate_math_addConstant", &Simulator::emulate_math_addConstant) + .def("emulate_math_addConstantModN", &Simulator::emulate_math_addConstantModN) + .def("emulate_math_multiplyByConstantModN", &Simulator::emulate_math_multiplyByConstantModN) + .def("get_expectation_value", &Simulator::get_expectation_value) + .def("apply_qubit_operator", &Simulator::apply_qubit_operator) + .def("emulate_time_evolution", &Simulator::emulate_time_evolution) + .def("get_probability", &Simulator::get_probability) + .def("get_amplitude", &Simulator::get_amplitude) + .def("set_wavefunction", &Simulator::set_wavefunction) + .def("collapse_wavefunction", &Simulator::collapse_wavefunction) + .def("run", &Simulator::run) + .def("cheat", &Simulator::cheat) + ; +} diff --git a/ccsrc/lib/cppsim/benchmark/benchmark.cpp b/ccsrc/lib/cppsim/benchmark/benchmark.cpp new file mode 100644 index 00000000..0e8683a2 --- /dev/null +++ 
b/ccsrc/lib/cppsim/benchmark/benchmark.cpp @@ -0,0 +1,50 @@ +#include "kernelgen.hpp" + +#include "gtest/gtest.h" + +#include +#include +#include + +template +bool benchmark() +{ + std::array ids; + size_t n = 1; + for (int i = 0; i < nqubits; i++) + { + ids[i] = i; + n += 1UL << i; + } + + std::default_random_engine dre; + std::uniform_int_distribution uid(-1000, 1000); + + // Generate m matrix as integers. + std::array, 1UL << nqubits> m; + for (int j = 0; j < m.size(); j++) + for (int i = 0; i < m.size(); i++) + m[j][i] = uid(dre); + + // Generate psi matrix as integers. + std::vector psi(n); + for (int i = 0; i < psi.size(); i++) + psi[i] = uid(dre); + + // Generate control mask. + std::size_t ctrlmask = 0; // uid(dre); + + kernelgen(psi, ids, m, ctrlmask); +} + +TEST(nointrin, kernel6) +{ + benchmark<6>(); +} + +int main(int argc, char* argv[]) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/ccsrc/lib/cppsim/compiler.cpp b/ccsrc/lib/cppsim/compiler.cpp new file mode 100644 index 00000000..c4e85257 --- /dev/null +++ b/ccsrc/lib/cppsim/compiler.cpp @@ -0,0 +1,214 @@ +#include "compiler.h" +#include "tempfile.h" +#include "digestpp.hpp" + +#include // for array +#include // for system +#include // for ofstream +#include // for istreambuf_iterator, operator== +#include // for map +#include // for stringstream +#include // for error_code +#include // for make_pair, pair + +#include // for dlerror, dlopen, dlsym + +Compiler::Compiler() { } + +class Signature +{ + int nqubits; + std::string source; + + std::string hash_; + +public : + + const std::string& hash() const { return hash_; } + + Signature( + int nqubits_, + const std::string& source_) : + + nqubits(nqubits_), + source(source_) + + { + std::stringstream ss; + // TODO Absorb individually. 
+ ss << digestpp::sha512().absorb(reinterpret_cast(this), sizeof(Signature)).hexdigest(); + hash_ = ss.str(); + } +}; + +static std::map database; + +void* Compiler::codegen( + int nqubits, + const std::string& source, + std::string& errmsg) +{ + std::error_code ec; + + // 0) Check whether the kernel has been already compiled for the + // requested dimensions. + auto hash = Signature(nqubits, source).hash(); + auto existing = database.find(hash); + if (existing != database.end()) + return existing->second; + + // 1) Create source file. + const char* filenameTemplate = "kernelgenXXXXXX"; + std::string filename = TempFile(filenameTemplate).string(ec); + if (ec) return nullptr; + { + std::stringstream ss; + + // Add the content of engine include file. + ss << "#include "; + ss << std::endl; + ss << "#include "; + ss << std::endl; + ss << "#include "; + ss << std::endl; + ss << "#include "; + ss << std::endl; + ss << "template "; + ss << std::endl; + ss << "inline T add(T a, T b){ return a + b; }"; + ss << std::endl; + ss << "template "; + ss << "inline T mul(T a, T b){ return a * b; }"; + ss << std::endl; + ss << std::endl; + + // Add source code. + ss << source; + + // Adding entrypoints. 
+ for (auto type : std::array { + std::make_pair("std::complex", "double"), + std::make_pair("int", "int") + }) + { + ss << std::endl; + ss << "extern \"C\" void kernel_"; + ss << type.second; + ss << "("; + ss << type.second; + ss << "* psi, const unsigned* ids, const "; + ss << type.second; + ss << "* m, std::size_t ctrlmask)"; + ss << std::endl; + ss << "{"; + ss << std::endl; + ss << "\tkernel(reinterpret_cast<"; + ss << type.first; +#if 0 + ss << "*>(psi), "; + for (int i = 0; i < nqubits; i++) + { + ss << "ids["; + ss << nqubits - i - 1; + ss << "], "; + } + ss << "reinterpret_cast(psi), reinterpret_cast(m), ctrlmask);"; + ss << std::endl; + ss << "}"; + ss << std::endl; + } + + const std::string& source = ss.str(); +#if 0 + std::cout << source << std::endl; +#endif + std::ofstream file(filename); + file << source; + } + + // 2) Compile source file into a shared library + std::string binname = TempFile(filenameTemplate).string(ec); + if (ec) return nullptr; + { + std::string errlog = TempFile(filenameTemplate).string(ec); + if (ec) return nullptr; + + std::stringstream ss; +#ifdef __APPLE__ + ss << "g++-11"; +#else + ss << "g++"; +#endif +#if 0 + ss << " -g -O0 -std=c++17 -x c++ "; +#else + ss << " -g -O3 -ffast-math -fopenmp -std=c++17 -x c++ "; +#endif + ss << filename; + ss << " -I/usr/include/eigen3 -fPIC -shared -o"; + ss << binname; + ss << " >"; + ss << errlog; + + const std::string command = ss.str(); + system(command.c_str()); + + // If the output log file is not empty, read its contents, + // and put into errmsg. + if (fs::exists(errlog)) + { + std::ifstream file(errlog); + errmsg = std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + } + } + + // 3) Load shared library and bind its entry point + void* handle = nullptr; + { + // If the output file does not exist, return NULL. + if (!fs::exists(binname)) + return nullptr; + + // If the shared library could not be loaded, return NULL. 
+ void* lib = dlopen(binname.c_str(), RTLD_NOW); + if (!lib) + { + std::stringstream ss; + ss << "Could not open \"" << binname << + "\" as a shared library: \"" << dlerror() << "\"" << std::endl; + errmsg += ss.str(); + return nullptr; + } + + // If the symbol does not exist, return NULL. + handle = dlsym(lib, "kernel_int"); + if (!handle) + { + std::stringstream ss; + ss << "Could not bind symbol \"model_solve\" in shared library \"" << + binname << "\": \"" << dlerror() << "\"" << std::endl; + errmsg += ss.str(); + return nullptr; + } + } + + // 4) Cache the compiled eskew kernel in our internal database, + // so that we could use it again without recompilation, should the same + // dimensions be requested. + database[hash] = handle; + + return handle; +} + +Compiler& get_compiler() +{ + static Compiler compiler; + return compiler; +} + diff --git a/ccsrc/lib/cppsim/kernelgen.cpp b/ccsrc/lib/cppsim/kernelgen.cpp new file mode 100644 index 00000000..44a9c9e7 --- /dev/null +++ b/ccsrc/lib/cppsim/kernelgen.cpp @@ -0,0 +1,67 @@ +#include "kernelgen.hpp" +#include "res_embed.h" + +#include +#include +#include +#include + +namespace py = pybind11; +using namespace pybind11::literals; + +std::string KernelGen::generate(int nqubits, unsigned* ids) +{ + // Use embedded Python interpreter to run the script + // and get the resulting string of source code. + // We intentionally keep the generator in Python, in order + // to let the people to customize it more easily. + py::scoped_interpreter guard {}; + try + { + py::dict globals = py::globals(); + // Assign the __name__, otherwise it is set to "__main__" by default. 
+ globals["__name__"] = "kernelgen"; + py::eval(nointrin, globals, globals); + if (ids) + { + std::vector vids(nqubits); + vids.assign(ids, ids + nqubits); + auto source = globals["kernelgen"](nqubits, "ids"_a = vids).cast(); + return source; + } + else + { + auto source = globals["kernelgen"](nqubits).cast(); + return source; + } + } + catch (pybind11::error_already_set e) + { + std::cerr << "Unable to invoke the Python script: " << e.what() << std::endl; + exit(-1); + } +} + +namespace res { + +namespace embed { + +namespace init { + +void nointrin(); + +} // namespace init + +} // namespace embed + +} // namespace res + +KernelGen::KernelGen() +{ + // Extract the Python script. + res::embed::init::nointrin(); + size_t size = 0; + auto source = res::embed::get("nointrin", &size); + nointrin = std::string(source, size); +} + diff --git a/ccsrc/lib/cppsim/tempfile.cpp b/ccsrc/lib/cppsim/tempfile.cpp new file mode 100644 index 00000000..dc6729af --- /dev/null +++ b/ccsrc/lib/cppsim/tempfile.cpp @@ -0,0 +1,102 @@ +#include "tempfile.h" + +#if __has_include() +#define HAS_UNISTD_H 1 +#else +#define HAS_UNISTD_H 0 +#endif + +#include // for errno +#include // for std::remove +#include // for getenv, atoi +#include // for begin, end + +#if HAS_UNISTD_H +#include // for close +#include // for std::vector +#else +#include // for std::generate_n +#include // for uint32_t +#include // for ofstream +#include // for std::string_view +#include // for std::default_random_engine, etc. 
+#include <utility>      // for std::move
+#endif // HAS_UNISTD_H
+
+#if !HAS_UNISTD_H
+namespace {
+    // Fallback for platforms without mkstemp(): overwrite the last 6 characters of
+    // template_str with random alphanumerics until a file of that name can be opened.
+    // Returns the path on success; otherwise returns an empty string and sets ec.
+    // NOTE(review): std::ofstream also opens an already existing file, so unlike
+    // mkstemp() this does not guarantee that the returned name was previously unused.
+    std::string create_temp_file(std::string template_str, ec_ns::error_code& ec) {
+        static constexpr std::string_view chars =
+            "0123456789"
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            "abcdefghijklmnopqrstuvwxyz";
+        thread_local std::random_device rd;
+        thread_local auto rng = std::default_random_engine(rd());
+        // Both bounds are inclusive, so the largest valid index is chars.size() - 1.
+        thread_local auto dist = std::uniform_int_distribution<unsigned>{0U, static_cast<unsigned>(chars.size() - 1)};
+
+        auto dir = fs::temp_directory_path(ec);
+        if (ec) return {};
+
+        for(unsigned int i(0); i < (62 * 62 * 62) /* same as mkstemp */; ++i) {
+            std::generate_n(std::end(template_str)-6, 6, []() { return chars[dist(rng)]; });
+
+            std::ofstream fout(template_str);
+            if(fout) return template_str;
+        }
+
+        ec = ec_ns::error_code(errno, ec_ns::generic_category());
+        return {};
+    }
+}
+#endif // !HAS_UNISTD_H
+
+// =============================================================================
+// Copy the error code recorded by the constructor into ec_ and return the file path.
+const std::string& TempFile::string(std::error_code& ec_) const
+{
+    ec_ = ec;
+    return filename;
+}
+
+// Create a uniquely named empty file in the temp directory; failures are recorded in ec.
+TempFile::TempFile(const std::string& mask_)
+{
+    auto dir = fs::temp_directory_path(ec);
+    if (ec) return;
+
+#if HAS_UNISTD_H
+    std::string mask = (dir / mask_).string();
+    // mkstemp() rewrites its argument in place, so pass a mutable NUL-terminated copy.
+    std::vector<char> vfilename(mask.c_str(), mask.c_str() + mask.size() + 1);
+    auto fd = mkstemp(vfilename.data());
+    if (fd == -1)
+    {
+        ec = ec_ns::error_code(errno, ec_ns::generic_category());
+        return;
+    }
+    close(fd);
+    // Stop at the terminator: copying the whole buffer would embed a trailing '\0'.
+    filename = std::string(vfilename.data());
+#else
+    auto fname = create_temp_file((dir / mask_).string(), ec);
+    if (ec) return;
+    filename = std::move(fname);
+#endif // HAS_UNISTD_H
+}
+
+// Delete the file unless the KEEP_CACHE environment variable parses to a non-zero integer.
+TempFile::~TempFile()
+{
+    const char* keepCacheValue = std::getenv("KEEP_CACHE");
+    const bool keepCache = keepCacheValue && std::atoi(keepCacheValue);
+    // Skip the removal when no file name was ever assigned (construction failed early).
+    if (!keepCache && !filename.empty())
+        std::remove(filename.c_str());
+}
+
diff --git
a/ccsrc/lib/cppsim/test/test_combinations.cpp b/ccsrc/lib/cppsim/test/test_combinations.cpp new file mode 100644 index 00000000..6f1afe93 --- /dev/null +++ b/ccsrc/lib/cppsim/test/test_combinations.cpp @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_VECTORIZE +#include +#include "combinations.h" + +#define add(a, b) (a + b) +#define mul(a, b) (a * b) + +#define M(j, i) (m[j * 32 + i]) + +template +inline void kernel_core(T* psi, std::size_t I, std::size_t d0, std::size_t d1, std::size_t d2, std::size_t d3, std::size_t d4, const T* m) +{ + const std::array v = { + psi[I], + psi[I + d0], + psi[I + d1], + psi[I + d0 + d1], + psi[I + d2], + psi[I + d0 + d2], + psi[I + d1 + d2], + psi[I + d0 + d1 + d2], + psi[I + d3], + psi[I + d0 + d3], + psi[I + d1 + d3], + psi[I + d0 + d1 + d3], + psi[I + d2 + d3], + psi[I + d0 + d2 + d3], + psi[I + d1 + d2 + d3], + psi[I + d0 + d1 + d2 + d3], + psi[I + d4], + psi[I + d0 + d4], + psi[I + d1 + d4], + psi[I + d0 + d1 + d4], + psi[I + d2 + d4], + psi[I + d0 + d2 + d4], + psi[I + d1 + d2 + d4], + psi[I + d0 + d1 + d2 + d4], + psi[I + d3 + d4], + psi[I + d0 + d3 + d4], + psi[I + d1 + d3 + d4], + psi[I + d0 + d1 + d3 + d4], + psi[I + d2 + d3 + d4], + psi[I + d0 + d2 + d3 + d4], + psi[I + d1 + d2 + d3 + d4], + psi[I + d0 + d1 + d2 + d3 + d4], + }; + + const auto result = Eigen::Map>(m) * Eigen::Map>(v.data()); + + psi[I] = result[0]; + psi[I + d0] = result[1]; + psi[I + d1] = result[2]; + psi[I + d0 + d1] = result[3]; + psi[I + d2] = result[4]; + psi[I + d0 + d2] = result[5]; + psi[I + d1 + d2] = result[6]; + psi[I + d0 + d1 + d2] = result[7]; + psi[I + d3] = result[8]; + psi[I + d0 + d3] = result[9]; + psi[I + d1 + d3] = result[10]; + psi[I + d0 + d1 + d3] = result[11]; + psi[I + d2 + d3] = result[12]; + psi[I + d0 + d2 + d3] = result[13]; + psi[I + d1 + d2 + d3] = result[14]; + psi[I + d0 + d1 + d2 + d3] = result[15]; + psi[I + d4] = result[16]; + psi[I + d0 + d4] = 
result[17]; + psi[I + d1 + d4] = result[18]; + psi[I + d0 + d1 + d4] = result[19]; + psi[I + d2 + d4] = result[20]; + psi[I + d0 + d2 + d4] = result[21]; + psi[I + d1 + d2 + d4] = result[22]; + psi[I + d0 + d1 + d2 + d4] = result[23]; + psi[I + d3 + d4] = result[24]; + psi[I + d0 + d3 + d4] = result[25]; + psi[I + d1 + d3 + d4] = result[26]; + psi[I + d0 + d1 + d3 + d4] = result[27]; + psi[I + d2 + d3 + d4] = result[28]; + psi[I + d0 + d2 + d3 + d4] = result[29]; + psi[I + d1 + d2 + d3 + d4] = result[30]; + psi[I + d0 + d1 + d2 + d3 + d4] = result[31]; + +} + +#undef add +#undef mul +#undef M + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel(T* psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, const T* m, std::size_t ctrlmask) +{ + std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4; + std::size_t n = 1 + d0 + d1 + d2 + d3 + d4; + std::size_t dsorted[] = { d4, d3, d2, d1, d0 }; + std::sort(dsorted, dsorted + 5, std::greater()); + + if (ctrlmask == 0){ + #pragma omp for collapse(6) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]) + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]) + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]) + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]) + for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5){ + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m); + } + } + } + else{ + #pragma omp for collapse(6) schedule(static) + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]){ + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]) + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]) + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]) + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]) + for (std::size_t i5 = 0; i5 
< dsorted[4]; ++i5){ + if (((i0 + i1 + i2 + i3 + i4 + i5)&ctrlmask) == ctrlmask) + kernel_core(psi, i0 + i1 + i2 + i3 + i4 + i5, d0, d1, d2, d3, d4, m); + } + } + } +} + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel_combinations(T* psi, const T* m, std::size_t ctrlmask) +{ + constexpr std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4; + constexpr std::size_t n = 1 + d0 + d1 + d2 + d3 + d4; + std::size_t dsorted[] = { d4, d3, d2, d1, d0 }; + std::sort(dsorted, dsorted + 5, std::greater()); + + if (ctrlmask == 0){ + Combinations::iterate([=](auto... i) + { + kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m); + }); + } + else{ + Combinations::iterate([=](auto... i) + { + if (((i + ...) & ctrlmask) == ctrlmask) + kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m); + }); + } +} + +#include "schedule.h" + +// bit indices id[.] are given from high to low (e.g. control first for CNOT) +template +void kernel_combinations_partitioned(T* psi, const T* m, std::size_t ctrlmask) +{ + constexpr std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4; + constexpr std::size_t n = 1 + d0 + d1 + d2 + d3 + d4; + std::size_t dsorted[] = { d4, d3, d2, d1, d0 }; + std::sort(dsorted, dsorted + 5, std::greater()); + + if (ctrlmask == 0){ + // Here we do the "planning" of execution, not the execution itself. + // We do already specify though an interation loop body, in order + // for the backend to make the resources allocation. + auto backend = Schedule::template schedule( + [=](uint32_t& count_worker, auto... i) + { + kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m); + }); + + printf("Using %s backend with %u workers\n", + backend.getName(), backend.getWorkersCount()); + + // Finally, execute the iterations. 
+ uint32_t* ptr = nullptr; + Schedule::iterate(ptr, backend); + } + else{ + // Here we do the "planning" of execution, not the execution itself. + // We do already specify though an interation loop body, in order + // for the backend to make the resources allocation. + auto backend = Schedule::template schedule( + [=](uint32_t& count_worker, auto... i) + { + if (((i + ...) & ctrlmask) == ctrlmask) + kernel_core(psi, (i + ...), d0, d1, d2, d3, d4, m); + }); + + printf("Using %s backend with %u workers\n", + backend.getName(), backend.getWorkersCount()); + + // Finally, execute the iterations. + uint32_t* ptr = nullptr; + Schedule::iterate(ptr, backend); + } +} + +#include +#include +#include + +#include "gtest/gtest.h" + +template +bool compare(Kernels kernels, V& psi1) +{ + std::default_random_engine dre; + dre.seed(0); + std::uniform_int_distribution uid(-1000, 1000); + + // Generate m matrix as integers. + std::array, 1UL << nqubits> m; + for (int j = 0; j < m.size(); j++) + for (int i = 0; i < m.size(); i++) + m[j][i] = uid(dre); + + // Generate psi matrix as integers. + for (int i = 0; i < psi1.size(); i++) + psi1[i] = uid(dre); + auto psi2 = psi1; + auto psi3 = psi1; + + // Generate control mask. + std::size_t ctrlmask = 0; // uid(dre); + + // Compare kernel against generated kernel. 
+ kernels(psi1, psi2, psi3, m, ctrlmask); + auto diff2 = std::mismatch(psi1.begin(), psi1.end(), psi2.begin()); + auto diff3 = std::mismatch(psi1.begin(), psi1.end(), psi3.begin()); + if ((diff2.first == psi1.end()) && (diff3.first == psi1.end())) + return true; + + if (diff2.first != psi1.end()) + std::cout << "Mismatch in psi2 at " << std::distance(psi1.begin(), diff2.first) << + " : " << *(diff2.first) << " != " << *(diff2.second) << std::endl; + if (diff3.first != psi1.end()) + std::cout << "Mismatch in psi3 at " << std::distance(psi1.begin(), diff3.first) << + " : " << *(diff3.first) << " != " << *(diff3.second) << std::endl; + + return false; +} + +TEST(nointrin, kernel5) +{ + constexpr unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4; + size_t n = 1; + n += 1UL << id0; + n += 1UL << id1; + n += 1UL << id2; + n += 1UL << id3; + n += 1UL << id4; + std::vector psi(n); + ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask) + { + kernel(&psi1[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask); + kernel_combinations(&psi2[0], &m[0][0], ctrlmask); + kernel_combinations_partitioned(&psi3[0], &m[0][0], ctrlmask); + }, + psi)); +} + +int main(int argc, char* argv[]) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/ccsrc/lib/cppsim/test/test_nointrin.cpp b/ccsrc/lib/cppsim/test/test_nointrin.cpp new file mode 100644 index 00000000..803455cc --- /dev/null +++ b/ccsrc/lib/cppsim/test/test_nointrin.cpp @@ -0,0 +1,142 @@ +// Ensure hand-written and generated kernels give equal results. 
+ +#define kernel generated_kernel + +#include "generated/nointrin/kernel1.hpp" +#include "generated/nointrin/kernel2.hpp" +#include "generated/nointrin/kernel3.hpp" +#include "generated/nointrin/kernel4.hpp" +#include "generated/nointrin/kernel5.hpp" + +#undef kernel + +#include "nointrin/kernels.hpp" + +#include "kernelgen.hpp" + +#include +#include +#include + +#include "gtest/gtest.h" + +template +bool compare(Kernels kernels, V& psi1) +{ + std::default_random_engine dre; + dre.seed(0); + std::uniform_int_distribution uid(-1000, 1000); + + // Generate m matrix as integers. + std::array, 1UL << nqubits> m; + for (int j = 0; j < m.size(); j++) + for (int i = 0; i < m.size(); i++) + m[j][i] = uid(dre); + + // Generate psi matrix as integers. + for (int i = 0; i < psi1.size(); i++) + psi1[i] = uid(dre); + auto psi2 = psi1; + auto psi3 = psi1; + + // Generate control mask. + std::size_t ctrlmask = 0; // uid(dre); + + // Compare kernel against generated kernel. + kernels(psi1, psi2, psi3, m, ctrlmask); + auto diff2 = std::mismatch(psi1.begin(), psi1.end(), psi2.begin()); + auto diff3 = std::mismatch(psi1.begin(), psi1.end(), psi3.begin()); + if ((diff2.first == psi1.end()) && (diff3.first == psi1.end())) + return true; + + if (diff2.first != psi1.end()) + std::cout << "Mismatch in psi2 at " << std::distance(psi1.begin(), diff2.first) << + " : " << *(diff2.first) << " != " << *(diff2.second) << std::endl; + if (diff3.first != psi1.end()) + std::cout << "Mismatch in psi3 at " << std::distance(psi1.begin(), diff3.first) << + " : " << *(diff3.first) << " != " << *(diff3.second) << std::endl; + + return false; +} + +TEST(nointrin, kernel1) +{ + unsigned id0 = 1; + std::array ids { id0 }; + size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1); + std::vector psi(n); + ASSERT_TRUE(compare<1>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask) + { + kernel(psi1, id0, m, ctrlmask); + generated_kernel(&psi2[0], id0, &m[0][0], ctrlmask); + 
kernelgen(psi3, ids, m , ctrlmask); + }, + psi)); +} + +TEST(nointrin, kernel2) +{ + unsigned id0 = 1, id1 = 3; + std::array ids { id0, id1 }; + size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1); + std::vector psi(n); + ASSERT_TRUE(compare<2>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask) + { + kernel(psi1, id1, id0, m, ctrlmask); + generated_kernel(&psi2[0], id1, id0, &m[0][0], ctrlmask); + kernelgen(psi3, ids, m, ctrlmask); + }, + psi)); +} + +TEST(nointrin, kernel3) +{ + unsigned id0 = 1, id1 = 3, id2 = 5; + std::array ids { id0, id1, id2 }; + size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1); + std::vector psi(n); + ASSERT_TRUE(compare<3>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask) + { + kernel(psi1, id2, id1, id0, m, ctrlmask); + generated_kernel(&psi2[0], id2, id1, id0, &m[0][0], ctrlmask); + kernelgen(psi3, ids, m, ctrlmask); + }, + psi)); +} + +TEST(nointrin, kernel4) +{ + unsigned id0 = 1, id1 = 3, id2 = 5, id3 = 7; + std::array ids { id0, id1, id2, id3 }; + size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1); + std::vector psi(n); + ASSERT_TRUE(compare<4>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask) + { + kernel(psi1, id3, id2, id1, id0, m, ctrlmask); + generated_kernel(&psi2[0], id3, id2, id1, id0, &m[0][0], ctrlmask); + kernelgen(psi3, ids, m, ctrlmask); + }, + psi)); +} + +TEST(nointrin, kernel5) +{ + unsigned id0 = 1, id1 = 3, id2 = 5, id3 = 7, id4 = 9; + std::array ids { id0, id1, id2, id3, id4 }; + size_t n = 1UL << (*std::max_element(std::begin(ids), std::end(ids)) + 1); + std::vector psi(n); + ASSERT_TRUE(compare<5>([&](auto& psi1, auto& psi2, auto& psi3, auto m, auto ctrlmask) + { + kernel(psi1, id4, id3, id2, id1, id0, m, ctrlmask); + generated_kernel(&psi2[0], id4, id3, id2, id1, id0, &m[0][0], ctrlmask); + kernelgen(psi3, ids, m, ctrlmask); + }, + psi)); +} + +int main(int argc, char* argv[]) +{ + 
::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/ccsrc/lib/cppsim/test/test_popcount.cpp b/ccsrc/lib/cppsim/test/test_popcount.cpp new file mode 100644 index 00000000..5688e2e6 --- /dev/null +++ b/ccsrc/lib/cppsim/test/test_popcount.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include "combinations.h" + +#include "gtest/gtest.h" + +size_t popcount_reference(unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0) +{ + std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4; + std::size_t n = 1 + d0 + d1 + d2 + d3 + d4; + std::size_t dsorted[] = { d4, d3, d2, d1, d0 }; + std::sort(dsorted, dsorted + 5, std::greater()); + + size_t popcount = 0; + for (std::size_t i0 = 0; i0 < n; i0 += 2 * dsorted[0]) + for (std::size_t i1 = 0; i1 < dsorted[0]; i1 += 2 * dsorted[1]) + for (std::size_t i2 = 0; i2 < dsorted[1]; i2 += 2 * dsorted[2]) + for (std::size_t i3 = 0; i3 < dsorted[2]; i3 += 2 * dsorted[3]) + for (std::size_t i4 = 0; i4 < dsorted[3]; i4 += 2 * dsorted[4]) + for (std::size_t i5 = 0; i5 < dsorted[4]; ++i5) + popcount++; + + return popcount; +} + +TEST(popcount, kernel5) +{ + constexpr unsigned id0 = 0, id1 = 1, id2 = 2, id3 = 3, id4 = 4; + constexpr std::size_t d0 = 1UL << id0, d1 = 1UL << id1, d2 = 1UL << id2, d3 = 1UL << id3, d4 = 1UL << id4; + constexpr std::size_t n = 1 + d0 + d1 + d2 + d3 + d4; + auto popcount1 = popcount_reference(id4, id3, id2, id1, id0); + auto popcount2 = Combinations::popcount(); + ASSERT_EQ(popcount1, popcount2); +} + +int main(int argc, char* argv[]) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/cmake/commands/check_code_compiles.cmake b/cmake/commands/check_code_compiles.cmake new file mode 100644 index 00000000..f414b28d --- /dev/null +++ b/cmake/commands/check_code_compiles.cmake @@ -0,0 +1,36 @@ +include(CheckCXXSourceCompiles) + +# ~~~ +# Check whether some C++ code compiles +# +# 
check_code_compiles(<cmake_identifier> <var> <lang_standard> <code> [<lang>, ...])
+# ~~~
+function(check_code_compiles cmake_identifier var lang_standard code)
+  if(NOT "${ARGN}" STREQUAL "")
+    set(_lang_list "${ARGN}")
+  else()
+    set(_lang_list CXX)
+    if(_cuda_enabled)
+      list(APPEND _lang_list CUDA)
+    endif()
+  endif()
+
+  if(lang_standard MATCHES "std_([0-9]+)")
+    set(CMAKE_CXX_STANDARD ${CMAKE_MATCH_1})
+  endif()
+  set(CMAKE_CXX_EXTENSIONS OFF)
+
+  check_cxx_source_compiles("${code}" "${cmake_identifier}")
+
+  set(${var}
+      ${${cmake_identifier}}
+      PARENT_SCOPE)
+
+  set(${var} FALSE)
+  if(${cmake_identifier})
+    set(${var} TRUE)
+  endif()
+  set(${var}
+      ${${var}}
+      CACHE INTERNAL "${cmake_identifier}")
+endfunction()
diff --git a/cmake/commands/kernelgen.cmake b/cmake/commands/kernelgen.cmake
new file mode 100644
index 00000000..a20a89e0
--- /dev/null
+++ b/cmake/commands/kernelgen.cmake
@@ -0,0 +1,64 @@
+# ~~~
+# Register a build rule that generates one kernel header and attach it to a target
+#
+# kernelgen([COMBINATIONS]
+#           NQUBITS <nqubits>
+#           VARIANT <variant>
+#           TARGET <target>
+# )
+#
+# COMBINATIONS: pass --combinations=True to the generator script
+# NQUBITS: forwarded to the generator script; also names the output kernel<nqubits>.hpp
+# VARIANT: sub-directory of CPPSIM_INCLUDE_DIR that holds kernelgen.py
+# TARGET: existing target that receives the generated header as a PRIVATE source
+#
+# CPPSIM_INCLUDE_DIR defaults to ${CMAKE_CURRENT_SOURCE_DIR}/include when it is not defined.
+# ~~~
+function(kernelgen)
+  set(options COMBINATIONS)
+  set(oneValueArgs NQUBITS VARIANT TARGET)
+  set(multiValueArgs) # no multi-value keywords; cleared so a caller's variable cannot leak in
+  cmake_parse_arguments(KERNELGEN "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Fail early with a clear message instead of a confusing error further down.
+  foreach(_arg NQUBITS VARIANT TARGET)
+    if("${KERNELGEN_${_arg}}" STREQUAL "")
+      message(FATAL_ERROR "kernelgen: missing required argument ${_arg}")
+    endif()
+  endforeach()
+  if(NOT TARGET ${KERNELGEN_TARGET})
+    message(FATAL_ERROR "kernelgen: '${KERNELGEN_TARGET}' is not an existing target")
+  endif()
+
+  set(NQUBITS ${KERNELGEN_NQUBITS})
+  set(VARIANT ${KERNELGEN_VARIANT})
+  if(NOT DEFINED CPPSIM_INCLUDE_DIR)
+    set(CPPSIM_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  endif()
+  # The existence check and the custom command below both read KERNELGEN_PY.
+  set(KERNELGEN_PY "${CPPSIM_INCLUDE_DIR}/${VARIANT}/kernelgen.py")
+  set(KERNEL_PATH "${CMAKE_CURRENT_BINARY_DIR}/generated/${VARIANT}/kernel${NQUBITS}.hpp")
+
+  if(NOT EXISTS "${KERNELGEN_PY}")
+    message(FATAL_ERROR "Cannot locate kernelgen Python script: ${KERNELGEN_PY}")
+  endif()
+
+  set(_args)
+  if(KERNELGEN_COMBINATIONS)
+    list(APPEND _args --combinations=True)
+  endif()
+
+  # Call generator.
+  add_custom_command(
+    OUTPUT ${KERNEL_PATH}
+    COMMAND ${Python_EXECUTABLE} ${KERNELGEN_PY} ${NQUBITS} ${KERNEL_PATH} ${_args}
+    COMMENT "Generating kernel for ${NQUBITS} qubits"
+    DEPENDS ${KERNELGEN_PY}
+    VERBATIM)
+  set_source_files_properties("${KERNEL_PATH}" PROPERTIES GENERATED TRUE)
+
+  # Append the generated file to the target sources.
+  target_sources(${KERNELGEN_TARGET} PRIVATE ${KERNEL_PATH})
+  target_include_directories(${KERNELGEN_TARGET} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+  target_compile_features(${KERNELGEN_TARGET} PRIVATE cxx_std_17)
+endfunction()
diff --git a/cmake/compiler_has_std_filesystem.cmake b/cmake/compiler_has_std_filesystem.cmake
new file mode 100644
index 00000000..2bb12b5a
--- /dev/null
+++ b/cmake/compiler_has_std_filesystem.cmake
@@ -0,0 +1,24 @@
+include(check_code_compiles)
+
+# ==============================================================================
+
+check_code_compiles(
+  compiler_has_std_filesystem
+  CPPSIM_HAS_STD_FILESYSTEM
+  cxx_std_17
+  [[
+#ifdef __has_include
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#  endif
+#endif
+int main() {
+#if __cpp_lib_filesystem >= 201703
+  return 0;
+#else
+#error std::filesystem not supported
+#endif
+}
+]])
+
+# ==============================================================================
diff --git a/cmake/cppsimConfig.cmake.in b/cmake/cppsimConfig.cmake.in
new file mode 100644
index 00000000..d108f0b1
--- /dev/null
+++ b/cmake/cppsimConfig.cmake.in
@@ -0,0 +1,55 @@
+@PACKAGE_INIT@
+
+# ------------------------------------------------------------------------------
+
+# NB: PACKAGE_PREFIX_DIR might get overwritten by the find_package() calls below
+set(CPPSIM_PREFIX_DIR "${PACKAGE_PREFIX_DIR}")
+
+if(TARGET cppsim::cppsim)
+  # Protect against double definitions due to previous call or add_subdirectory()
+  return()
+endif()
+
+# ==============================================================================
+
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/commands)
+
+find_package(res_embed
CONFIG REQUIRED) +find_package(OpenMP REQUIRED) +find_package( + Python @CPPSIM_PYTHON_VERSION_MIN@ + COMPONENTS Interpreter Development.Embed + REQUIRED) +find_package(pybind11 CONFIG REQUIRED) + +include(${CMAKE_CURRENT_LIST_DIR}/compiler_has_std_filesystem.cmake) + +if(NOT CPPSIM_HAS_STD_FILESYSTEM) + find_package(Boost REQUIRED COMPONENTS filesystem) +endif() + +list(POP_BACK CMAKE_MODULE_PATH) + +# ------------------------------------------------------------------------------ + +if(NOT TARGET digestpp::digestpp) + add_library(digestpp::digestpp INTERFACE IMPORTED) + target_include_directories(digestpp::digestpp + INTERFACE "${CPPSIM_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@/third_party/digestpp-src") +endif() + +# ============================================================================== + +set(CPPSIM_INCLUDE_DIR + "${CPPSIM_PREFIX_DIR}/@CPPSIM_INSTALL_INCLUDEDIR@" + CACHE FILEPATH "Path to include files for cppsim" FORCE) + +include(${CMAKE_CURRENT_LIST_DIR}/cppsimTargets.cmake) + +include(${CMAKE_CURRENT_LIST_DIR}/commands/kernelgen.cmake) + +# ============================================================================== + +unset(CPPSIM_PREFIX_DIR) + +# ============================================================================== diff --git a/cmake/options.cmake b/cmake/options.cmake index 9d2bdaf8..a89cee63 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -28,7 +28,6 @@ option(ENABLE_PROJECTQ "Enable ProjectQ support" ON) option(ENABLE_GITEE "Use Gitee instead of GitHub for checking out third-party dependencies" OFF) option(ENABLE_CXX_EXPERIMENTAL "Enable the new (experimental) C++ backend" OFF) option(ENABLE_DOCUMENTATION "Enable building of the documentation using Doxygen" OFF) -option(ENABLE_CPPSIM "Enable the use of cppsim for generating simulator kernels" OFF) option(ENABLE_LOGGING "Enable the use of logging in C++" OFF) cmake_dependent_option(ENABLE_LOGGING_TRACE_LEVEL "If logging is enabled, log everything down to the TRACE level" 
OFF "ENABLE_LOGGING" OFF) diff --git a/docs/source/cmake_reference.rst b/docs/source/cmake_reference.rst index 44c70cc8..b7ceeb9f 100644 --- a/docs/source/cmake_reference.rst +++ b/docs/source/cmake_reference.rst @@ -52,8 +52,6 @@ Descriptions +-------------------------------------+-----------------------------------------------------------------------+ | ``ENABLE_CMAKE_DEBUG`` | Enable verbose output to debug CMake issues | +-------------------------------------+-----------------------------------------------------------------------+ -| ``ENABLE_CPPSIM`` | Enable the use of cppsim for generating simulator kernels | -+-------------------------------------+-----------------------------------------------------------------------+ | ``ENABLE_CUDA`` | Enable the use of CUDA code | +-------------------------------------+-----------------------------------------------------------------------+ | ``ENABLE_CXX_EXPERIMENTAL`` | Enable the building of the (new) experimental C++ backend | @@ -139,8 +137,6 @@ Default values +-------------------------------------+------------------------------+ | ``ENABLE_CMAKE_DEBUG`` | OFF | +-------------------------------------+------------------------------+ -| ``ENABLE_CPPSIM`` | OFF | -+-------------------------------------+------------------------------+ | ``ENABLE_CUDA`` | OFF | +-------------------------------------+------------------------------+ | ``ENABLE_CXX_EXPERIMENTAL`` | OFF | diff --git a/scripts/build/default_values.bat b/scripts/build/default_values.bat index 6ff21dc2..a32063fb 100644 --- a/scripts/build/default_values.bat +++ b/scripts/build/default_values.bat @@ -27,7 +27,6 @@ if NOT DEFINED do_update_venv set do_update_venv=0 if NOT DEFINED dry_run set dry_run=0 if NOT DEFINED enable_analyzer set enable_analyzer=0 if NOT DEFINED enable_ccache set enable_ccache=0 -if NOT DEFINED enable_cppsim set enable_cppsim=0 if NOT DEFINED enable_cxx set enable_cxx=0 if NOT DEFINED enable_gitee set enable_gitee=0 if NOT DEFINED 
enable_gpu set enable_gpu=0 diff --git a/scripts/build/default_values.conf b/scripts/build/default_values.conf index e27d3efc..700dca0f 100644 --- a/scripts/build/default_values.conf +++ b/scripts/build/default_values.conf @@ -22,7 +22,6 @@ do_update_venv = false enable_analyzer = false enable_gitee = false enable_ccache = false -enable_cppsim = false enable_cxx = false enable_gpu = false enable_projectq = true diff --git a/scripts/build/parse_common_args.ps1 b/scripts/build/parse_common_args.ps1 index c0367dd5..e9bffa37 100644 --- a/scripts/build/parse_common_args.ps1 +++ b/scripts/build/parse_common_args.ps1 @@ -76,7 +76,6 @@ function Help-Message() { Write-Output ' -Config [dir] Path to INI configuration file with default values for the parameters' Write-Output (" Defaults to: {0}" -f $config_file) Write-Output ' NB: command line arguments always take precedence over configuration file values' - Write-Output ' -CppSim (experimental) Enable the use of cppsim to generate simulation kernels' Write-Output ' -Cxx (experimental) Enable MindQuantum C++ support' Write-Output ' -Debug Build in debug mode' Write-Output ' -DebugCMake Enable debugging mode for CMake configuration step' @@ -189,10 +188,6 @@ if (([bool]$CleanVenv)) { Set-Value 'do_clean_venv' } -if (([bool]$CppSim)) { - Set-Value 'enable_cppsim' -} - if (([bool]$Cxx)) { Set-Value 'enable_cxx' } diff --git a/scripts/build/parse_common_args.sh b/scripts/build/parse_common_args.sh index 905763c0..ab096eca 100755 --- a/scripts/build/parse_common_args.sh +++ b/scripts/build/parse_common_args.sh @@ -103,7 +103,6 @@ help_message() { echo ' --config=[dir] Path to INI configuration file with default values for the parameters' echo " Defaults to: $config_file" echo ' NB: command line arguments always take precedence over configuration file values' - echo ' --cppsim (experimental) Enable the use of cppsim to generate simulation kernels' echo ' --cxx (experimental) Enable MindQuantum C++ support' echo ' --debug Build in 
debug mode' echo ' --debug-cmake Enable debugging mode for CMake configuration step' @@ -228,9 +227,6 @@ while getopts "${getopts_args}" OPT; do cuda-arch ) needs_arg; set_var cuda_arch "$(echo "$OPTARG" | tr ',' ';')" ;; - cppsim ) no_arg; - set_var enable_cppsim $flag_value - ;; cxx ) no_arg; set_var enable_cxx $flag_value ;; diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 587f9ab0..f2744839 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -69,10 +69,7 @@ if(ENABLE_CXX_EXPERIMENTAL) include(${CMAKE_CURRENT_LIST_DIR}/tweedledum/tweedledum.cmake) # cppsim dependencies - if(ENABLE_CPPSIM) - include(${CMAKE_CURRENT_LIST_DIR}/res_embed/res_embed.cmake) - include(${CMAKE_CURRENT_LIST_DIR}/cppsim/cppsim.cmake) - endif() + include(${CMAKE_CURRENT_LIST_DIR}/res_embed/res_embed.cmake) endif() # ============================================================================== diff --git a/third_party/cppsim/.github/workflows/ci.yml b/third_party/cppsim/.github/workflows/ci.yml new file mode 100644 index 00000000..afd41506 --- /dev/null +++ b/third_party/cppsim/.github/workflows/ci.yml @@ -0,0 +1,495 @@ +--- + +name: CI + +on: + workflow_dispatch: + push: + branches-ignore: + - 'test' + +jobs: + standard: + strategy: + fail-fast: true + matrix: + runs-on: [ubuntu-latest] + python: + - 3.7 + - 3.8 + - 3.9 + - '3.10' + name: "Python ${{ matrix.python }} • ${{ matrix.runs-on }} • x64" + runs-on: ${{ matrix.runs-on }} + + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Setup Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + + - name: Prepare env + run: | + python3 -m pip install -U wheel build cmake pybind11 + sudo apt-get update && sudo apt-get install -y libboost-filesystem-dev 
--no-install-recommends + + - name: Install res_embed + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" + cmake --build res_embed/build --target install -j2 + + - name: Configure + run: cmake -S. -Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)" + + - name: Build & install + run: cmake --build build --target all -j2 -v + + + # ============================================================================ + + macos: + runs-on: macos-11 + name: "MacOS 11 • ${{ matrix.xcode }} • x64" + env: + DEVELOPER_DIR: "/Applications/${{ matrix.xcode }}.app/Contents/Developer" + CC: /usr/bin/clang + CXX: /usr/bin/clang++ + strategy: + fail-fast: false + matrix: + xcode: + - "Xcode_11.7" # Not available on macos-12 + - "Xcode_12.4" # Not available on macos-12 + - "Xcode_12.5.1" # Not available on macos-12 + # - "Xcode_13.0" # Not available on macos-12 + - "Xcode_13.1" + - "Xcode_13.2.1" + # - "Xcode_13.3.1" # macos-12 only + # - "Xcode_13.4.1" # macos-12 only + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Setup Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Install NASM + uses: ilammy/setup-nasm@v1 + + - name: Prepare env + run: | + python3 -m pip install -U wheel build cmake pybind11 + brew install libomp boost + + - name: Install res_embed + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" + cmake --build res_embed/build --target install -j2 + + - name: Configure + run: cmake -S. 
-Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)" + + - name: Build & install + run: cmake --build build --target all -j2 + + # ============================================================================ + + gcc: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + gcc: + - 7 # C++17 earliest version + - 8 + - 9 + - 10 + - 11 + - 12 + name: "GCC ${{ matrix.gcc }} • x64" + container: "gcc:${{ matrix.gcc }}" + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Prepare env + run: > + apt-get update && apt-get install -y python3-dev python3-pip python3-setuptools python3-wheel python3-venv + libboost-filesystem-dev --no-install-recommends + + - name: Prepare env + run: | + python3 -m pip install -U wheel build + python3 -m pip install -U pybind11 + python3 -m pip install cmake --only-binary :all: + + - name: Install res_embed + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" + cmake --build res_embed/build --target install -j2 + + - name: Configure + run: cmake -S. 
-Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR=$(python3 -m pybind11 --cmakedir) + + - name: Build & install + run: cmake --build build --target all -j2 + + # ============================================================================ + + clang: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + clang: + - 7 + - 8 + - 9 + - 10 # first version for full C++17 support (with patches) + - 11 + - 12 + - 13 + - 14 + env: + CC: clang + CXX: clang++ + + name: "Clang ${{ matrix.clang }} • x64" + container: "silkeh/clang:${{ matrix.clang }}" + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Prepare env + run: > + apt-get update && apt-get install -y python3-dev python3-pip python3-setuptools python3-wheel python3-venv + libboost-filesystem-dev --no-install-recommends + + - name: Prepare env + run: python3 -m pip install -U wheel build cmake pybind11 + + - name: Install res_embed + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" + cmake --build res_embed/build --target install -j2 + + - name: Configure + run: cmake -S. 
-Bbuild -DCMAKE_PREFIX_PATH=$PWD/install -Dpybind11_DIR=$(python3 -m pybind11 --cmakedir) + + - name: Build & install + run: cmake --build build --target all -j2 + + # ============================================================================ + + msvc: + runs-on: windows-latest + name: "MSVC • x64" + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Enable Developer Command Prompt + uses: ilammy/msvc-dev-cmd@v1.12.0 + + - name: Install NASM + uses: ilammy/setup-nasm@v1 + + - name: Prepare env + run: python3 -m pip install -U wheel build cmake pybind11 + + - name: Install res_embed + env: + CC: cl + CXX: cl + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" -DMSVC_ITERATOR_DEBUG=ON + cmake --build res_embed/build --target install -j2 --config Release + + - name: Configure + env: + CC: cl + CXX: cl + run: > + cmake -S. 
-Bbuild -DCMAKE_PREFIX_PATH="${{ github.workspace }}/install" + -DMSVC_ITERATOR_DEBUG=ON + -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)" + + - name: Build & install + run: cmake --build build --config Release -j2 + + # ============================================================================ + + mingw64: + runs-on: windows-2022 + strategy: + fail-fast: false + name: "MINGW64 • x64" + env: + BOOST_VERSION: 1.78.0 + BOOST_PATH: ${{github.workspace}}/boost/boost + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Install NASM + uses: ilammy/setup-nasm@v1 + + - name: Set up MinGW64 + uses: egor-tensin/setup-mingw@v2 + id: mingw64-setup + with: + platform: x64 + + - name: Download and install Boost + uses: MarkusJx/install-boost@v2.4.0 + if: steps.cache-boost.outputs.cache-hit != 'true' + id: install-boost + with: + boost_version: ${{ env.BOOST_VERSION }} + platform_version: 2022 + toolset: mingw + + - name: Prepare env + run: python3 -m pip install -U wheel build cmake pybind11 + + - name: Install res_embed + env: + CC: ${{ steps.mingw64-setup.outputs.gcc }} + CXX: ${{ steps.mingw64-setup.outputs.gxx }} + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" -DMSVC_ITERATOR_DEBUG=ON -G "MinGW Makefiles" + cmake --build res_embed/build --target install -j2 + + - name: Configure + env: + BOOST_ROOT: ${{ env.BOOST_PATH }} + CC: ${{ steps.mingw64-setup.outputs.gcc }} + CXX: ${{ steps.mingw64-setup.outputs.gxx }} + run: > + cmake -S. 
-Bbuild -DCMAKE_PREFIX_PATH="${{ github.workspace }}/install" + -DMSVC_ITERATOR_DEBUG=ON + -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)" + -G "MinGW Makefiles" + + - name: Build & install + run: cmake --build build --target all -j2 + + # ============================================================================ + + msys2: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + include: + - msystem: MINGW64 + installdeps: >- + git + patch + make + mingw-w64-x86_64-toolchain + mingw-w64-x86_64-cmake + mingw-w64-x86_64-boost + mingw-w64-x86_64-dlfcn + mingw-w64-x86_64-nasm + mingw-w64-x86_64-python + mingw-w64-x86_64-python-pip + cmake_generator: -G "MSYS Makefiles" + CC: gcc + CXX: g++ + + - msystem: CLANG64 + installdeps: >- + git + patch + make + mingw-w64-clang-x86_64-toolchain + mingw-w64-clang-x86_64-libssp + mingw-w64-clang-x86_64-cmake + mingw-w64-clang-x86_64-boost + mingw-w64-clang-x86_64-dlfcn + mingw-w64-clang-x86_64-nasm + mingw-w64-clang-x86_64-python + mingw-w64-clang-x86_64-python-pip + cmake_generator: -G "MSYS Makefiles" + CC: clang + CXX: clang++ + + - msystem: UCRT64 + installdeps: >- + git + patch + make + mingw-w64-ucrt-x86_64-toolchain + mingw-w64-ucrt-x86_64-cmake + mingw-w64-ucrt-x86_64-boost + mingw-w64-ucrt-x86_64-dlfcn + mingw-w64-ucrt-x86_64-nasm + mingw-w64-ucrt-x86_64-python + mingw-w64-ucrt-x86_64-python-pip + cmake_generator: -G "MSYS Makefiles" + CC: gcc + CXX: g++ + + name: "MSYS2 ${{ matrix.msystem }} • x64" + steps: + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Setup MSYS + uses: msys2/setup-msys2@v2 + with: + install: ${{ matrix.installdeps }} + msystem: ${{ matrix.msystem }} + path-type: strict + update: false + + - name: Prepare env + shell: msys2 {0} + env: + CC: ${{ matrix.CC }} + CXX: ${{ matrix.CXX }} + run: python3 -m pip install -U 
wheel build pybind11 + + - name: Install res_embed + shell: msys2 {0} + env: + CC: ${{ matrix.CC }} + CXX: ${{ matrix.CXX }} + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install" \ + ${{ matrix.cmake_generator }} + cmake --build res_embed/build --target install -j2 -v + + - name: Configure + shell: msys2 {0} + env: + CC: ${{ matrix.CC }} + CXX: ${{ matrix.CXX }} + run: > + cmake -S. -Bbuild -DCMAKE_PREFIX_PATH="${{ github.workspace }}/install" + -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)" + ${{ matrix.cmake_generator }} + + - name: Build & install + run: cmake --build build --target all -j2 -v + + # ============================================================================ + + cygwin: + runs-on: windows-latest + name: "Cygwin • x64" + env: + CYGWIN_NOWINPATH: 1 # only have cygwin's executables on PATH + CHERE_INVOKING: 1 # prevent profile script to change directory + CCACHE_VERSION: 4.6.1 + steps: + - run: git config --global core.autocrlf input + + - name: Checkout cppsim repository + uses: actions/checkout@v3 + + - name: Checkout res_embed repository + uses: actions/checkout@v3 + with: + repository: dmikushin/res_embed + ref: master + path: res_embed + + - name: Setup Cygwin + uses: cygwin/cygwin-install-action@v2 + with: + packages: >- + cygwin cygwin-devel + autoconf automake coreutils m4 make cmake patch git + gawk sed libtool gettext wget curl grep + gzip bzip2 tar xz nasm + binutils gcc-core gcc-g++ libboost-devel + python3 python3-devel python3-pip python3-virtualenv + + - name: Prepare env + env: + PATH: C:\cygwin\bin + shell: bash --login -eo pipefail -o igncr {0} + run: | + python3 -m pip install -U wheel build + python3 -m pip install -U pybind11 + + - name: Install res_embed + env: + PATH: C:\cygwin\bin + shell: bash --login -eo pipefail -o igncr {0} + run: | + cmake -S res_embed -B res_embed/build -DCMAKE_INSTALL_PREFIX="$PWD/install" + cmake --build res_embed/build --target install -j2 -v + 
+ - name: Configure + env: + PATH: C:\cygwin\bin + shell: bash --login -eo pipefail -o igncr {0} + run: > + cmake -S. -Bbuild -DCMAKE_PREFIX_PATH="$PWD/install" + -Dpybind11_DIR="$(python3 -m pybind11 --cmakedir)" + + - name: Build & install + env: + PATH: C:\cygwin\bin + shell: bash --login -eo pipefail -o igncr {0} + run: cmake --build build --target all -j2 -v + + - name: Restore PATH for git + run: Add-Content -Path $env:GITHUB_PATH -Value "C:\Program Files\Git\bin" diff --git a/third_party/cppsim/cmake/commands/a b/third_party/cppsim/cmake/commands/a new file mode 100644 index 00000000..e69de29b diff --git a/third_party/cppsim/cppsim.cmake b/third_party/cppsim/cppsim.cmake deleted file mode 100644 index ab9c67bf..00000000 --- a/third_party/cppsim/cppsim.cmake +++ /dev/null @@ -1,70 +0,0 @@ -# ============================================================================== -# -# Copyright 2022 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# ============================================================================== - -set(VER 1.0.0) -set(GIT_TAG "f0c786a99833e73c28378450582d3c425095adb3") - -if(ENABLE_GITEE) - set(GIT_REPOSITORY "https://gitee.com/dmikushin/cppsim.git") -else() - set(GIT_REPOSITORY "https://github.com/dmikushin/cppsim.git") -endif() - -set(CMAKE_OPTION - -DBUILD_TESTING=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DEigen3_DIR=${Eigen3_DIR} - -DPYTHON_EXECUTABLE=${Python_EXECUTABLE} - -DPython3_EXECUTABLE=${Python_EXECUTABLE} - -DPython_EXECUTABLE=${Python_EXECUTABLE} - -Ddigestpp_DIR=${digestpp_DIR} - -Dpybind11_DIR=${pybind11_DIR} - -Dres_embed_DIR=${res_embed_DIR}) - -if(APPLE) - foreach( - _var - OpenMP_C_FLAGS - OpenMP_C_INCLUDE_DIR - OpenMP_C_LIB_NAMES - OpenMP_CXX_FLAGS - OpenMP_CXX_INCLUDE_DIR - OpenMP_CXX_LIB_NAMES - OpenMP_gomp_LIBRARY - OpenMP_libomp_LIBRARY - OpenMP_pthread_LIBRARY) - if(NOT "${${_var}}" STREQUAL "") - list(APPEND CMAKE_OPTION -D${_var}=${${_var}}) - endif() - endforeach() -endif() - -if(NOT _Boost_SYSTEM) - # Boost was locally built, make sure we use that one - list(APPEND CMAKE_OPTION -DBOOST_ROOT=${Boost_DIRPATH} -DBoost_NO_SYSTEM_PATHS:BOOL=ON) -endif() - -mindquantum_add_pkg( - cppsim - VER ${VER} - GIT_REPOSITORY ${GIT_REPOSITORY} - GIT_TAG ${GIT_TAG} - MD5 "xxxx" # NB: would be required if local server is enabled for downloads - CMAKE_OPTION ${CMAKE_OPTION} - FORCE_LOCAL_PKG - TARGET_ALIAS mindquantum::cppsim cppsim::kernelgen)