From 1e2a2cb394fc4bb09aaf82a720226b5398cf3d92 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 14 Oct 2025 15:32:53 +0200
Subject: [PATCH 01/19] CMake support

---
 .gitignore                                    |    1 -
 CMakeLists.txt                                |  390 +++
 cmake/FindCLime.cmake                         |   27 +
 cmake/FindLemon.cmake                         |   25 +
 cmake/git_hash.h.in                           |    6 +
 .../tmlqcd_config_internal.h.in               |  130 +-
 cmake_includes.txt                            |  425 ++++
 io/Makefile.in                                |  135 -
 src/bin/CMakeLists.txt                        |   19 +
 LapH_ev.c => src/bin/LapH_ev.c                |    0
 benchmark.c => src/bin/benchmark.c            |    0
 .../bin/check_locallity.c                     |    4 +-
 deriv_mg_tune.c => src/bin/deriv_mg_tune.c    |    0
 gen_sources.c => src/bin/gen_sources.c        |    0
 hmc_tm.c => src/bin/hmc_tm.c                  |    0
 hopping_test.c => src/bin/hopping_test.c      |    0
 invert.c => src/bin/invert.c                  |    0
 {util => src/bin}/main_ildg2uk.c              |    0
 .../bin/offline_measurement.c                 |    0
 .../bin/qphix_test_Dslash.c                   |    0
 {test => src/bin}/scalar_prod_r_test.c        |    0
 {test => src/bin}/test_eigenvalues.c          |    0
 test_lemon.c => src/bin/test_lemon.c          |    0
 src/lib/CMakeLists.txt                        |  457 ++++
 .../lib/DDalphaAMG_interface.c                |    0
 .../lib/DDalphaAMG_interface.h                |    0
 Ptilde_nd.c => src/lib/Ptilde_nd.c            |    0
 Ptilde_nd.h => src/lib/Ptilde_nd.h            |    0
 aligned_malloc.c => src/lib/aligned_malloc.c  |    0
 aligned_malloc.h => src/lib/aligned_malloc.h  |    0
 block.c => src/lib/block.c                    |    0
 block.h => src/lib/block.h                    |    0
 boundary.c => src/lib/boundary.c              |    0
 boundary.h => src/lib/boundary.h              |    0
 {buffers => src/lib/buffers}/Makefile.in      |    0
 {buffers => src/lib/buffers}/gauge.c          |    0
 {buffers => src/lib/buffers}/gauge.h          |    0
 {buffers => src/lib/buffers}/gauge.ih         |    0
 .../buffers}/gauge_allocate_gauge_buffers.c   |    0
 .../buffers}/gauge_finalize_gauge_buffers.c   |    0
 .../gauge_free_unused_gauge_buffers.c         |    0
 .../lib/buffers}/gauge_get_gauge_field.c      |    0
 .../buffers}/gauge_get_gauge_field_array.c    |    0
 .../buffers}/gauge_initialize_gauge_buffers.c |    0
 .../lib/buffers}/gauge_return_gauge_field.c   |    0
 .../buffers}/gauge_return_gauge_field_array.c |    0
 {buffers => src/lib/buffers}/utils.h          |    0
 {buffers => src/lib/buffers}/utils.ih         |    0
 .../utils_generic_exchange.blocking.inc       |    0
 .../lib/buffers}/utils_generic_exchange.c     |    0
 .../utils_generic_exchange.nonblocking.inc    |    0
 .../lib/chebyshev_polynomial.c                |    4 +-
 .../lib/chebyshev_polynomial.h                |    0
 .../lib/chebyshev_polynomial_nd.c             |    0
 .../lib/chebyshev_polynomial_nd.h             |    0
 clenshaw_coef.c => src/lib/clenshaw_coef.c    |    0
 clenshaw_coef.h => src/lib/clenshaw_coef.h    |    0
 .../lib/compare_derivative.c                  |    0
 .../lib/compare_derivative.h                  |    0
 {cu => src/lib/cu}/COPYING                    |    0
 {cu => src/lib/cu}/COPYING.LESSER             |    0
 {cu => src/lib/cu}/Makefile.in                |    0
 {cu => src/lib/cu}/check-regressions          |    0
 {cu => src/lib/cu}/cu.c                       |    0
 {cu => src/lib/cu}/cu.h                       |    0
 .../lib/default_input_values.h                |    0
 deriv_Sb.c => src/lib/deriv_Sb.c              |    0
 deriv_Sb.h => src/lib/deriv_Sb.h              |    0
 deriv_Sb_D_psi.c => src/lib/deriv_Sb_D_psi.c  |    0
 deriv_Sb_D_psi.h => src/lib/deriv_Sb_D_psi.h  |    0
 expo.c => src/lib/expo.c                      |    0
 expo.h => src/lib/expo.h                      |    0
 fatal_error.c => src/lib/fatal_error.c        |    0
 fatal_error.h => src/lib/fatal_error.h        |    0
 gamma.c => src/lib/gamma.c                    |    0
 gamma.h => src/lib/gamma.h                    |    0
 geometry_eo.c => src/lib/geometry_eo.c        |    0
 geometry_eo.h => src/lib/geometry_eo.h        |    0
 .../lib/get_rectangle_staples.c               |    0
 .../lib/get_rectangle_staples.h               |    0
 get_staples.c => src/lib/get_staples.c        |    0
 get_staples.h => src/lib/get_staples.h        |    0
 getopt.c => src/lib/getopt.c                  |    0
 getopt.h => src/lib/getopt.h                  |    0
 gettime.c => src/lib/gettime.c                |    0
 gettime.h => src/lib/gettime.h                |    0
 global.h => src/lib/global.h                  |    0
 .../lib/hamiltonian_field.h                   |    0
 {include => src/lib/include}/tmLQCD.h         |    0
 {include => src/lib/include}/tmlqcd_config.h  |    0
 {init => src/lib/init}/Makefile.in            |    0
 {init => src/lib/init}/init.h                 |    0
 {init => src/lib/init}/init_bispinor_field.c  |    0
 {init => src/lib/init}/init_bispinor_field.h  |    0
 .../lib/init}/init_chi_spinor_field.c         |    0
 .../lib/init}/init_chi_spinor_field.h         |    0
 .../lib/init}/init_critical_globals.c         |    0
 .../lib/init}/init_critical_globals.h         |    0
 .../lib/init}/init_dirac_halfspinor.c         |    0
 .../lib/init}/init_dirac_halfspinor.h         |    0
 {init => src/lib/init}/init_gauge_fg.c        |    0
 {init => src/lib/init}/init_gauge_fg.h        |    0
 {init => src/lib/init}/init_gauge_field.c     |    0
 {init => src/lib/init}/init_gauge_field.h     |    0
 {init => src/lib/init}/init_gauge_tmp.c       |    0
 {init => src/lib/init}/init_gauge_tmp.h       |    0
 .../lib/init}/init_geometry_indices.c         |    0
 .../lib/init}/init_geometry_indices.h         |    0
 {init => src/lib/init}/init_global_states.c   |    0
 {init => src/lib/init}/init_global_states.h   |    0
 {init => src/lib/init}/init_moment_field.c    |    0
 {init => src/lib/init}/init_moment_field.h    |    0
 .../lib/init}/init_omp_accumulators.c         |    0
 .../lib/init}/init_omp_accumulators.h         |    0
 {init => src/lib/init}/init_openmp.c          |    0
 {init => src/lib/init}/init_openmp.h          |    0
 {init => src/lib/init}/init_parallel.c        |    0
 {init => src/lib/init}/init_parallel.h        |    0
 {init => src/lib/init}/init_spinor_field.c    |    0
 {init => src/lib/init}/init_spinor_field.h    |    0
 .../lib/init}/init_stout_smear_vars.c         |    0
 .../lib/init}/init_stout_smear_vars.h         |    0
 integrator.c => src/lib/integrator.c          |    0
 integrator.h => src/lib/integrator.h          |    0
 .../lib/invert_clover_eo.c                    |    0
 .../lib/invert_clover_eo.h                    |    0
 .../lib/invert_doublet_eo.c                   |    0
 .../lib/invert_doublet_eo.h                   |    0
 invert_eo.c => src/lib/invert_eo.c            |    1 +
 invert_eo.h => src/lib/invert_eo.h            |    0
 invert_overlap.c => src/lib/invert_overlap.c  |    0
 invert_overlap.h => src/lib/invert_overlap.h  |    0
 {io => src/lib/io}/DML_crc32.c                |    0
 {io => src/lib/io}/deri_write_stdout.c        |    0
 {io => src/lib/io}/deri_write_stdout.h        |    0
 {io => src/lib/io}/dml.c                      |    0
 {io => src/lib/io}/dml.h                      |    0
 {io => src/lib/io}/eospinor.h                 |    0
 {io => src/lib/io}/eospinor.ih                |    0
 {io => src/lib/io}/eospinor_read.c            |    0
 {io => src/lib/io}/eospinor_write.c           |    0
 {io => src/lib/io}/gauge.h                    |    0
 {io => src/lib/io}/gauge.ih                   |    0
 {io => src/lib/io}/gauge_read.c               |    0
 {io => src/lib/io}/gauge_read_binary.c        |    0
 {io => src/lib/io}/gauge_write.c              |    0
 {io => src/lib/io}/gauge_write_binary.c       |    0
 .../lib/io}/gauge_write_luscher_binary.c      |    0
 .../lib/io}/gauge_write_luscher_binary.h      |    0
 {io => src/lib/io}/io_cm.c                    |    0
 {io => src/lib/io}/io_cm.h                    |    0
 {io => src/lib/io}/params.h                   |    0
 {io => src/lib/io}/params.ih                  |    0
 .../lib/io}/params_construct_InverterInfo.c   |    0
 .../lib/io}/params_construct_ildgFormat.c     |    0
 .../io}/params_construct_propagatorFormat.c   |    0
 .../lib/io}/params_construct_sourceFormat.c   |    0
 {io => src/lib/io}/params_construct_xlfInfo.c |    0
 {io => src/lib/io}/selector.h                 |    0
 {io => src/lib/io}/spinor.h                   |    0
 {io => src/lib/io}/spinor.ih                  |    0
 {io => src/lib/io}/spinor_read.c              |    0
 {io => src/lib/io}/spinor_read_binary.c       |    0
 {io => src/lib/io}/spinor_write.c             |    0
 {io => src/lib/io}/spinor_write_binary.c      |    0
 {io => src/lib/io}/spinor_write_info.c        |    0
 .../lib/io}/spinor_write_propagator_format.c  |    0
 .../lib/io}/spinor_write_propagator_type.c    |    0
 .../lib/io}/spinor_write_source_format.c      |    0
 {io => src/lib/io}/spinor_write_stdout.c      |    0
 {io => src/lib/io}/spinor_write_stdout.h      |    0
 {io => src/lib/io}/sw_write_stdout.c          |    0
 {io => src/lib/io}/sw_write_stdout.h          |    0
 {io => src/lib/io}/utils.c                    |    0
 {io => src/lib/io}/utils.h                    |    2 -
 {io => src/lib/io}/utils.ih                   |    4 +-
 .../lib/io}/utils_close_reader_record.c       |    0
 .../lib/io}/utils_close_writer_record.c       |    0
 {io => src/lib/io}/utils_construct_reader.c   |    2 +
 {io => src/lib/io}/utils_construct_writer.c   |    0
 {io => src/lib/io}/utils_destruct_reader.c    |    0
 {io => src/lib/io}/utils_destruct_writer.c    |    0
 {io => src/lib/io}/utils_engineering.c        |    0
 {io => src/lib/io}/utils_kill_with_error.c    |    0
 {io => src/lib/io}/utils_parse_checksum_xml.c |    0
 .../lib/io}/utils_parse_ildgformat_xml.c      |    0
 .../lib/io}/utils_parse_propagator_type.c     |    0
 {io => src/lib/io}/utils_read_message.c       |    0
 {io => src/lib/io}/utils_write_checksum.c     |    0
 .../lib/io}/utils_write_first_message.c       |    0
 {io => src/lib/io}/utils_write_header.c       |    0
 {io => src/lib/io}/utils_write_ildg_format.c  |    0
 .../lib/io}/utils_write_inverter_info.c       |    0
 {io => src/lib/io}/utils_write_message.c      |    0
 {io => src/lib/io}/utils_write_xlf.c          |    0
 {io => src/lib/io}/utils_write_xlf_xml.c      |    0
 .../lib/kahan_summation.h                     |    0
 {linalg => src/lib/linalg}/Makefile.in        |    0
 {linalg => src/lib/linalg}/add.c              |    0
 {linalg => src/lib/linalg}/add.h              |    0
 {linalg => src/lib/linalg}/addto_32.c         |    0
 {linalg => src/lib/linalg}/addto_32.h         |    0
 {linalg => src/lib/linalg}/assign.c           |    0
 {linalg => src/lib/linalg}/assign.h           |    0
 {linalg => src/lib/linalg}/assign_add_mul.c   |    0
 {linalg => src/lib/linalg}/assign_add_mul.h   |    0
 .../lib/linalg}/assign_add_mul_add_mul.c      |    0
 .../lib/linalg}/assign_add_mul_add_mul.h      |    0
 .../lib/linalg}/assign_add_mul_add_mul_r.c    |    0
 .../lib/linalg}/assign_add_mul_add_mul_r.h    |    0
 .../lib/linalg}/assign_add_mul_body.c         |    0
 {linalg => src/lib/linalg}/assign_add_mul_r.c |    0
 {linalg => src/lib/linalg}/assign_add_mul_r.h |    0
 .../lib/linalg}/assign_add_mul_r_32.c         |    5 +-
 .../lib/linalg}/assign_add_mul_r_32.h         |    0
 .../lib/linalg}/assign_add_mul_r_add_mul.c    |    0
 .../lib/linalg}/assign_add_mul_r_add_mul.h    |    0
 {linalg => src/lib/linalg}/assign_diff_mul.c  |    0
 {linalg => src/lib/linalg}/assign_diff_mul.h  |    0
 {linalg => src/lib/linalg}/assign_mul_add.c   |    0
 {linalg => src/lib/linalg}/assign_mul_add.h   |    0
 .../lib/linalg}/assign_mul_add_mul.c          |    0
 .../lib/linalg}/assign_mul_add_mul.h          |    0
 .../assign_mul_add_mul_add_mul_add_mul_r.c    |    0
 .../assign_mul_add_mul_add_mul_add_mul_r.h    |    0
 .../linalg}/assign_mul_add_mul_add_mul_r.c    |    0
 .../linalg}/assign_mul_add_mul_add_mul_r.h    |    0
 .../lib/linalg}/assign_mul_add_mul_r.c        |    0
 .../lib/linalg}/assign_mul_add_mul_r.h        |    0
 .../lib/linalg}/assign_mul_add_mul_r_32.c     |    0
 .../lib/linalg}/assign_mul_add_mul_r_32.h     |    0
 {linalg => src/lib/linalg}/assign_mul_add_r.c |    0
 {linalg => src/lib/linalg}/assign_mul_add_r.h |    0
 .../lib/linalg}/assign_mul_add_r_32.c         |    0
 .../lib/linalg}/assign_mul_add_r_32.h         |    0
 .../lib/linalg}/assign_mul_add_r_and_square.c |    0
 .../lib/linalg}/assign_mul_add_r_and_square.h |    0
 .../linalg}/assign_mul_bra_add_mul_ket_add.c  |    0
 .../linalg}/assign_mul_bra_add_mul_ket_add.h  |    0
 .../assign_mul_bra_add_mul_ket_add_r.c        |    0
 .../assign_mul_bra_add_mul_ket_add_r.h        |    0
 .../lib/linalg}/assign_mul_bra_add_mul_r.c    |    0
 .../lib/linalg}/assign_mul_bra_add_mul_r.h    |    0
 {linalg => src/lib/linalg}/assign_to_32.c     |    0
 {linalg => src/lib/linalg}/assign_to_32.h     |    0
 {linalg => src/lib/linalg}/blas.h             |    0
 {linalg => src/lib/linalg}/comp_decomp.c      |    0
 {linalg => src/lib/linalg}/comp_decomp.h      |    0
 .../lib/linalg}/convert_eo_to_lexic.c         |    0
 .../lib/linalg}/convert_eo_to_lexic.h         |    0
 .../lib/linalg}/convert_even_to_lexic.c       |    0
 .../lib/linalg}/convert_even_to_lexic.h       |    0
 .../lib/linalg}/convert_odd_to_lexic.c        |    0
 .../lib/linalg}/convert_odd_to_lexic.h        |    0
 {linalg => src/lib/linalg}/diff.c             |    0
 {linalg => src/lib/linalg}/diff.h             |    0
 {linalg => src/lib/linalg}/diff_32.c          |    0
 {linalg => src/lib/linalg}/diff_32.h          |    0
 .../lib/linalg}/diff_and_square_norm.c        |    0
 .../lib/linalg}/diff_and_square_norm.h        |    0
 {linalg => src/lib/linalg}/fortran.h          |    0
 {linalg => src/lib/linalg}/lapack.h           |    0
 {linalg => src/lib/linalg}/map_to_blas.h      |    0
 {linalg => src/lib/linalg}/mattimesvec.c      |    0
 {linalg => src/lib/linalg}/mattimesvec.h      |    0
 {linalg => src/lib/linalg}/mul.c              |    0
 {linalg => src/lib/linalg}/mul.h              |    0
 {linalg => src/lib/linalg}/mul_add_mul.c      |    0
 {linalg => src/lib/linalg}/mul_add_mul.h      |    0
 {linalg => src/lib/linalg}/mul_add_mul_r.c    |    0
 {linalg => src/lib/linalg}/mul_add_mul_r.h    |    0
 {linalg => src/lib/linalg}/mul_diff_mul.c     |    0
 {linalg => src/lib/linalg}/mul_diff_mul.h     |    0
 {linalg => src/lib/linalg}/mul_diff_mul_r.c   |    0
 {linalg => src/lib/linalg}/mul_diff_mul_r.h   |    0
 {linalg => src/lib/linalg}/mul_diff_r.c       |    0
 {linalg => src/lib/linalg}/mul_diff_r.h       |    0
 {linalg => src/lib/linalg}/mul_gamma5.c       |    0
 {linalg => src/lib/linalg}/mul_gamma5.h       |    0
 {linalg => src/lib/linalg}/mul_r.c            |    0
 {linalg => src/lib/linalg}/mul_r.h            |    0
 {linalg => src/lib/linalg}/mul_r_32.c         |    0
 {linalg => src/lib/linalg}/mul_r_32.h         |    0
 {linalg => src/lib/linalg}/mul_r_gamma5.c     |    0
 {linalg => src/lib/linalg}/mul_r_gamma5.h     |    0
 {linalg => src/lib/linalg}/print_spinor.c     |    0
 {linalg => src/lib/linalg}/print_spinor.h     |    0
 .../linalg}/print_spinor_similar_components.c |    0
 .../linalg}/print_spinor_similar_components.h |    0
 {linalg => src/lib/linalg}/ratio.c            |    0
 {linalg => src/lib/linalg}/ratio.h            |    0
 {linalg => src/lib/linalg}/scalar_prod.c      |    0
 {linalg => src/lib/linalg}/scalar_prod.h      |    0
 {linalg => src/lib/linalg}/scalar_prod_body.c |    0
 {linalg => src/lib/linalg}/scalar_prod_i.c    |    0
 {linalg => src/lib/linalg}/scalar_prod_i.h    |    0
 {linalg => src/lib/linalg}/scalar_prod_r.c    |    0
 {linalg => src/lib/linalg}/scalar_prod_r.h    |    0
 {linalg => src/lib/linalg}/scalar_prod_r_32.c |    0
 {linalg => src/lib/linalg}/scalar_prod_r_32.h |    0
 {linalg => src/lib/linalg}/set_even_to_zero.c |    0
 {linalg => src/lib/linalg}/set_even_to_zero.h |    0
 .../lib/linalg}/square_and_minmax.c           |    0
 .../lib/linalg}/square_and_minmax.h           |    0
 .../lib/linalg}/square_and_prod_r.c           |    0
 .../lib/linalg}/square_and_prod_r.h           |    0
 {linalg => src/lib/linalg}/square_norm.c      |    0
 {linalg => src/lib/linalg}/square_norm.h      |    0
 {linalg => src/lib/linalg}/square_norm_32.c   |    0
 {linalg => src/lib/linalg}/square_norm_32.h   |    0
 linalg_eo.h => src/lib/linalg_eo.h            |    0
 little_D.c => src/lib/little_D.c              |    0
 little_D.h => src/lib/little_D.h              |    0
 little_D_body.c => src/lib/little_D_body.c    |    0
 matrix_utils.c => src/lib/matrix_utils.c      |    0
 matrix_utils.h => src/lib/matrix_utils.h      |    0
 {meas => src/lib/meas}/Makefile.in            |    0
 {meas => src/lib/meas}/correlators.c          |    0
 {meas => src/lib/meas}/correlators.h          |    0
 {meas => src/lib/meas}/field_strength_types.h |    0
 {meas => src/lib/meas}/gradient_flow.c        |    0
 {meas => src/lib/meas}/gradient_flow.h        |    0
 ...easure_clover_field_strength_observables.c |    0
 ...easure_clover_field_strength_observables.h |    0
 {meas => src/lib/meas}/measurements.c         |    0
 {meas => src/lib/meas}/measurements.h         |    0
 {meas => src/lib/meas}/oriented_plaquettes.c  |    0
 {meas => src/lib/meas}/oriented_plaquettes.h  |    0
 {meas => src/lib/meas}/pion_norm.c            |    0
 {meas => src/lib/meas}/pion_norm.h            |    0
 {meas => src/lib/meas}/polyakov_loop.c        |    0
 {meas => src/lib/meas}/polyakov_loop.h        |    0
 .../lib/measure_gauge_action.c                |    0
 .../lib/measure_gauge_action.h                |    0
 .../lib/measure_rectangles.c                  |    0
 .../lib/measure_rectangles.h                  |    0
 misc_types.h => src/lib/misc_types.h          |    2 +-
 {monomial => src/lib/monomial}/Makefile.in    |    0
 .../lib/monomial}/clover_trlog_monomial.c     |    0
 .../lib/monomial}/clover_trlog_monomial.h     |    0
 .../lib/monomial}/cloverdet_monomial.c        |    0
 .../lib/monomial}/cloverdet_monomial.h        |    0
 .../lib/monomial}/cloverdetratio_monomial.c   |    0
 .../lib/monomial}/cloverdetratio_monomial.h   |    0
 .../lib/monomial}/cloverdetratio_rwmonomial.c |    0
 .../lib/monomial}/cloverdetratio_rwmonomial.h |    0
 .../lib/monomial}/clovernd_trlog_monomial.c   |    0
 .../lib/monomial}/clovernd_trlog_monomial.h   |    0
 .../lib/monomial}/cloverndpoly_monomial.c     |    0
 .../lib/monomial}/cloverndpoly_monomial.h     |    0
 {monomial => src/lib/monomial}/det_monomial.c |    0
 {monomial => src/lib/monomial}/det_monomial.h |    0
 .../lib/monomial}/detratio_monomial.c         |    0
 .../lib/monomial}/detratio_monomial.h         |    0
 .../lib/monomial}/gauge_monomial.c            |    0
 .../lib/monomial}/gauge_monomial.h            |    0
 .../lib/monomial}/moment_energy.c             |    0
 .../lib/monomial}/moment_energy.h             |    0
 .../lib/monomial}/monitor_forces.c            |    0
 .../lib/monomial}/monitor_forces.h            |    0
 {monomial => src/lib/monomial}/monomial.c     |    0
 {monomial => src/lib/monomial}/monomial.h     |    0
 .../lib/monomial}/nddetratio_monomial.c       |    0
 .../lib/monomial}/nddetratio_monomial.h       |    0
 .../lib/monomial}/ndpoly_monomial.c           |    0
 .../lib/monomial}/ndpoly_monomial.h           |    0
 .../lib/monomial}/ndrat_monomial.c            |    0
 .../lib/monomial}/ndrat_monomial.h            |    0
 .../lib/monomial}/ndratcor_monomial.c         |    0
 .../lib/monomial}/ndratcor_monomial.h         |    0
 .../lib/monomial}/poly_monomial.c             |    0
 .../lib/monomial}/poly_monomial.h             |    0
 {monomial => src/lib/monomial}/rat_monomial.c |    0
 {monomial => src/lib/monomial}/rat_monomial.h |    0
 .../lib/monomial}/ratcor_monomial.c           |    0
 .../lib/monomial}/ratcor_monomial.h           |    0
 mpi_init.c => src/lib/mpi_init.c              |    0
 mpi_init.h => src/lib/mpi_init.h              |    0
 .../lib/omp_accumulator.h                     |    0
 operator.c => src/lib/operator.c              |    0
 operator.h => src/lib/operator.h              |    0
 .../lib/operator}/Block_D_psi_body.c          |    0
 {operator => src/lib/operator}/D_psi.c        |    0
 {operator => src/lib/operator}/D_psi.h        |    0
 {operator => src/lib/operator}/D_psi_body.c   |    0
 {operator => src/lib/operator}/Dov_proj.c     |    0
 {operator => src/lib/operator}/Dov_proj.h     |    0
 {operator => src/lib/operator}/Dov_psi.c      |    0
 {operator => src/lib/operator}/Dov_psi.h      |    0
 .../lib/operator}/Hopping_Matrix.c            |    0
 .../lib/operator}/Hopping_Matrix.h            |    0
 .../lib/operator}/Hopping_Matrix_32.c         |    0
 .../lib/operator}/Hopping_Matrix_32.h         |    0
 .../lib/operator}/Hopping_Matrix_32_nocom.c   |    0
 .../lib/operator}/Hopping_Matrix_nocom.c      |    0
 .../lib/operator}/Hopping_Matrix_nocom.h      |    0
 {operator => src/lib/operator}/Makefile.in    |    0
 .../assign_mul_one_sw_pm_imu_inv_block_body.c |    0
 ...assign_mul_one_sw_pm_imu_site_lexic_body.c |    0
 .../lib/operator}/clover_accumulate_deriv.c   |    0
 {operator => src/lib/operator}/clover_deriv.c |    0
 {operator => src/lib/operator}/clover_det.c   |    0
 .../lib/operator}/clover_inline.h             |    0
 .../lib/operator}/clover_invert.c             |    0
 {operator => src/lib/operator}/clover_leaf.c  |    0
 {operator => src/lib/operator}/clover_leaf.h  |    0
 {operator => src/lib/operator}/clover_term.c  |    0
 .../lib/operator}/clovertm_operators.c        |    0
 .../lib/operator}/clovertm_operators.h        |    0
 .../lib/operator}/clovertm_operators_32.c     |    0
 .../lib/operator}/clovertm_operators_32.h     |    0
 .../lib/operator}/halfspinor_body.c           |    0
 .../lib/operator}/halfspinor_body_32.c        |    0
 .../lib/operator}/halfspinor_hopping.h        |    0
 .../lib/operator}/halfspinor_hopping_32.h     |    0
 .../lib/operator}/hopping_bg_dbl.c            |    0
 .../lib/operator}/hopping_body_dbl.c          |    0
 {operator => src/lib/operator}/hopping_sgl.c  |    0
 .../lib/operator}/mul_one_pm_imu_inv_body.c   |    0
 .../operator}/mul_one_pm_imu_sub_mul_body.c   |    0
 {operator => src/lib/operator}/tm_operators.c |    0
 {operator => src/lib/operator}/tm_operators.h |    0
 .../lib/operator}/tm_operators_32.c           |    0
 .../lib/operator}/tm_operators_32.h           |    0
 .../lib/operator}/tm_operators_nd.c           |    0
 .../lib/operator}/tm_operators_nd.h           |    0
 .../lib/operator}/tm_operators_nd_32.c        |    0
 .../lib/operator}/tm_operators_nd_32.h        |    0
 .../lib/operator}/tm_sub_Hopping_Matrix.c     |    0
 .../lib/operator}/tm_sub_Hopping_Matrix.h     |    0
 .../lib/operator}/tm_times_Hopping_Matrix.c   |    0
 .../lib/operator}/tm_times_Hopping_Matrix.h   |    0
 operator_types.h => src/lib/operator_types.h  |    0
 overrelaxation.c => src/lib/overrelaxation.c  |    2 +-
 overrelaxation.h => src/lib/overrelaxation.h  |    0
 parallel_io.h => src/lib/parallel_io.h        |    0
 phmc.c => src/lib/phmc.c                      |    0
 phmc.h => src/lib/phmc.h                      |    0
 prepare_source.c => src/lib/prepare_source.c  |    0
 prepare_source.h => src/lib/prepare_source.h  |    0
 .../lib/profiling}/hmc/Readme.md              |    0
 .../lib/profiling}/hmc/example_profile.pdf    |  Bin
 .../lib/profiling}/hmc/profile.Rmd            |    0
 {profiling => src/lib/profiling}/hmc/timing.R |    0
 .../lib/profiling}/hmc_mk2/.gitignore         |    0
 .../lib/profiling}/hmc_mk2/README.md          |    0
 .../profiling}/hmc_mk2/logs/example_log.out   |    0
 .../lib/profiling}/hmc_mk2/make_profile.R     |    0
 .../lib/profiling}/hmc_mk2/profile.Rmd        |    0
 src/lib/qphix/qphix_base_classes.hpp          |  771 ++++++
 src/lib/qphix/qphix_interface.cpp             | 2192 +++++++++++++++++
 src/lib/qphix/qphix_interface.hpp             |   51 +
 src/lib/qphix/qphix_interface_utils.hpp       |   33 +
 .../lib/qphix_interface.h                     |    0
 qphix_types.h => src/lib/qphix_types.h        |    0
 qphix_veclen.h => src/lib/qphix_veclen.h      |    0
 .../lib/quda_dummy_types.h                    |    0
 src/lib/quda_gauge_paths.inc                  |  158 ++
 quda_interface.c => src/lib/quda_interface.c  |    0
 quda_interface.h => src/lib/quda_interface.h  |    0
 quda_types.h => src/lib/quda_types.h          |    0
 ranlxd.c => src/lib/ranlxd.c                  |    0
 ranlxd.h => src/lib/ranlxd.h                  |    0
 ranlxs.c => src/lib/ranlxs.c                  |    0
 ranlxs.h => src/lib/ranlxs.h                  |    0
 {rational => src/lib/rational}/Makefile.in    |    0
 {rational => src/lib/rational}/elliptic.c     |    0
 {rational => src/lib/rational}/elliptic.h     |    0
 {rational => src/lib/rational}/rational.c     |    0
 {rational => src/lib/rational}/rational.h     |    0
 {rational => src/lib/rational}/zolotarev.c    |    0
 {rational => src/lib/rational}/zolotarev.h    |    0
 read_input.h => src/lib/read_input.h          |    0
 read_input.l => src/lib/read_input.l          |    0
 .../lib/reweighting_factor.c                  |    0
 .../lib/reweighting_factor.h                  |    0
 .../lib/reweighting_factor_nd.c               |    0
 .../lib/reweighting_factor_nd.h               |    0
 .../lib/rnd_gauge_trafo.c                     |    0
 .../lib/rnd_gauge_trafo.h                     |    0
 sighandler.c => src/lib/sighandler.c          |    0
 sighandler.h => src/lib/sighandler.h          |    0
 {smearing => src/lib/smearing}/Makefile.in    |    0
 {smearing => src/lib/smearing}/ape.h          |    0
 {smearing => src/lib/smearing}/ape.ih         |    0
 .../lib/smearing}/ape_ape_smear.c             |    0
 {smearing => src/lib/smearing}/hex.h          |    0
 {smearing => src/lib/smearing}/hex.ih         |    0
 .../lib/smearing}/hex_hex_smear.c             |    0
 .../lib/smearing}/hex_stout_exclude_none.c    |    0
 .../lib/smearing}/hex_stout_exclude_one.c     |    0
 .../lib/smearing}/hex_stout_exclude_two.c     |    0
 {smearing => src/lib/smearing}/hyp.h          |    0
 {smearing => src/lib/smearing}/hyp.ih         |    0
 .../smearing}/hyp_APE_project_exclude_none.c  |    0
 .../smearing}/hyp_APE_project_exclude_one.c   |    0
 .../smearing}/hyp_APE_project_exclude_two.c   |    0
 .../lib/smearing}/hyp_hyp_smear.c             |    0
 .../smearing}/hyp_hyp_staples_exclude_none.c  |    0
 .../smearing}/hyp_hyp_staples_exclude_one.c   |    0
 .../smearing}/hyp_hyp_staples_exclude_two.c   |    0
 {smearing => src/lib/smearing}/stout.h        |    0
 {smearing => src/lib/smearing}/stout.ih       |    0
 .../lib/smearing}/stout_stout_smear.c         |    0
 .../smearing}/uils_print_config_to_screen.c   |    0
 {smearing => src/lib/smearing}/utils.h        |    0
 {smearing => src/lib/smearing}/utils.ih       |    0
 .../lib/smearing}/utils_generic_staples.c     |    0
 .../smearing}/utils_print_config_to_screen.c  |    0
 .../lib/smearing}/utils_print_su3.c           |    0
 .../lib/smearing}/utils_project_antiherm.c    |    0
 .../lib/smearing}/utils_project_herm.c        |    0
 .../lib/smearing}/utils_reunitarize.c         |    0
 .../lib/smearing}/utils_reunitarize_MILC.c    |    8 +-
 .../lib/solver}/M_plus_block_psi_body.c       |    0
 {solver => src/lib/solver}/Makefile.in        |    0
 {solver => src/lib/solver}/Msap.c             |    0
 {solver => src/lib/solver}/Msap.h             |    0
 {solver => src/lib/solver}/bicg_complex.c     |    0
 {solver => src/lib/solver}/bicg_complex.h     |    0
 {solver => src/lib/solver}/bicgstab2.c        |    0
 {solver => src/lib/solver}/bicgstab2.h        |    0
 {solver => src/lib/solver}/bicgstab_complex.c |    0
 {solver => src/lib/solver}/bicgstab_complex.h |    0
 .../lib/solver}/bicgstab_complex_bi.c         |    0
 .../lib/solver}/bicgstab_complex_bi.h         |    0
 {solver => src/lib/solver}/bicgstabell.c      |    0
 {solver => src/lib/solver}/bicgstabell.h      |    0
 {solver => src/lib/solver}/cg_her.c           |    0
 {solver => src/lib/solver}/cg_her.h           |    0
 {solver => src/lib/solver}/cg_her_bi.c        |    0
 {solver => src/lib/solver}/cg_her_bi.h        |    0
 {solver => src/lib/solver}/cg_her_nd.c        |    0
 {solver => src/lib/solver}/cg_her_nd.h        |    0
 {solver => src/lib/solver}/cg_mms_tm.c        |    0
 {solver => src/lib/solver}/cg_mms_tm.h        |    0
 {solver => src/lib/solver}/cg_mms_tm_nd.c     |    0
 {solver => src/lib/solver}/cg_mms_tm_nd.h     |    0
 {solver => src/lib/solver}/cgne4complex.c     |    0
 {solver => src/lib/solver}/cgne4complex.h     |    0
 {solver => src/lib/solver}/cgs_real.c         |    0
 {solver => src/lib/solver}/cgs_real.h         |    0
 {solver => src/lib/solver}/chrono_guess.c     |    0
 {solver => src/lib/solver}/chrono_guess.h     |    0
 {solver => src/lib/solver}/cr.c               |    0
 {solver => src/lib/solver}/cr.h               |    0
 {solver => src/lib/solver}/dfl_projector.c    |    0
 {solver => src/lib/solver}/dfl_projector.h    |    0
 .../lib/solver}/diagonalise_general_matrix.c  |    0
 .../lib/solver}/diagonalise_general_matrix.h  |    0
 .../lib/solver}/dirac_operator_eigenvectors.c |    0
 .../lib/solver}/dirac_operator_eigenvectors.h |    0
 {solver => src/lib/solver}/eigcg.c            |    0
 {solver => src/lib/solver}/eigcg.h            |    0
 {solver => src/lib/solver}/eigenvalues.c      |    0
 {solver => src/lib/solver}/eigenvalues.h      |    0
 {solver => src/lib/solver}/eigenvalues_bi.c   |    0
 {solver => src/lib/solver}/eigenvalues_bi.h   |    0
 {solver => src/lib/solver}/fgmres.c           |    0
 {solver => src/lib/solver}/fgmres.h           |    0
 {solver => src/lib/solver}/fgmres4complex.c   |    0
 {solver => src/lib/solver}/fgmres4complex.h   |    0
 .../lib/solver}/fgmres4complex_body.c         |    0
 {solver => src/lib/solver}/gcr.c              |    0
 {solver => src/lib/solver}/gcr.h              |    0
 {solver => src/lib/solver}/gcr4complex.c      |    0
 {solver => src/lib/solver}/gcr4complex.h      |    0
 {solver => src/lib/solver}/gcr4complex_body.c |    0
 {solver => src/lib/solver}/gcr4complex_body.h |    0
 .../lib/solver}/generate_dfl_subspace.c       |    0
 .../lib/solver}/generate_dfl_subspace.h       |    0
 {solver => src/lib/solver}/gmres.c            |    0
 {solver => src/lib/solver}/gmres.h            |    0
 {solver => src/lib/solver}/gmres_dr.c         |    0
 {solver => src/lib/solver}/gmres_dr.h         |    0
 {solver => src/lib/solver}/gmres_precon.c     |    0
 {solver => src/lib/solver}/gmres_precon.h     |    0
 {solver => src/lib/solver}/gram-schmidt.c     |    0
 {solver => src/lib/solver}/gram-schmidt.h     |    0
 {solver => src/lib/solver}/incr_eigcg.c       |    0
 {solver => src/lib/solver}/incr_eigcg.h       |    0
 {solver => src/lib/solver}/index_jd.c         |    0
 {solver => src/lib/solver}/index_jd.h         |    0
 {solver => src/lib/solver}/init_guess.c       |    0
 {solver => src/lib/solver}/init_guess.h       |    0
 {solver => src/lib/solver}/jdher.c            |    0
 {solver => src/lib/solver}/jdher.h            |    0
 {solver => src/lib/solver}/jdher_bi.c         |    0
 {solver => src/lib/solver}/jdher_bi.h         |    0
 .../lib/solver}/little_mg_precon_body.c       |    0
 .../lib/solver}/little_project_eo_body.c      |    0
 {solver => src/lib/solver}/lu_solve.c         |    0
 {solver => src/lib/solver}/lu_solve.h         |    0
 .../lib/solver}/matrix_mult_typedef.h         |    0
 .../lib/solver}/matrix_mult_typedef_bi.h      |    0
 .../lib/solver}/matrix_mult_typedef_nd.h      |    0
 {solver => src/lib/solver}/mcr.c              |    0
 {solver => src/lib/solver}/mcr.h              |    0
 {solver => src/lib/solver}/mcr4complex.c      |    0
 {solver => src/lib/solver}/mcr4complex.h      |    0
 {solver => src/lib/solver}/mixed_cg_her.c     |    0
 {solver => src/lib/solver}/mixed_cg_her.h     |    0
 .../lib/solver}/mixed_cg_mms_tm_nd.c          |    0
 .../lib/solver}/mixed_cg_mms_tm_nd.h          |    0
 {solver => src/lib/solver}/monomial_solve.c   |    0
 {solver => src/lib/solver}/monomial_solve.h   |    0
 {solver => src/lib/solver}/mr.c               |    0
 {solver => src/lib/solver}/mr.h               |    0
 {solver => src/lib/solver}/mr4complex.c       |    0
 {solver => src/lib/solver}/mr4complex.h       |    0
 {solver => src/lib/solver}/mrblk_body.c       |    0
 {solver => src/lib/solver}/ortho.c            |    0
 {solver => src/lib/solver}/ortho.h            |    0
 {solver => src/lib/solver}/pcg_her.c          |    0
 {solver => src/lib/solver}/pcg_her.h          |    0
 {solver => src/lib/solver}/poly_precon.c      |    0
 {solver => src/lib/solver}/poly_precon.h      |    0
 {solver => src/lib/solver}/quicksort.c        |    0
 {solver => src/lib/solver}/quicksort.h        |    0
 {solver => src/lib/solver}/restart_X.c        |    0
 {solver => src/lib/solver}/restart_X.h        |    0
 {solver => src/lib/solver}/rg_mixed_cg_her.c  |    0
 {solver => src/lib/solver}/rg_mixed_cg_her.h  |    0
 .../lib/solver}/rg_mixed_cg_her_nd.c          |    0
 .../lib/solver}/rg_mixed_cg_her_nd.h          |    0
 .../lib/solver}/rg_mixed_cg_typedef.h         |    0
 {solver => src/lib/solver}/solver.h           |    0
 {solver => src/lib/solver}/solver_field.c     |    0
 {solver => src/lib/solver}/solver_field.h     |    0
 {solver => src/lib/solver}/solver_params.h    |    0
 {solver => src/lib/solver}/solver_types.c     |    0
 {solver => src/lib/solver}/solver_types.h     |    0
 {solver => src/lib/solver}/sub_low_ev.c       |    0
 {solver => src/lib/solver}/sub_low_ev.h       |    0
 {solver => src/lib/solver}/sumr.c             |    0
 {solver => src/lib/solver}/sumr.h             |    0
 .../lib/source_generation.c                   |    0
 .../lib/source_generation.h                   |    0
 spinor_fft.c => src/lib/spinor_fft.c          |    0
 spinor_fft.h => src/lib/spinor_fft.h          |    0
 start.c => src/lib/start.c                    |    0
 start.h => src/lib/start.h                    |    0
 .../lib/struct_accessors.h                    |    0
 su3.h => src/lib/su3.h                        |    0
 su3adj.h => src/lib/su3adj.h                  |    0
 su3spinor.h => src/lib/su3spinor.h            |    0
 tensors.h => src/lib/tensors.h                |    0
 {test => src/lib/test}/Makefile               |    0
 {test => src/lib/test}/check_geometry.c       |    0
 {test => src/lib/test}/check_geometry.h       |    0
 {test => src/lib/test}/check_nan.c            |    0
 {test => src/lib/test}/check_nan.h            |    0
 {test => src/lib/test}/check_overlap.c        |    0
 {test => src/lib/test}/check_xchange.c        |    0
 {test => src/lib/test}/hopping_test.README    |    0
 .../lib/test}/hopping_test.input.compare      |    0
 {test => src/lib/test}/hopping_test.input.new |    0
 .../lib/test}/hopping_test.input.start        |    0
 .../lib/test}/hopping_test_generate_script    |    0
 {test => src/lib/test}/hopping_test_qscript   |    0
 .../lib/test}/measure_rectangles.debug.c      |    0
 {test => src/lib/test}/overlaptests.c         |    0
 {test => src/lib/test}/overlaptests.h         |    0
 {test => src/lib/test}/qdran64.h              |    0
 .../lib/tm_debug_printf.c                     |    0
 .../lib/tm_debug_printf.h                     |    0
 .../lib/update_backward_gauge.c               |    0
 .../lib/update_backward_gauge.h               |    0
 update_gauge.c => src/lib/update_gauge.c      |    0
 update_gauge.h => src/lib/update_gauge.h      |    0
 update_momenta.c => src/lib/update_momenta.c  |    0
 update_momenta.h => src/lib/update_momenta.h  |    0
 .../lib/update_momenta_fg.c                   |    0
 .../lib/update_momenta_fg.h                   |    0
 update_tm.c => src/lib/update_tm.c            |    0
 update_tm.h => src/lib/update_tm.h            |    0
 {util => src/lib/util}/io.c                   |    0
 {util => src/lib/util}/io.h                   |    0
 {util => src/lib/util}/laguer/Makefile        |    0
 {util => src/lib/util}/laguer/chebyRoot.C     |    0
 {util => src/lib/util}/laguer/chebyRoot.H     |    0
 {util => src/lib/util}/laguer/laguer.c        |    0
 {util => src/lib/util}/laguer/quadroptRoot.C  |    0
 {util => src/lib/util}/oox/Makefile           |    0
 {util => src/lib/util}/oox/oox.c              |    0
 {util => src/lib/util}/oox/oox_gawrapper.cxx  |    0
 {util => src/lib/util}/oox/oox_gawrapper.h    |    0
 {util => src/lib/util}/swapendian.c           |    0
 {util => src/lib/util}/tmlqcd-indent          |    0
 {wrapper => src/lib/wrapper}/Makefile.in      |    0
 {wrapper => src/lib/wrapper}/lib_wrapper.c    |    0
 {xchange => src/lib/xchange}/Makefile.in      |    0
 .../lib/xchange}/little_field_gather.c        |    0
 .../lib/xchange}/little_field_gather.h        |    0
 .../lib/xchange}/little_field_gather_body.c   |    0
 {xchange => src/lib/xchange}/xchange.h        |    0
 .../lib/xchange}/xchange_2fields.c            |    0
 .../lib/xchange}/xchange_2fields.h            |    0
 {xchange => src/lib/xchange}/xchange_deri.c   |    0
 {xchange => src/lib/xchange}/xchange_deri.h   |    0
 {xchange => src/lib/xchange}/xchange_field.c  |    0
 {xchange => src/lib/xchange}/xchange_field.h  |    0
 {xchange => src/lib/xchange}/xchange_gauge.c  |    0
 {xchange => src/lib/xchange}/xchange_gauge.h  |    0
 .../lib/xchange}/xchange_halffield.c          |    0
 .../lib/xchange}/xchange_halffield.h          |    0
 .../lib/xchange}/xchange_lexicfield.c         |    0
 .../lib/xchange}/xchange_lexicfield.h         |    0
 708 files changed, 4614 insertions(+), 240 deletions(-)
 create mode 100644 CMakeLists.txt
 create mode 100644 cmake/FindCLime.cmake
 create mode 100644 cmake/FindLemon.cmake
 create mode 100644 cmake/git_hash.h.in
 rename {include => cmake}/tmlqcd_config_internal.h.in (56%)
 create mode 100644 cmake_includes.txt
 delete mode 100644 io/Makefile.in
 create mode 100644 src/bin/CMakeLists.txt
 rename LapH_ev.c => src/bin/LapH_ev.c (100%)
 rename benchmark.c => src/bin/benchmark.c (100%)
 rename check_locallity.c => src/bin/check_locallity.c (99%)
 rename deriv_mg_tune.c => src/bin/deriv_mg_tune.c (100%)
 rename gen_sources.c => src/bin/gen_sources.c (100%)
 rename hmc_tm.c => src/bin/hmc_tm.c (100%)
 rename hopping_test.c => src/bin/hopping_test.c (100%)
 rename invert.c => src/bin/invert.c (100%)
 rename {util => src/bin}/main_ildg2uk.c (100%)
 rename offline_measurement.c => src/bin/offline_measurement.c (100%)
 rename qphix_test_Dslash.c => src/bin/qphix_test_Dslash.c (100%)
 rename {test => src/bin}/scalar_prod_r_test.c (100%)
 rename {test => src/bin}/test_eigenvalues.c (100%)
 rename test_lemon.c => src/bin/test_lemon.c (100%)
 create mode 100644 src/lib/CMakeLists.txt
 rename DDalphaAMG_interface.c => src/lib/DDalphaAMG_interface.c (100%)
 rename DDalphaAMG_interface.h => src/lib/DDalphaAMG_interface.h (100%)
 rename Ptilde_nd.c => src/lib/Ptilde_nd.c (100%)
 rename Ptilde_nd.h => src/lib/Ptilde_nd.h (100%)
 rename aligned_malloc.c => src/lib/aligned_malloc.c (100%)
 rename aligned_malloc.h => src/lib/aligned_malloc.h (100%)
 rename block.c => src/lib/block.c (100%)
 rename block.h => src/lib/block.h (100%)
 rename boundary.c => src/lib/boundary.c (100%)
 rename boundary.h => src/lib/boundary.h (100%)
 rename {buffers => src/lib/buffers}/Makefile.in (100%)
 rename {buffers => src/lib/buffers}/gauge.c (100%)
 rename {buffers => src/lib/buffers}/gauge.h (100%)
 rename {buffers => src/lib/buffers}/gauge.ih (100%)
 rename {buffers => src/lib/buffers}/gauge_allocate_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_finalize_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_free_unused_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_get_gauge_field.c (100%)
 rename {buffers => src/lib/buffers}/gauge_get_gauge_field_array.c (100%)
 rename {buffers => src/lib/buffers}/gauge_initialize_gauge_buffers.c (100%)
 rename {buffers => src/lib/buffers}/gauge_return_gauge_field.c (100%)
 rename {buffers => src/lib/buffers}/gauge_return_gauge_field_array.c (100%)
 rename {buffers => src/lib/buffers}/utils.h (100%)
 rename {buffers => src/lib/buffers}/utils.ih (100%)
 rename {buffers => src/lib/buffers}/utils_generic_exchange.blocking.inc (100%)
 rename {buffers => src/lib/buffers}/utils_generic_exchange.c (100%)
 rename {buffers => src/lib/buffers}/utils_generic_exchange.nonblocking.inc (100%)
 rename chebyshev_polynomial.c => src/lib/chebyshev_polynomial.c (98%)
 rename chebyshev_polynomial.h => src/lib/chebyshev_polynomial.h (100%)
 rename chebyshev_polynomial_nd.c => src/lib/chebyshev_polynomial_nd.c (100%)
 rename chebyshev_polynomial_nd.h => src/lib/chebyshev_polynomial_nd.h (100%)
 rename clenshaw_coef.c => src/lib/clenshaw_coef.c (100%)
 rename clenshaw_coef.h => src/lib/clenshaw_coef.h (100%)
 rename compare_derivative.c => src/lib/compare_derivative.c (100%)
 rename compare_derivative.h => src/lib/compare_derivative.h (100%)
 rename {cu => src/lib/cu}/COPYING (100%)
 rename {cu => src/lib/cu}/COPYING.LESSER (100%)
 rename {cu => src/lib/cu}/Makefile.in (100%)
 rename {cu => src/lib/cu}/check-regressions (100%)
 rename {cu => src/lib/cu}/cu.c (100%)
 rename {cu => src/lib/cu}/cu.h (100%)
 rename default_input_values.h => src/lib/default_input_values.h (100%)
 rename deriv_Sb.c => src/lib/deriv_Sb.c (100%)
 rename deriv_Sb.h => src/lib/deriv_Sb.h (100%)
 rename deriv_Sb_D_psi.c => src/lib/deriv_Sb_D_psi.c (100%)
 rename deriv_Sb_D_psi.h => src/lib/deriv_Sb_D_psi.h (100%)
 rename expo.c => src/lib/expo.c (100%)
 rename expo.h => src/lib/expo.h (100%)
 rename fatal_error.c => src/lib/fatal_error.c (100%)
 rename fatal_error.h => src/lib/fatal_error.h (100%)
 rename gamma.c => src/lib/gamma.c (100%)
 rename gamma.h => src/lib/gamma.h (100%)
 rename geometry_eo.c => src/lib/geometry_eo.c (100%)
 rename geometry_eo.h => src/lib/geometry_eo.h (100%)
 rename get_rectangle_staples.c => src/lib/get_rectangle_staples.c (100%)
 rename get_rectangle_staples.h => src/lib/get_rectangle_staples.h (100%)
 rename get_staples.c => src/lib/get_staples.c (100%)
 rename get_staples.h => src/lib/get_staples.h (100%)
 rename getopt.c => src/lib/getopt.c (100%)
 rename getopt.h => src/lib/getopt.h (100%)
 rename gettime.c => src/lib/gettime.c (100%)
 rename gettime.h => src/lib/gettime.h (100%)
 rename global.h => src/lib/global.h (100%)
 rename hamiltonian_field.h => src/lib/hamiltonian_field.h (100%)
 rename {include => src/lib/include}/tmLQCD.h (100%)
 rename {include => src/lib/include}/tmlqcd_config.h (100%)
 rename {init => src/lib/init}/Makefile.in (100%)
 rename {init => src/lib/init}/init.h (100%)
 rename {init => src/lib/init}/init_bispinor_field.c (100%)
 rename {init => src/lib/init}/init_bispinor_field.h (100%)
 rename {init => src/lib/init}/init_chi_spinor_field.c (100%)
 rename {init => src/lib/init}/init_chi_spinor_field.h (100%)
 rename {init => src/lib/init}/init_critical_globals.c (100%)
 rename {init => src/lib/init}/init_critical_globals.h (100%)
 rename {init => src/lib/init}/init_dirac_halfspinor.c (100%)
 rename {init => src/lib/init}/init_dirac_halfspinor.h (100%)
 rename {init => src/lib/init}/init_gauge_fg.c (100%)
 rename {init => src/lib/init}/init_gauge_fg.h (100%)
 rename {init => src/lib/init}/init_gauge_field.c (100%)
 rename {init => src/lib/init}/init_gauge_field.h (100%)
 rename {init => src/lib/init}/init_gauge_tmp.c (100%)
 rename {init => src/lib/init}/init_gauge_tmp.h (100%)
 rename {init => src/lib/init}/init_geometry_indices.c (100%)
 rename {init => src/lib/init}/init_geometry_indices.h (100%)
 rename {init => src/lib/init}/init_global_states.c (100%)
 rename {init => src/lib/init}/init_global_states.h (100%)
 rename {init => src/lib/init}/init_moment_field.c (100%)
 rename {init => src/lib/init}/init_moment_field.h (100%)
 rename {init => src/lib/init}/init_omp_accumulators.c (100%)
 rename {init => src/lib/init}/init_omp_accumulators.h (100%)
 rename {init => src/lib/init}/init_openmp.c (100%)
 rename {init => src/lib/init}/init_openmp.h (100%)
 rename {init => src/lib/init}/init_parallel.c (100%)
 rename {init => src/lib/init}/init_parallel.h (100%)
 rename {init => src/lib/init}/init_spinor_field.c (100%)
 rename {init => src/lib/init}/init_spinor_field.h (100%)
 rename {init => src/lib/init}/init_stout_smear_vars.c (100%)
 rename {init => src/lib/init}/init_stout_smear_vars.h (100%)
 rename integrator.c => src/lib/integrator.c (100%)
 rename integrator.h => src/lib/integrator.h (100%)
 rename invert_clover_eo.c => src/lib/invert_clover_eo.c (100%)
 rename invert_clover_eo.h => src/lib/invert_clover_eo.h (100%)
 rename invert_doublet_eo.c => src/lib/invert_doublet_eo.c (100%)
 rename invert_doublet_eo.h => src/lib/invert_doublet_eo.h (100%)
 rename invert_eo.c => src/lib/invert_eo.c (99%)
 rename invert_eo.h => src/lib/invert_eo.h (100%)
 rename invert_overlap.c => src/lib/invert_overlap.c (100%)
 rename invert_overlap.h => src/lib/invert_overlap.h (100%)
 rename {io => src/lib/io}/DML_crc32.c (100%)
 rename {io => src/lib/io}/deri_write_stdout.c (100%)
 rename {io => src/lib/io}/deri_write_stdout.h (100%)
 rename {io => src/lib/io}/dml.c (100%)
 rename {io => src/lib/io}/dml.h (100%)
 rename {io => src/lib/io}/eospinor.h (100%)
 rename {io => src/lib/io}/eospinor.ih (100%)
 rename {io => src/lib/io}/eospinor_read.c (100%)
 rename {io => src/lib/io}/eospinor_write.c (100%)
 rename {io => src/lib/io}/gauge.h (100%)
 rename {io => src/lib/io}/gauge.ih (100%)
 rename {io => src/lib/io}/gauge_read.c (100%)
 rename {io => src/lib/io}/gauge_read_binary.c (100%)
 rename {io => src/lib/io}/gauge_write.c (100%)
 rename {io => src/lib/io}/gauge_write_binary.c (100%)
 rename {io => src/lib/io}/gauge_write_luscher_binary.c (100%)
 rename {io => src/lib/io}/gauge_write_luscher_binary.h (100%)
 rename {io => src/lib/io}/io_cm.c (100%)
 rename {io => src/lib/io}/io_cm.h (100%)
 rename {io => src/lib/io}/params.h (100%)
 rename {io => src/lib/io}/params.ih (100%)
 rename {io => src/lib/io}/params_construct_InverterInfo.c (100%)
 rename {io => src/lib/io}/params_construct_ildgFormat.c (100%)
 rename {io => src/lib/io}/params_construct_propagatorFormat.c (100%)
 rename {io => src/lib/io}/params_construct_sourceFormat.c (100%)
 rename {io => src/lib/io}/params_construct_xlfInfo.c (100%)
 rename {io => src/lib/io}/selector.h (100%)
 rename {io => src/lib/io}/spinor.h (100%)
 rename {io => src/lib/io}/spinor.ih (100%)
 rename {io => src/lib/io}/spinor_read.c (100%)
 rename {io => src/lib/io}/spinor_read_binary.c (100%)
 rename {io => src/lib/io}/spinor_write.c (100%)
 rename {io => src/lib/io}/spinor_write_binary.c (100%)
 rename {io => src/lib/io}/spinor_write_info.c (100%)
 rename {io => src/lib/io}/spinor_write_propagator_format.c (100%)
 rename {io => src/lib/io}/spinor_write_propagator_type.c (100%)
 rename {io => src/lib/io}/spinor_write_source_format.c (100%)
 rename {io => src/lib/io}/spinor_write_stdout.c (100%)
 rename {io => src/lib/io}/spinor_write_stdout.h (100%)
 rename {io => src/lib/io}/sw_write_stdout.c (100%)
 rename {io => src/lib/io}/sw_write_stdout.h (100%)
 rename {io => src/lib/io}/utils.c (100%)
 rename {io => src/lib/io}/utils.h (99%)
 rename {io => src/lib/io}/utils.ih (96%)
 rename {io => src/lib/io}/utils_close_reader_record.c (100%)
 rename {io => src/lib/io}/utils_close_writer_record.c (100%)
 rename {io => src/lib/io}/utils_construct_reader.c (97%)
 rename {io => src/lib/io}/utils_construct_writer.c (100%)
 rename {io => src/lib/io}/utils_destruct_reader.c (100%)
 rename {io => src/lib/io}/utils_destruct_writer.c (100%)
 rename {io => src/lib/io}/utils_engineering.c (100%)
 rename {io => src/lib/io}/utils_kill_with_error.c (100%)
 rename {io => src/lib/io}/utils_parse_checksum_xml.c (100%)
 rename {io => src/lib/io}/utils_parse_ildgformat_xml.c (100%)
 rename {io => src/lib/io}/utils_parse_propagator_type.c (100%)
 rename {io => src/lib/io}/utils_read_message.c (100%)
 rename {io => src/lib/io}/utils_write_checksum.c (100%)
 rename {io => src/lib/io}/utils_write_first_message.c (100%)
 rename {io => src/lib/io}/utils_write_header.c (100%)
 rename {io => src/lib/io}/utils_write_ildg_format.c (100%)
 rename {io => src/lib/io}/utils_write_inverter_info.c (100%)
 rename {io => src/lib/io}/utils_write_message.c (100%)
 rename {io => src/lib/io}/utils_write_xlf.c (100%)
 rename {io => src/lib/io}/utils_write_xlf_xml.c (100%)
 rename kahan_summation.h => src/lib/kahan_summation.h (100%)
 rename {linalg => src/lib/linalg}/Makefile.in (100%)
 rename {linalg => src/lib/linalg}/add.c (100%)
 rename {linalg => src/lib/linalg}/add.h (100%)
 rename {linalg => src/lib/linalg}/addto_32.c (100%)
 rename {linalg => src/lib/linalg}/addto_32.h (100%)
 rename {linalg => src/lib/linalg}/assign.c (100%)
 rename {linalg => src/lib/linalg}/assign.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_body.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_32.c (93%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_32.h (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_add_mul_r_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_diff_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_diff_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r_32.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_mul_r_32.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_32.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_32.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_and_square.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_add_r_and_square.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_ket_add_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/assign_mul_bra_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/assign_to_32.c (100%)
 rename {linalg => src/lib/linalg}/assign_to_32.h (100%)
 rename {linalg => src/lib/linalg}/blas.h (100%)
 rename {linalg => src/lib/linalg}/comp_decomp.c (100%)
 rename {linalg => src/lib/linalg}/comp_decomp.h (100%)
 rename {linalg => src/lib/linalg}/convert_eo_to_lexic.c (100%)
 rename {linalg => src/lib/linalg}/convert_eo_to_lexic.h (100%)
 rename {linalg => src/lib/linalg}/convert_even_to_lexic.c (100%)
 rename {linalg => src/lib/linalg}/convert_even_to_lexic.h (100%)
 rename {linalg => src/lib/linalg}/convert_odd_to_lexic.c (100%)
 rename {linalg => src/lib/linalg}/convert_odd_to_lexic.h (100%)
 rename {linalg => src/lib/linalg}/diff.c (100%)
 rename {linalg => src/lib/linalg}/diff.h (100%)
 rename {linalg => src/lib/linalg}/diff_32.c (100%)
 rename {linalg => src/lib/linalg}/diff_32.h (100%)
 rename {linalg => src/lib/linalg}/diff_and_square_norm.c (100%)
 rename {linalg => src/lib/linalg}/diff_and_square_norm.h (100%)
 rename {linalg => src/lib/linalg}/fortran.h (100%)
 rename {linalg => src/lib/linalg}/lapack.h (100%)
 rename {linalg => src/lib/linalg}/map_to_blas.h (100%)
 rename {linalg => src/lib/linalg}/mattimesvec.c (100%)
 rename {linalg => src/lib/linalg}/mattimesvec.h (100%)
 rename {linalg => src/lib/linalg}/mul.c (100%)
 rename {linalg => src/lib/linalg}/mul.h (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul.c (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul.h (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_add_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul.c (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul.h (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_diff_mul_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_diff_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_diff_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_gamma5.c (100%)
 rename {linalg => src/lib/linalg}/mul_gamma5.h (100%)
 rename {linalg => src/lib/linalg}/mul_r.c (100%)
 rename {linalg => src/lib/linalg}/mul_r.h (100%)
 rename {linalg => src/lib/linalg}/mul_r_32.c (100%)
 rename {linalg => src/lib/linalg}/mul_r_32.h (100%)
 rename {linalg => src/lib/linalg}/mul_r_gamma5.c (100%)
 rename {linalg => src/lib/linalg}/mul_r_gamma5.h (100%)
 rename {linalg => src/lib/linalg}/print_spinor.c (100%)
 rename {linalg => src/lib/linalg}/print_spinor.h (100%)
 rename {linalg => src/lib/linalg}/print_spinor_similar_components.c (100%)
 rename {linalg => src/lib/linalg}/print_spinor_similar_components.h (100%)
 rename {linalg => src/lib/linalg}/ratio.c (100%)
 rename {linalg => src/lib/linalg}/ratio.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_body.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_i.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_i.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r.h (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r_32.c (100%)
 rename {linalg => src/lib/linalg}/scalar_prod_r_32.h (100%)
 rename {linalg => src/lib/linalg}/set_even_to_zero.c (100%)
 rename {linalg => src/lib/linalg}/set_even_to_zero.h (100%)
 rename {linalg => src/lib/linalg}/square_and_minmax.c (100%)
 rename {linalg => src/lib/linalg}/square_and_minmax.h (100%)
 rename {linalg => src/lib/linalg}/square_and_prod_r.c (100%)
 rename {linalg => src/lib/linalg}/square_and_prod_r.h (100%)
 rename {linalg => src/lib/linalg}/square_norm.c (100%)
 rename {linalg => src/lib/linalg}/square_norm.h (100%)
 rename {linalg => src/lib/linalg}/square_norm_32.c (100%)
 rename {linalg => src/lib/linalg}/square_norm_32.h (100%)
 rename linalg_eo.h => src/lib/linalg_eo.h (100%)
 rename little_D.c => src/lib/little_D.c (100%)
 rename little_D.h => src/lib/little_D.h (100%)
 rename little_D_body.c => src/lib/little_D_body.c (100%)
 rename matrix_utils.c => src/lib/matrix_utils.c (100%)
 rename matrix_utils.h => src/lib/matrix_utils.h (100%)
 rename {meas => src/lib/meas}/Makefile.in (100%)
 rename {meas => src/lib/meas}/correlators.c (100%)
 rename {meas => src/lib/meas}/correlators.h (100%)
 rename {meas => src/lib/meas}/field_strength_types.h (100%)
 rename {meas => src/lib/meas}/gradient_flow.c (100%)
 rename {meas => src/lib/meas}/gradient_flow.h (100%)
 rename {meas => src/lib/meas}/measure_clover_field_strength_observables.c (100%)
 rename {meas => src/lib/meas}/measure_clover_field_strength_observables.h (100%)
 rename {meas => src/lib/meas}/measurements.c (100%)
 rename {meas => src/lib/meas}/measurements.h (100%)
 rename {meas => src/lib/meas}/oriented_plaquettes.c (100%)
 rename {meas => src/lib/meas}/oriented_plaquettes.h (100%)
 rename {meas => src/lib/meas}/pion_norm.c (100%)
 rename {meas => src/lib/meas}/pion_norm.h (100%)
 rename {meas => src/lib/meas}/polyakov_loop.c (100%)
 rename {meas => src/lib/meas}/polyakov_loop.h (100%)
 rename measure_gauge_action.c => src/lib/measure_gauge_action.c (100%)
 rename measure_gauge_action.h => src/lib/measure_gauge_action.h (100%)
 rename measure_rectangles.c => src/lib/measure_rectangles.c (100%)
 rename measure_rectangles.h => src/lib/measure_rectangles.h (100%)
 rename misc_types.h => src/lib/misc_types.h (99%)
 rename {monomial => src/lib/monomial}/Makefile.in (100%)
 rename {monomial => src/lib/monomial}/clover_trlog_monomial.c (100%)
 rename {monomial => src/lib/monomial}/clover_trlog_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverdet_monomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverdet_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_monomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_rwmonomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverdetratio_rwmonomial.h (100%)
 rename {monomial => src/lib/monomial}/clovernd_trlog_monomial.c (100%)
 rename {monomial => src/lib/monomial}/clovernd_trlog_monomial.h (100%)
 rename {monomial => src/lib/monomial}/cloverndpoly_monomial.c (100%)
 rename {monomial => src/lib/monomial}/cloverndpoly_monomial.h (100%)
 rename {monomial => src/lib/monomial}/det_monomial.c (100%)
 rename {monomial => src/lib/monomial}/det_monomial.h (100%)
 rename {monomial => src/lib/monomial}/detratio_monomial.c (100%)
 rename {monomial => src/lib/monomial}/detratio_monomial.h (100%)
 rename {monomial => src/lib/monomial}/gauge_monomial.c (100%)
 rename {monomial => src/lib/monomial}/gauge_monomial.h (100%)
 rename {monomial => src/lib/monomial}/moment_energy.c (100%)
 rename {monomial => src/lib/monomial}/moment_energy.h (100%)
 rename {monomial => src/lib/monomial}/monitor_forces.c (100%)
 rename {monomial => src/lib/monomial}/monitor_forces.h (100%)
 rename {monomial => src/lib/monomial}/monomial.c (100%)
 rename {monomial => src/lib/monomial}/monomial.h (100%)
 rename {monomial => src/lib/monomial}/nddetratio_monomial.c (100%)
 rename {monomial => src/lib/monomial}/nddetratio_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ndpoly_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ndpoly_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ndrat_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ndrat_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ndratcor_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ndratcor_monomial.h (100%)
 rename {monomial => src/lib/monomial}/poly_monomial.c (100%)
 rename {monomial => src/lib/monomial}/poly_monomial.h (100%)
 rename {monomial => src/lib/monomial}/rat_monomial.c (100%)
 rename {monomial => src/lib/monomial}/rat_monomial.h (100%)
 rename {monomial => src/lib/monomial}/ratcor_monomial.c (100%)
 rename {monomial => src/lib/monomial}/ratcor_monomial.h (100%)
 rename mpi_init.c => src/lib/mpi_init.c (100%)
 rename mpi_init.h => src/lib/mpi_init.h (100%)
 rename omp_accumulator.h => src/lib/omp_accumulator.h (100%)
 rename operator.c => src/lib/operator.c (100%)
 rename operator.h => src/lib/operator.h (100%)
 rename {operator => src/lib/operator}/Block_D_psi_body.c (100%)
 rename {operator => src/lib/operator}/D_psi.c (100%)
 rename {operator => src/lib/operator}/D_psi.h (100%)
 rename {operator => src/lib/operator}/D_psi_body.c (100%)
 rename {operator => src/lib/operator}/Dov_proj.c (100%)
 rename {operator => src/lib/operator}/Dov_proj.h (100%)
 rename {operator => src/lib/operator}/Dov_psi.c (100%)
 rename {operator => src/lib/operator}/Dov_psi.h (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix.h (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_32.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_32.h (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_32_nocom.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_nocom.c (100%)
 rename {operator => src/lib/operator}/Hopping_Matrix_nocom.h (100%)
 rename {operator => src/lib/operator}/Makefile.in (100%)
 rename {operator => src/lib/operator}/assign_mul_one_sw_pm_imu_inv_block_body.c (100%)
 rename {operator => src/lib/operator}/assign_mul_one_sw_pm_imu_site_lexic_body.c (100%)
 rename {operator => src/lib/operator}/clover_accumulate_deriv.c (100%)
 rename {operator => src/lib/operator}/clover_deriv.c (100%)
 rename {operator => src/lib/operator}/clover_det.c (100%)
 rename {operator => src/lib/operator}/clover_inline.h (100%)
 rename {operator => src/lib/operator}/clover_invert.c (100%)
 rename {operator => src/lib/operator}/clover_leaf.c (100%)
 rename {operator => src/lib/operator}/clover_leaf.h (100%)
 rename {operator => src/lib/operator}/clover_term.c (100%)
 rename {operator => src/lib/operator}/clovertm_operators.c (100%)
 rename {operator => src/lib/operator}/clovertm_operators.h (100%)
 rename {operator => src/lib/operator}/clovertm_operators_32.c (100%)
 rename {operator => src/lib/operator}/clovertm_operators_32.h (100%)
 rename {operator => src/lib/operator}/halfspinor_body.c (100%)
 rename {operator => src/lib/operator}/halfspinor_body_32.c (100%)
 rename {operator => src/lib/operator}/halfspinor_hopping.h (100%)
 rename {operator => src/lib/operator}/halfspinor_hopping_32.h (100%)
 rename {operator => src/lib/operator}/hopping_bg_dbl.c (100%)
 rename {operator => src/lib/operator}/hopping_body_dbl.c (100%)
 rename {operator => src/lib/operator}/hopping_sgl.c (100%)
 rename {operator => src/lib/operator}/mul_one_pm_imu_inv_body.c (100%)
 rename {operator => src/lib/operator}/mul_one_pm_imu_sub_mul_body.c (100%)
 rename {operator => src/lib/operator}/tm_operators.c (100%)
 rename {operator => src/lib/operator}/tm_operators.h (100%)
 rename {operator => src/lib/operator}/tm_operators_32.c (100%)
 rename {operator => src/lib/operator}/tm_operators_32.h (100%)
 rename {operator => src/lib/operator}/tm_operators_nd.c (100%)
 rename {operator => src/lib/operator}/tm_operators_nd.h (100%)
 rename {operator => src/lib/operator}/tm_operators_nd_32.c (100%)
 rename {operator => src/lib/operator}/tm_operators_nd_32.h (100%)
 rename {operator => src/lib/operator}/tm_sub_Hopping_Matrix.c (100%)
 rename {operator => src/lib/operator}/tm_sub_Hopping_Matrix.h (100%)
 rename {operator => src/lib/operator}/tm_times_Hopping_Matrix.c (100%)
 rename {operator => src/lib/operator}/tm_times_Hopping_Matrix.h (100%)
 rename operator_types.h => src/lib/operator_types.h (100%)
 rename overrelaxation.c => src/lib/overrelaxation.c (99%)
 rename overrelaxation.h => src/lib/overrelaxation.h (100%)
 rename parallel_io.h => src/lib/parallel_io.h (100%)
 rename phmc.c => src/lib/phmc.c (100%)
 rename phmc.h => src/lib/phmc.h (100%)
 rename prepare_source.c => src/lib/prepare_source.c (100%)
 rename prepare_source.h => src/lib/prepare_source.h (100%)
 rename {profiling => src/lib/profiling}/hmc/Readme.md (100%)
 rename {profiling => src/lib/profiling}/hmc/example_profile.pdf (100%)
 rename {profiling => src/lib/profiling}/hmc/profile.Rmd (100%)
 rename {profiling => src/lib/profiling}/hmc/timing.R (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/.gitignore (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/README.md (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/logs/example_log.out (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/make_profile.R (100%)
 rename {profiling => src/lib/profiling}/hmc_mk2/profile.Rmd (100%)
 create mode 100644 src/lib/qphix/qphix_base_classes.hpp
 create mode 100644 src/lib/qphix/qphix_interface.cpp
 create mode 100644 src/lib/qphix/qphix_interface.hpp
 create mode 100644 src/lib/qphix/qphix_interface_utils.hpp
 rename qphix_interface.h => src/lib/qphix_interface.h (100%)
 rename qphix_types.h => src/lib/qphix_types.h (100%)
 rename qphix_veclen.h => src/lib/qphix_veclen.h (100%)
 rename quda_dummy_types.h => src/lib/quda_dummy_types.h (100%)
 create mode 100644 src/lib/quda_gauge_paths.inc
 rename quda_interface.c => src/lib/quda_interface.c (100%)
 rename quda_interface.h => src/lib/quda_interface.h (100%)
 rename quda_types.h => src/lib/quda_types.h (100%)
 rename ranlxd.c => src/lib/ranlxd.c (100%)
 rename ranlxd.h => src/lib/ranlxd.h (100%)
 rename ranlxs.c => src/lib/ranlxs.c (100%)
 rename ranlxs.h => src/lib/ranlxs.h (100%)
 rename {rational => src/lib/rational}/Makefile.in (100%)
 rename {rational => src/lib/rational}/elliptic.c (100%)
 rename {rational => src/lib/rational}/elliptic.h (100%)
 rename {rational => src/lib/rational}/rational.c (100%)
 rename {rational => src/lib/rational}/rational.h (100%)
 rename {rational => src/lib/rational}/zolotarev.c (100%)
 rename {rational => src/lib/rational}/zolotarev.h (100%)
 rename read_input.h => src/lib/read_input.h (100%)
 rename read_input.l => src/lib/read_input.l (100%)
 rename reweighting_factor.c => src/lib/reweighting_factor.c (100%)
 rename reweighting_factor.h => src/lib/reweighting_factor.h (100%)
 rename reweighting_factor_nd.c => src/lib/reweighting_factor_nd.c (100%)
 rename reweighting_factor_nd.h => src/lib/reweighting_factor_nd.h (100%)
 rename rnd_gauge_trafo.c => src/lib/rnd_gauge_trafo.c (100%)
 rename rnd_gauge_trafo.h => src/lib/rnd_gauge_trafo.h (100%)
 rename sighandler.c => src/lib/sighandler.c (100%)
 rename sighandler.h => src/lib/sighandler.h (100%)
 rename {smearing => src/lib/smearing}/Makefile.in (100%)
 rename {smearing => src/lib/smearing}/ape.h (100%)
 rename {smearing => src/lib/smearing}/ape.ih (100%)
 rename {smearing => src/lib/smearing}/ape_ape_smear.c (100%)
 rename {smearing => src/lib/smearing}/hex.h (100%)
 rename {smearing => src/lib/smearing}/hex.ih (100%)
 rename {smearing => src/lib/smearing}/hex_hex_smear.c (100%)
 rename {smearing => src/lib/smearing}/hex_stout_exclude_none.c (100%)
 rename {smearing => src/lib/smearing}/hex_stout_exclude_one.c (100%)
 rename {smearing => src/lib/smearing}/hex_stout_exclude_two.c (100%)
 rename {smearing => src/lib/smearing}/hyp.h (100%)
 rename {smearing => src/lib/smearing}/hyp.ih (100%)
 rename {smearing => src/lib/smearing}/hyp_APE_project_exclude_none.c (100%)
 rename {smearing => src/lib/smearing}/hyp_APE_project_exclude_one.c (100%)
 rename {smearing => src/lib/smearing}/hyp_APE_project_exclude_two.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_smear.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_staples_exclude_none.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_staples_exclude_one.c (100%)
 rename {smearing => src/lib/smearing}/hyp_hyp_staples_exclude_two.c (100%)
 rename {smearing => src/lib/smearing}/stout.h (100%)
 rename {smearing => src/lib/smearing}/stout.ih (100%)
 rename {smearing => src/lib/smearing}/stout_stout_smear.c (100%)
 rename {smearing => src/lib/smearing}/uils_print_config_to_screen.c (100%)
 rename {smearing => src/lib/smearing}/utils.h (100%)
 rename {smearing => src/lib/smearing}/utils.ih (100%)
 rename {smearing => src/lib/smearing}/utils_generic_staples.c (100%)
 rename {smearing => src/lib/smearing}/utils_print_config_to_screen.c (100%)
 rename {smearing => src/lib/smearing}/utils_print_su3.c (100%)
 rename {smearing => src/lib/smearing}/utils_project_antiherm.c (100%)
 rename {smearing => src/lib/smearing}/utils_project_herm.c (100%)
 rename {smearing => src/lib/smearing}/utils_reunitarize.c (100%)
 rename {smearing => src/lib/smearing}/utils_reunitarize_MILC.c (88%)
 rename {solver => src/lib/solver}/M_plus_block_psi_body.c (100%)
 rename {solver => src/lib/solver}/Makefile.in (100%)
 rename {solver => src/lib/solver}/Msap.c (100%)
 rename {solver => src/lib/solver}/Msap.h (100%)
 rename {solver => src/lib/solver}/bicg_complex.c (100%)
 rename {solver => src/lib/solver}/bicg_complex.h (100%)
 rename {solver => src/lib/solver}/bicgstab2.c (100%)
 rename {solver => src/lib/solver}/bicgstab2.h (100%)
 rename {solver => src/lib/solver}/bicgstab_complex.c (100%)
 rename {solver => src/lib/solver}/bicgstab_complex.h (100%)
 rename {solver => src/lib/solver}/bicgstab_complex_bi.c (100%)
 rename {solver => src/lib/solver}/bicgstab_complex_bi.h (100%)
 rename {solver => src/lib/solver}/bicgstabell.c (100%)
 rename {solver => src/lib/solver}/bicgstabell.h (100%)
 rename {solver => src/lib/solver}/cg_her.c (100%)
 rename {solver => src/lib/solver}/cg_her.h (100%)
 rename {solver => src/lib/solver}/cg_her_bi.c (100%)
 rename {solver => src/lib/solver}/cg_her_bi.h (100%)
 rename {solver => src/lib/solver}/cg_her_nd.c (100%)
 rename {solver => src/lib/solver}/cg_her_nd.h (100%)
 rename {solver => src/lib/solver}/cg_mms_tm.c (100%)
 rename {solver => src/lib/solver}/cg_mms_tm.h (100%)
 rename {solver => src/lib/solver}/cg_mms_tm_nd.c (100%)
 rename {solver => src/lib/solver}/cg_mms_tm_nd.h (100%)
 rename {solver => src/lib/solver}/cgne4complex.c (100%)
 rename {solver => src/lib/solver}/cgne4complex.h (100%)
 rename {solver => src/lib/solver}/cgs_real.c (100%)
 rename {solver => src/lib/solver}/cgs_real.h (100%)
 rename {solver => src/lib/solver}/chrono_guess.c (100%)
 rename {solver => src/lib/solver}/chrono_guess.h (100%)
 rename {solver => src/lib/solver}/cr.c (100%)
 rename {solver => src/lib/solver}/cr.h (100%)
 rename {solver => src/lib/solver}/dfl_projector.c (100%)
 rename {solver => src/lib/solver}/dfl_projector.h (100%)
 rename {solver => src/lib/solver}/diagonalise_general_matrix.c (100%)
 rename {solver => src/lib/solver}/diagonalise_general_matrix.h (100%)
 rename {solver => src/lib/solver}/dirac_operator_eigenvectors.c (100%)
 rename {solver => src/lib/solver}/dirac_operator_eigenvectors.h (100%)
 rename {solver => src/lib/solver}/eigcg.c (100%)
 rename {solver => src/lib/solver}/eigcg.h (100%)
 rename {solver => src/lib/solver}/eigenvalues.c (100%)
 rename {solver => src/lib/solver}/eigenvalues.h (100%)
 rename {solver => src/lib/solver}/eigenvalues_bi.c (100%)
 rename {solver => src/lib/solver}/eigenvalues_bi.h (100%)
 rename {solver => src/lib/solver}/fgmres.c (100%)
 rename {solver => src/lib/solver}/fgmres.h (100%)
 rename {solver => src/lib/solver}/fgmres4complex.c (100%)
 rename {solver => src/lib/solver}/fgmres4complex.h (100%)
 rename {solver => src/lib/solver}/fgmres4complex_body.c (100%)
 rename {solver => src/lib/solver}/gcr.c (100%)
 rename {solver => src/lib/solver}/gcr.h (100%)
 rename {solver => src/lib/solver}/gcr4complex.c (100%)
 rename {solver => src/lib/solver}/gcr4complex.h (100%)
 rename {solver => src/lib/solver}/gcr4complex_body.c (100%)
 rename {solver => src/lib/solver}/gcr4complex_body.h (100%)
 rename {solver => src/lib/solver}/generate_dfl_subspace.c (100%)
 rename {solver => src/lib/solver}/generate_dfl_subspace.h (100%)
 rename {solver => src/lib/solver}/gmres.c (100%)
 rename {solver => src/lib/solver}/gmres.h (100%)
 rename {solver => src/lib/solver}/gmres_dr.c (100%)
 rename {solver => src/lib/solver}/gmres_dr.h (100%)
 rename {solver => src/lib/solver}/gmres_precon.c (100%)
 rename {solver => src/lib/solver}/gmres_precon.h (100%)
 rename {solver => src/lib/solver}/gram-schmidt.c (100%)
 rename {solver => src/lib/solver}/gram-schmidt.h (100%)
 rename {solver => src/lib/solver}/incr_eigcg.c (100%)
 rename {solver => src/lib/solver}/incr_eigcg.h (100%)
 rename {solver => src/lib/solver}/index_jd.c (100%)
 rename {solver => src/lib/solver}/index_jd.h (100%)
 rename {solver => src/lib/solver}/init_guess.c (100%)
 rename {solver => src/lib/solver}/init_guess.h (100%)
 rename {solver => src/lib/solver}/jdher.c (100%)
 rename {solver => src/lib/solver}/jdher.h (100%)
 rename {solver => src/lib/solver}/jdher_bi.c (100%)
 rename {solver => src/lib/solver}/jdher_bi.h (100%)
 rename {solver => src/lib/solver}/little_mg_precon_body.c (100%)
 rename {solver => src/lib/solver}/little_project_eo_body.c (100%)
 rename {solver => src/lib/solver}/lu_solve.c (100%)
 rename {solver => src/lib/solver}/lu_solve.h (100%)
 rename {solver => src/lib/solver}/matrix_mult_typedef.h (100%)
 rename {solver => src/lib/solver}/matrix_mult_typedef_bi.h (100%)
 rename {solver => src/lib/solver}/matrix_mult_typedef_nd.h (100%)
 rename {solver => src/lib/solver}/mcr.c (100%)
 rename {solver => src/lib/solver}/mcr.h (100%)
 rename {solver => src/lib/solver}/mcr4complex.c (100%)
 rename {solver => src/lib/solver}/mcr4complex.h (100%)
 rename {solver => src/lib/solver}/mixed_cg_her.c (100%)
 rename {solver => src/lib/solver}/mixed_cg_her.h (100%)
 rename {solver => src/lib/solver}/mixed_cg_mms_tm_nd.c (100%)
 rename {solver => src/lib/solver}/mixed_cg_mms_tm_nd.h (100%)
 rename {solver => src/lib/solver}/monomial_solve.c (100%)
 rename {solver => src/lib/solver}/monomial_solve.h (100%)
 rename {solver => src/lib/solver}/mr.c (100%)
 rename {solver => src/lib/solver}/mr.h (100%)
 rename {solver => src/lib/solver}/mr4complex.c (100%)
 rename {solver => src/lib/solver}/mr4complex.h (100%)
 rename {solver => src/lib/solver}/mrblk_body.c (100%)
 rename {solver => src/lib/solver}/ortho.c (100%)
 rename {solver => src/lib/solver}/ortho.h (100%)
 rename {solver => src/lib/solver}/pcg_her.c (100%)
 rename {solver => src/lib/solver}/pcg_her.h (100%)
 rename {solver => src/lib/solver}/poly_precon.c (100%)
 rename {solver => src/lib/solver}/poly_precon.h (100%)
 rename {solver => src/lib/solver}/quicksort.c (100%)
 rename {solver => src/lib/solver}/quicksort.h (100%)
 rename {solver => src/lib/solver}/restart_X.c (100%)
 rename {solver => src/lib/solver}/restart_X.h (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her.c (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her.h (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her_nd.c (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_her_nd.h (100%)
 rename {solver => src/lib/solver}/rg_mixed_cg_typedef.h (100%)
 rename {solver => src/lib/solver}/solver.h (100%)
 rename {solver => src/lib/solver}/solver_field.c (100%)
 rename {solver => src/lib/solver}/solver_field.h (100%)
 rename {solver => src/lib/solver}/solver_params.h (100%)
 rename {solver => src/lib/solver}/solver_types.c (100%)
 rename {solver => src/lib/solver}/solver_types.h (100%)
 rename {solver => src/lib/solver}/sub_low_ev.c (100%)
 rename {solver => src/lib/solver}/sub_low_ev.h (100%)
 rename {solver => src/lib/solver}/sumr.c (100%)
 rename {solver => src/lib/solver}/sumr.h (100%)
 rename source_generation.c => src/lib/source_generation.c (100%)
 rename source_generation.h => src/lib/source_generation.h (100%)
 rename spinor_fft.c => src/lib/spinor_fft.c (100%)
 rename spinor_fft.h => src/lib/spinor_fft.h (100%)
 rename start.c => src/lib/start.c (100%)
 rename start.h => src/lib/start.h (100%)
 rename struct_accessors.h => src/lib/struct_accessors.h (100%)
 rename su3.h => src/lib/su3.h (100%)
 rename su3adj.h => src/lib/su3adj.h (100%)
 rename su3spinor.h => src/lib/su3spinor.h (100%)
 rename tensors.h => src/lib/tensors.h (100%)
 rename {test => src/lib/test}/Makefile (100%)
 rename {test => src/lib/test}/check_geometry.c (100%)
 rename {test => src/lib/test}/check_geometry.h (100%)
 rename {test => src/lib/test}/check_nan.c (100%)
 rename {test => src/lib/test}/check_nan.h (100%)
 rename {test => src/lib/test}/check_overlap.c (100%)
 rename {test => src/lib/test}/check_xchange.c (100%)
 rename {test => src/lib/test}/hopping_test.README (100%)
 rename {test => src/lib/test}/hopping_test.input.compare (100%)
 rename {test => src/lib/test}/hopping_test.input.new (100%)
 rename {test => src/lib/test}/hopping_test.input.start (100%)
 rename {test => src/lib/test}/hopping_test_generate_script (100%)
 rename {test => src/lib/test}/hopping_test_qscript (100%)
 rename {test => src/lib/test}/measure_rectangles.debug.c (100%)
 rename {test => src/lib/test}/overlaptests.c (100%)
 rename {test => src/lib/test}/overlaptests.h (100%)
 rename {test => src/lib/test}/qdran64.h (100%)
 rename tm_debug_printf.c => src/lib/tm_debug_printf.c (100%)
 rename tm_debug_printf.h => src/lib/tm_debug_printf.h (100%)
 rename update_backward_gauge.c => src/lib/update_backward_gauge.c (100%)
 rename update_backward_gauge.h => src/lib/update_backward_gauge.h (100%)
 rename update_gauge.c => src/lib/update_gauge.c (100%)
 rename update_gauge.h => src/lib/update_gauge.h (100%)
 rename update_momenta.c => src/lib/update_momenta.c (100%)
 rename update_momenta.h => src/lib/update_momenta.h (100%)
 rename update_momenta_fg.c => src/lib/update_momenta_fg.c (100%)
 rename update_momenta_fg.h => src/lib/update_momenta_fg.h (100%)
 rename update_tm.c => src/lib/update_tm.c (100%)
 rename update_tm.h => src/lib/update_tm.h (100%)
 rename {util => src/lib/util}/io.c (100%)
 rename {util => src/lib/util}/io.h (100%)
 rename {util => src/lib/util}/laguer/Makefile (100%)
 rename {util => src/lib/util}/laguer/chebyRoot.C (100%)
 rename {util => src/lib/util}/laguer/chebyRoot.H (100%)
 rename {util => src/lib/util}/laguer/laguer.c (100%)
 rename {util => src/lib/util}/laguer/quadroptRoot.C (100%)
 rename {util => src/lib/util}/oox/Makefile (100%)
 rename {util => src/lib/util}/oox/oox.c (100%)
 rename {util => src/lib/util}/oox/oox_gawrapper.cxx (100%)
 rename {util => src/lib/util}/oox/oox_gawrapper.h (100%)
 rename {util => src/lib/util}/swapendian.c (100%)
 rename {util => src/lib/util}/tmlqcd-indent (100%)
 rename {wrapper => src/lib/wrapper}/Makefile.in (100%)
 rename {wrapper => src/lib/wrapper}/lib_wrapper.c (100%)
 rename {xchange => src/lib/xchange}/Makefile.in (100%)
 rename {xchange => src/lib/xchange}/little_field_gather.c (100%)
 rename {xchange => src/lib/xchange}/little_field_gather.h (100%)
 rename {xchange => src/lib/xchange}/little_field_gather_body.c (100%)
 rename {xchange => src/lib/xchange}/xchange.h (100%)
 rename {xchange => src/lib/xchange}/xchange_2fields.c (100%)
 rename {xchange => src/lib/xchange}/xchange_2fields.h (100%)
 rename {xchange => src/lib/xchange}/xchange_deri.c (100%)
 rename {xchange => src/lib/xchange}/xchange_deri.h (100%)
 rename {xchange => src/lib/xchange}/xchange_field.c (100%)
 rename {xchange => src/lib/xchange}/xchange_field.h (100%)
 rename {xchange => src/lib/xchange}/xchange_gauge.c (100%)
 rename {xchange => src/lib/xchange}/xchange_gauge.h (100%)
 rename {xchange => src/lib/xchange}/xchange_halffield.c (100%)
 rename {xchange => src/lib/xchange}/xchange_halffield.h (100%)
 rename {xchange => src/lib/xchange}/xchange_lexicfield.c (100%)
 rename {xchange => src/lib/xchange}/xchange_lexicfield.h (100%)

diff --git a/.gitignore b/.gitignore
index 79e2bc1b2..0a2e35fba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,7 +18,6 @@ tags*
 hmc_tm
 invert
 offline_measurement
-lib/
 benchmark
 *.data
 *.para
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..9dc9f71f2
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,390 @@
+cmake_minimum_required(VERSION 3.24)
+
+project(
+  tmlqcd
+  VERSION 6.0.0
+  DESCRIPTION "tmlQCD"
+  HOMEPAGE_URL "http://www.itkp.uni-bonn.de/~urbach/software.html"
+  LANGUAGES C CXX)
+
+# let find_package() pick up the Find*.cmake modules shipped in cmake/
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# =================================================================================================
+# REQUIRE OUT-OF-SOURCE BUILDS: refuse any build tree that holds a CMakeLists.txt
+file(TO_CMAKE_PATH "${PROJECT_BINARY_DIR}/CMakeLists.txt" LOC_PATH)
+if(EXISTS "${LOC_PATH}")
+  message(FATAL_ERROR
+    "You cannot build in a source directory (or any directory with a CMakeLists.txt file). Please make a build subdirectory."
+  )
+  # not reached: FATAL_ERROR aborts the configure step
+endif()
+
+# =================================================================================================
+# CMAKE HELPER MODULES, POLICIES AND DEFAULT LANGUAGE STANDARDS
+include(CMakeDependentOption)
+include(CheckSymbolExists)
+include(CheckLibraryExists)
+include(CheckFunctionExists)
+include(GNUInstallDirs)
+
+cmake_policy(SET CMP0048 NEW) # project() manages the VERSION variables
+
+if(POLICY CMP0144) # find_package() also honours upper-case <PACKAGENAME>_ROOT variables
+  cmake_policy(SET CMP0144 NEW)
+endif()
+
+if(NOT DEFINED CMAKE_CUDA_STANDARD) # each standard below is only a default; a user-provided value wins
+  set(CMAKE_CUDA_STANDARD 14)
+  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT DEFINED CMAKE_C_STANDARD)
+  set(CMAKE_C_STANDARD 11)
+  set(CMAKE_C_STANDARD_REQUIRED ON)
+endif()
+
+if(NOT DEFINED CMAKE_HIP_STANDARD)
+  set(CMAKE_HIP_STANDARD 14)
+  set(CMAKE_HIP_STANDARD_REQUIRED ON)
+endif()
+
+find_package(PkgConfig) # provides pkg_search_module(), used for the optional FFTW lookup
+
+# ##############################################################################
+# Output directory for static libraries, inside this project's build tree
+# ##############################################################################
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY
+    ${PROJECT_BINARY_DIR}/lib
+    CACHE PATH "Single output directory for building all libraries.")
+
+# Search for ROCm in common locations: each of these environment variables that
+# is set adds its value to CMAKE_PREFIX_PATH; ROCM_PATH is cached from the first
+# one found (set(... CACHE) without FORCE keeps an already existing entry).
+foreach(__var ROCM_ROOT CRAY_ROCM_ROOT ORNL_ROCM_ROOT CRAY_ROCM_PREFIX
+              ROCM_PREFIX CRAY_ROCM_DIR)
+  if(DEFINED ENV{${__var}})
+    list(APPEND CMAKE_PREFIX_PATH "$ENV{${__var}}")
+    set(ROCM_PATH "$ENV{${__var}}" CACHE PATH "Path to ROCm installation")
+  endif()
+endforeach()
+
+option(CMAKE_POSITION_INDEPENDENT_CODE "Enable position independent code" ON)
+option(BUILD_SHARED_LIBS "Enable shared library" ON)
+option(TM_USE_FFTW "Enable fftw support" OFF)
+option(TM_USE_HDF5 "Enable HDF5 support" OFF)
+option(TM_USE_MPI "Enable MPI support" OFF)
+option(TM_USE_CUDA "Enable CUDA support" OFF)
+option(TM_USE_HIP "Enable HIP support" OFF)
+option(TM_USE_DDALPHAAMG "Enable DDalphaAMG support" OFF)
+option(TM_USE_OPENMP "Enable openMP" ON)
+option(TM_FIXED_VOLUME "fix volume at compile time" OFF)
+# array alignment in bytes; the accepted values are listed in the STRINGS property
+set(TM_ENABLE_ALIGNMENT "auto" CACHE STRING
+    "Automatically or explicitly align arrays to byte number. auto, none, 16, 32, 64")
+
+set_property(
+  CACHE TM_ENABLE_ALIGNMENT
+  PROPERTY STRINGS
+  "auto"
+  "none"
+  "16"
+  "32"
+  "64")
+
+option(TM_BGL_DRAM "use BGL dram window (BGL only!)" ON)
+option(TM_USE_OPTIMIZATION "enable optimisation" ON)
+option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
+option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
+option(TM_USE_TSPLITPAR "Enable timeslice-splitted communications" ON)
+option(TM_USE_QPHIX "enable QPhiX" OFF)
+option(TM_USE_SHMEM "Use shmem API" OFF)
+option(TM_USE_QUDA "Enable QUDA support" OFF)
+option(TM_USE_GPROF "Enable gprof profiler" OFF)
+option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
+
+# MPI dependent options
+cmake_dependent_option(
+  TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]"
+  OFF "TM_USE_MPI" OFF)
+cmake_dependent_option(
+  TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gauge" ON
+  "TM_USE_MPI" OFF)
+
+# TM_MPI_DIMENSION holds 1-4, X, XY or XYZ; option() would coerce such values
+# to a boolean, so it is a STRING cache entry rather than a dependent option.
+set(TM_MPI_DIMENSION "4" CACHE STRING
+    "Use n dimensional parallelisation: 1, 2, 3, 4, X, XY, XYZ [default=4]")
+
+# HIP dependent options
+cmake_dependent_option(TM_USE_CUDA_HIP "Enable CUDA support in HIP" OFF
+                       "TM_USE_HIP" OFF)
+
+# lemon (parallel I/O) requires MPI, so it is forced OFF when MPI is disabled
+cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF
+                       "TM_USE_MPI" OFF)
+
+# GPU dependent options
+cmake_dependent_option(TM_USE_QUDA_EXPERIMENTAL "Enable experimental QUDA features" ON
+                       "TM_USE_QUDA" OFF)
+cmake_dependent_option(
+  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA"
+  ON "TM_USE_QUDA" OFF)
+
+cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
+                       "TM_USE_CUDA" OFF)
+
+# mandatory dependencies: BLAS, LAPACK, flex and bison
+find_package(BLAS REQUIRED)
+# LAPACK is REQUIRED, so HAVE_LAPACK is set unconditionally right after the lookup
+find_package(LAPACK REQUIRED)
+set(HAVE_LAPACK ON)
+find_package(FLEX REQUIRED)
+# TODO: do we need bison? Only a flex input (read_input.l) shows up in this patch.
+find_package(BISON REQUIRED)
+
+set(PACKAGE_NAME ${PROJECT_DESCRIPTION}) # PACKAGE_* mirror the autotools names used by the config header template
+set(PACKAGE_VERSION ${PROJECT_VERSION})
+set(PACKAGE_TARNAME "tmlqcd")
+set(PACKAGE_BUGREPORT "curbach@gmx.de")
+set(PACKAGE_STRING "${PROJECT_DESCRIPTION} ${PROJECT_VERSION}")
+
+unset(TM_USE_MPI) # unset() without CACHE only drops normal variables; cache entries made by option() remain readable
+unset(TM_USE_OMP)
+unset(HAVE_LIBLEMON)
+unset(HAVE_LIBLIME)
+unset(FIXEDVOLUME)
+unset(_PERSISTENT)
+unset(_NON_BLOCKING)
+unset(HAVE_LIBQUDA)
+unset(TM_USE_QUDA)
+unset(TM_QUDA_EXPERIMENTAL)
+unset(TM_QUDA_FERMIONIC_FORCES)
+unset(DDalphaAMG)
+unset(TM_USE_QPHIX)
+unset(QPHIX_SOALEN)
+unset(_NEW_GEOMETRY)
+unset(_NON_BLOCKING) # NOTE(review): duplicate of the unset a few lines up
+unset(_USE_SHMEM)
+unset(_USE_HALFSPINOR)
+set(ALIGN " ") # alignment defaults; the TM_ENABLE_ALIGNMENT switch below overrides them
+set(ALIGN_BASE "0")
+set(ALIGN_BASE32 "0")
+set(ALIGN32 " ")
+
+message(STATUS "Array alignment: ${TM_ENABLE_ALIGNMENT}")
+if (TM_ENABLE_ALIGNMENT MATCHES "^(auto|none)$") # no alignment attribute is emitted
+  set(ALIGN_BASE "0x00")
+  set(ALIGN " ")
+  set(ALIGN_BASE32 "0x00")
+  set(ALIGN32 " ")
+elseif (TM_ENABLE_ALIGNMENT EQUAL 16) # ALIGN_BASE is the address mask: alignment - 1
+  set(ALIGN_BASE "0x0F")
+  set(ALIGN "__attribute__ ((aligned (16)))")
+  set(ALIGN_BASE32 "0x0F")
+  set(ALIGN32 "__attribute__ ((aligned (16)))")
+elseif (TM_ENABLE_ALIGNMENT EQUAL 32)
+  set(ALIGN_BASE "0x1F")
+  set(ALIGN "__attribute__ ((aligned (32)))")
+  set(ALIGN_BASE32 "0x1F")
+  set(ALIGN32 "__attribute__ ((aligned (32)))")
+elseif (TM_ENABLE_ALIGNMENT EQUAL 64)
+  set(ALIGN_BASE "0x3F")
+  set(ALIGN "__attribute__ ((aligned (64)))")
+  set(ALIGN_BASE32 "0x3F")
+  set(ALIGN32 "__attribute__ ((aligned (64)))")
+else()
+  message(FATAL_ERROR "Unusable value for array alignment. Allowed values are: auto, none, 16, 32, 64")
+endif()
+
+if(TM_USE_HALFSPINOR) # map user-facing TM_* options onto legacy variable names (presumably read by the config header templates; confirm)
+  set(_USE_HALFSPINOR ON)
+endif()
+
+if(TM_FIXED_VOLUME)
+  set(FIXEDVOLUME ON)
+endif()
+
+if(TM_PERSISTENT_MPI)
+  set(_PERSISTENT ON)
+endif()
+
+if(TM_USE_MPI)
+  find_package(MPI REQUIRED)
+  set(TM_USE_MPI ON) # re-creates the normal variable that was unset earlier
+  if(TM_NONBLOCKING_MPI)
+    set(_NON_BLOCKING ON)
+  endif()
+endif()
+
+if(TM_USE_OPENMP)
+  find_package(OpenMP REQUIRED COMPONENTS C CXX)
+  set(TM_USE_OMP ON) # the config header template tests TM_USE_OMP, not TM_USE_OPENMP
+endif()
+
+if(TM_USE_HDF5)
+  find_package(HDF5 REQUIRED COMPONENTS C)
+endif()
+
+if(TM_USE_LEMON)
+  find_package(Lemon REQUIRED) # resolved by cmake/FindLemon.cmake
+  set(HAVE_LIBLEMON ON)
+endif()
+
+find_package(CLime REQUIRED) # c-lime is looked up unconditionally (cmake/FindCLime.cmake)
+set(HAVE_LIBLIME ON)
+
+if(TM_USE_QUDA)
+  find_package(QUDA REQUIRED CONFIG) # CONFIG keyword: use QUDA's own package configuration file
+  set(HAVE_LIBQUDA ON)
+  if(TM_USE_QUDA_EXPERIMENTAL)
+    set(TM_QUDA_EXPERIMENTAL ON)
+  endif()
+  if(TM_QUDA_FERMIONIC_FORCES)
+    set(TM_QUDA_FERMIONIC_FORCES ON)
+  endif()
+  if(TM_USE_CUDA OR TM_USE_HIP)
+    set(TM_USE_QUDA ON)
+  endif()
+endif()
+
+if(TM_USE_CUDA AND TM_USE_HIP) # abort the configure step: only one GPU backend may be enabled
+  message(
+    FATAL_ERROR
+    "HIP and CUDA are mutually exclusive. Please choose one GPU support only")
+endif()
+
+if(TM_USE_CUDA OR QUDA_TARGET_CUDA) # QUDA_TARGET_CUDA is not set in this file; presumably exported by the QUDA package config (confirm)
+  enable_language(CUDA)
+  if(TM_USE_NVHPC) # take the CUDA libraries from the NVIDIA HPC SDK instead of a plain CUDA toolkit
+    find_package(NVHPC REQUIRED COMPONENTS CUDA MATH HOSTUTILS NCCL)
+  else()
+    find_package(CUDAToolkit REQUIRED)
+  endif()
+endif()
+
+message("QUDA_TARGET: ${QUDA_TARGET_CUDA}")
+if(TM_USE_HIP OR QUDA_TARGET_HIP)
+  enable_language(HIP) # language names are case-sensitive
+
+  # we may want to use hip-cuda for development or debugging purposes especially
+  # if AMD GPU access is not possible. So allow it
+  if(TM_USE_CUDA_HIP)
+    find_package(CUDA)
+  endif()
+
+  if(CMAKE_HIP_PLATFORM MATCHES "amd") # select the HIP platform macro for the compile definitions
+    set(TM_GPU_PLATFORM_DFLAGS "__HIP_PLATFORM_AMD__")
+  else()
+    set(TM_GPU_PLATFORM_DFLAGS "__HIP_PLATFORM_NVIDIA__")
+  endif()
+endif()
+
+if(TM_USE_SHMEM)
+  set(_USE_SHMEM ON)
+endif()
+
+if(TM_USE_QPHIX) # must match the option(TM_USE_QPHIX ...) declaration
+  find_package(QPhiX REQUIRED)
+  if(NOT TARGET tmlqcd::qphix) # wrap the QPHIX_* result variables in an imported target
+    add_library(tmlqcd::qphix INTERFACE IMPORTED)
+    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
+      "${QPHIX_LIBRARIES}")
+    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+      "${QPHIX_INCLUDE_DIRS}")
+  endif()
+  set(TM_USE_QPHIX ON)
+endif()
+
+# check for fftw3 (rely on pkgconfig); a requested but missing fftw3 is an error
+if(TM_USE_FFTW)
+  pkg_search_module(tmlqcd_fftw3 REQUIRED IMPORTED_TARGET GLOBAL fftw3)
+  add_library(tmlqcd::fftw3 ALIAS PkgConfig::tmlqcd_fftw3)
+  # HAVE_FFTW is the macro the config header template defines for FFTW support
+  set(HAVE_FFTW ON)
+endif()
+
+# gprof profiling: instrument the C sources and pass -pg to the linker as well
+if (TM_USE_GPROF)
+  set(PROFILE_FLAGS "-pg;-g")
+  if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|powerpc64)$")
+    list(APPEND PROFILE_FLAGS "-qfullpath")
+  endif()
+  add_compile_options("$<$<COMPILE_LANGUAGE:C>:${PROFILE_FLAGS}>")
+  add_link_options(-pg)
+endif()
+
+if (TM_ENABLE_WARNINGS) # -Wall is only added when the C / C++ compiler id is GNU
+  add_compile_options(
+    $<$<COMPILE_LANG_AND_ID:C,GNU>:-Wall>
+    $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
+endif()
+
+# probe for clock_gettime (via time.h, and separately inside librt) and for fseeko
+check_symbol_exists(clock_gettime "time.h" HAVE_CLOCK_GETTIME)
+check_library_exists(rt clock_gettime "" HAVE_CLOCK_GETTIME_IN_RT)
+check_function_exists(fseeko HAVE_FSEEKO)
+
+# set the parallelisation macro from TM_MPI_DIMENSION (string comparison, since
+# the value may be X, XY or XYZ); unknown values fall back to XYZT
+if(TM_USE_MPI)
+  if(TM_MPI_DIMENSION STREQUAL "1")
+    # T parallelisation
+    set(PARALLELT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "2")
+    # XT parallelisation
+    set(PARALLELXT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "3")
+    # XYT parallelisation
+    set(PARALLELXYT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "4")
+    # XYZT parallelisation
+    set(PARALLELXYZT ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "X")
+    set(PARALLELX ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "XY")
+    set(PARALLELXY ON)
+  elseif(TM_MPI_DIMENSION STREQUAL "XYZ")
+    set(PARALLELXYZ ON)
+  else()
+    set(PARALLELXYZT ON)
+  endif()
+endif()
+
+# generate the config headers in the build tree (the template keeps the autotools macro names)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
+               "${PROJECT_BINARY_DIR}/tmlqcd_config_internal.h" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/fixed_volume.h.in"
+               "${PROJECT_BINARY_DIR}/fixed_volume.h" @ONLY)
+# check if git command exists (GIT_EXE becomes GIT_EXE-NOTFOUND otherwise)
+find_program(GIT_EXE NAMES git)
+
+# generate version header
+string(TIMESTAMP TM_TIMESTAMP "%Y-%m-%d %H:%M:%S")
+if(GIT_EXE AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
+  execute_process(
+    COMMAND "${GIT_EXE}" rev-parse HEAD
+    OUTPUT_VARIABLE TM_SHA
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  execute_process(
+    COMMAND "${GIT_EXE}" describe --all
+    OUTPUT_VARIABLE TM_GIT_BRANCH
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  message(STATUS "git hash ${TM_SHA}")
+else()
+  # not a git checkout (or no git): point at the matching release tag instead
+  set(TM_SHA
+      "https://github.com/etmc/tmLQCD/releases/tag/rel-${PROJECT_VERSION_MAJOR}-${PROJECT_VERSION_MINOR}"
+  )
+endif()
+
+configure_file(cmake/git_hash.h.in git_hash.h @ONLY)
+add_subdirectory(src/lib)
+add_subdirectory(src/bin)
diff --git a/cmake/FindCLime.cmake b/cmake/FindCLime.cmake
new file mode 100644
index 000000000..0c3eabe48
--- /dev/null
+++ b/cmake/FindCLime.cmake
@@ -0,0 +1,27 @@
+# Locate the c-lime library; sets CLime_FOUND and defines the tmlqcd::clime target.
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TMLQCD_CLIME_LIBRARIES
+  NAMES lime
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TMLQCD_CLIME_INCLUDE_DIRS
+  NAMES lime.h
+  PATH_SUFFIXES "include" "include/lime" "lime")
+
+find_package_handle_standard_args(CLime DEFAULT_MSG TMLQCD_CLIME_LIBRARIES
+                                  TMLQCD_CLIME_INCLUDE_DIRS)
+
+if(CLime_FOUND AND NOT TARGET tmlqcd::clime)
+  add_library(tmlqcd::clime INTERFACE IMPORTED)
+  set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_LINK_LIBRARIES
+                                                 "${TMLQCD_CLIME_LIBRARIES}")
+  set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+                                                 "${TMLQCD_CLIME_INCLUDE_DIRS}")
+endif()
+
+set(TMLQCD_CLIME_FOUND ${CLime_FOUND}) # mirror the real search result
+mark_as_advanced(TMLQCD_CLIME_LIBRARIES
+                 TMLQCD_CLIME_INCLUDE_DIRS)
diff --git a/cmake/FindLemon.cmake b/cmake/FindLemon.cmake
new file mode 100644
index 000000000..cdeca5e42
--- /dev/null
+++ b/cmake/FindLemon.cmake
@@ -0,0 +1,25 @@
+# Locate the lemon library; sets Lemon_FOUND and defines the tmlqcd::lemon target.
+include(FindPackageHandleStandardArgs)
+find_library(
+  TMLQCD_LEMON_LIBRARIES
+  NAMES lemon
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TMLQCD_LEMON_INCLUDE_DIRS
+  NAMES lemon.h
+  PATH_SUFFIXES "include" "include/lemon" "lemon")
+
+find_package_handle_standard_args(Lemon DEFAULT_MSG TMLQCD_LEMON_LIBRARIES
+                                  TMLQCD_LEMON_INCLUDE_DIRS)
+
+if(Lemon_FOUND AND NOT TARGET tmlqcd::lemon)
+  add_library(tmlqcd::lemon INTERFACE IMPORTED)
+  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_LINK_LIBRARIES
+                                                 "${TMLQCD_LEMON_LIBRARIES}")
+  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+                                                 "${TMLQCD_LEMON_INCLUDE_DIRS}")
+endif()
+
+set(TMLQCD_LEMON_FOUND ${Lemon_FOUND}) # mirror the real search result
+mark_as_advanced(TMLQCD_LEMON_LIBRARIES TMLQCD_LEMON_INCLUDE_DIRS)
diff --git a/cmake/git_hash.h.in b/cmake/git_hash.h.in
new file mode 100644
index 000000000..23f624742
--- /dev/null
+++ b/cmake/git_hash.h.in
@@ -0,0 +1,6 @@
+#ifndef _GIT_HASH_H
+#define _GIT_HASH_H
+/* TM_SHA is set by the top-level CMakeLists.txt: the commit hash or a release URL. */
+const char git_hash[] = "@TM_SHA@";
+
+#endif /* _GIT_HASH_H */
diff --git a/include/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
similarity index 56%
rename from include/tmlqcd_config_internal.h.in
rename to cmake/tmlqcd_config_internal.h.in
index 037ad84a5..5dd9c7096 100644
--- a/include/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -4,177 +4,139 @@
  * systems, for example. */
 
 /* We are on a CRAY */
-#undef CRAY
+#cmakedefine CRAY
 
 /* lapack available */
-#undef HAVE_LAPACK
+#cmakedefine HAVE_LAPACK 
 
 /* Define to 1 if you have the `lime' library (-llime). */
-#undef HAVE_LIBLIME
+#cmakedefine HAVE_LIBLIME 
 
 /* Define to 1 if you have the `lemon' library (-llemon). */
-#undef HAVE_LIBLEMON
+#cmakedefine HAVE_LIBLEMON 
 
 /* 1 if clock_gettime is available for use in benchmark */
-#undef HAVE_CLOCK_GETTIME
+#cmakedefine HAVE_CLOCK_GETTIME 
 
 /* Compile with MPI support */
-#undef TM_USE_MPI
+#cmakedefine TM_USE_MPI
 
 /* Compile with OpenMP support */
-#undef TM_USE_OMP
+#cmakedefine TM_USE_OMP
 
 /* Compile with FFTW support */
-#undef HAVE_FFTW
+#cmakedefine HAVE_FFTW 
 
 /* Fortran has not extra _ */
-#undef NOF77_
+#cmakedefine NOF77_
 
 /* Define to the address where bug reports for this package should be sent. */
-#undef PACKAGE_BUGREPORT
+#define PACKAGE_BUGREPORT "@PACKAGE_BUGREPORT@"
 
 /* Define to the full name of this package. */
-#undef PACKAGE_NAME
-
+#define PACKAGE_NAME "@PROJECT_DESCRIPTION@"
 /* Define to the full name and version of this package. */
-#undef PACKAGE_STRING
+#define PACKAGE_STRING "@PROJECT_DESCRIPTION@ @PROJECT_VERSION@"
 
 /* Define to the one symbol short name of this package. */
-#undef PACKAGE_TARNAME
+#define PACKAGE_TARNAME "@PACKAGE_TARNAME@"
 
 /* Define to the version of this package. */
-#undef PACKAGE_VERSION
+#define PACKAGE_VERSION "@PROJECT_VERSION@"
 
 /* X parallelisation */
-#undef PARALLELX
+#cmakedefine PARALLELX 
 
 /* XY parallelisation */
-#undef PARALLELXY
+#cmakedefine PARALLELXY 
 
 /* XYZ parallelisation */
-#undef PARALLELXYZ
+#cmakedefine PARALLELXYZ
 
 /* One dimensional parallelisation */
-#undef PARALLELT
+#cmakedefine PARALLELT
 
 /* Two dimensional parallelisation */
-#undef PARALLELXT
+#cmakedefine PARALLELXT
 
 /* Three dimensional parallelisation */
-#undef PARALLELXYT
+#cmakedefine PARALLELXYT
 
 /* Four dimensional parallelisation */
-#undef PARALLELXYZT
+#cmakedefine PARALLELXYZT
 
 /* Fixed volume at compiletime */
-#undef FIXEDVOLUME
+#cmakedefine FIXEDVOLUME
 
 /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
-#undef HAVE_FSEEKO
+#cmakedefine HAVE_FSEEKO
 
 /* Alignment for arrays -- necessary for SSE and automated vectorization */
-#undef ALIGN_BASE
+#define ALIGN_BASE @ALIGN_BASE@
 
 /* Alignment compiler hint macro */
-#undef ALIGN
+#define ALIGN @ALIGN@
 
 /* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */
-#undef ALIGN_BASE32
+#define ALIGN_BASE32 @ALIGN_BASE32@
 
 /* Alignment of 32bit fields, compiler hint macro */
-#undef ALIGN32
+#define ALIGN32 @ALIGN32@
 
 /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
    `char[]'. */
-#undef YYTEXT_POINTER
+#cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#undef _FILE_OFFSET_BITS
+#cmakedefine _FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
-#undef _GAUGE_COPY
+#cmakedefine _GAUGE_COPY
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
-#undef _LARGEFILE_SOURCE
+#cmakedefine _LARGEFILE_SOURCE
 
 /* Define for large files, on AIX-style hosts. */
-#undef _LARGE_FILES
+#cmakedefine _LARGE_FILES 
 
 /* Use even/odd geometry in the gauge fields */
-#undef _NEW_GEOMETRY
+#cmakedefine _NEW_GEOMETRY
 
 /* x86 64 Bit architecture */
-#undef _x86_64
-
-/* Define to empty if `const' does not conform to ANSI C. */
-#undef const
-
-/* Define to `__inline__' or `__inline' if that's what the C compiler
-   calls it, or to nothing if 'inline' is not supported under any name.  */
-#ifndef __cplusplus
-#undef inline
-#endif
-
-/* Define to `long' if <sys/types.h> does not define. */
-#undef off_t
-
-/* Define to `unsigned' if <sys/types.h> does not define. */
-#undef size_t
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#undef HAVE_STDINT_H
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#undef HAVE_SYS_TYPES_H
-
-/* Define to 1 if the system has the type `uint16_t'. */
-#undef HAVE_UINT16_T
-
-/* Define to 1 if the system has the type `uint32_t'. */
-#undef HAVE_UINT32_T
-
-/* Define to 1 if the system has the type `uint64_t'. */
-#undef HAVE_UINT64_T
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#undef HAVE_UNISTD_H
+#cmakedefine _x86_64
 
 /* Define to 1 if Dirac operator with halfspinor should be used */
-#undef _USE_HALFSPINOR
+#cmakedefine _USE_HALFSPINOR 
 
 /* Define to 1 if shmem API should be used */
-#undef _USE_SHMEM
+#cmakedefine _USE_SHMEM
 
 /* Define to 1 if KOJAK instrumentalisation should be done*/
-#undef _KOJAK_INST
-
-/* Define to equivalent of C99 restrict keyword, or to nothing if this is not
-   supported. Do not define if restrict is supported directly. */
-#undef restrict
+#cmakedefine _KOJAK_INST
 
 /* Define to 1 if persistent MPI calls for halfspinor should be used */
-#undef _PERSISTENT
+#cmakedefine _PERSISTENT
 
 /* Define to 1 if non-blocking MPI calls for spinor and gauge should be used */
-#undef _NON_BLOCKING
+#cmakedefine _NON_BLOCKING
 
 /* Define to 1 if you have the `quda' library (-lquda). */
-#undef HAVE_LIBQUDA
+#cmakedefine HAVE_LIBQUDA
 
 /* Using QUDA GPU */
-#undef TM_USE_QUDA
+#cmakedefine TM_USE_QUDA 
 
 /* Using experimental QUDA version */
-#undef TM_QUDA_EXPERIMENTAL
+#cmakedefine TM_QUDA_EXPERIMENTAL
 
 /* Using QUDA fermionic forces */
-#undef TM_QUDA_FERMIONIC_FORCES
+#cmakedefine TM_QUDA_FERMIONIC_FORCES
 
 /* Using DDalphaAMG */
-#undef DDalphaAMG
+#cmakedefine DDalphaAMG
 
 /* Using QPHIX */
-#undef TM_USE_QPHIX
+#cmakedefine TM_USE_QPHIX 
 
 /* Structure of Array length to use with QPhiX */
-#undef QPHIX_SOALEN
+#cmakedefine QPHIX_SOALEN @TMLQCD_QPHIX_SOALEN@
diff --git a/cmake_includes.txt b/cmake_includes.txt
new file mode 100644
index 000000000..b8e105cc0
--- /dev/null
+++ b/cmake_includes.txt
@@ -0,0 +1,425 @@
+# Sources located in the io/ subdirectory.
+list(APPEND IO_SRC_C io/utils_write_inverter_info.c io/gauge_read.c
+io/utils_write_xlf.c
+io/utils_construct_reader.c
+io/params_construct_xlfInfo.c
+io/utils_kill_with_error.c
+io/DML_crc32.c
+io/spinor_write_source_format.c
+io/deri_write_stdout.c
+io/spinor_write_propagator_format.c
+io/utils_engineering.c
+io/utils_parse_propagator_type.c
+io/io_cm.c
+io/utils_parse_ildgformat_xml.c
+io/utils_read_message.c
+io/utils_write_ildg_format.c
+io/utils_destruct_writer.c
+io/gauge_write.c
+io/utils_write_message.c
+io/params_construct_ildgFormat.c
+io/spinor_read.c
+io/utils_close_reader_record.c
+io/spinor_read_binary.c
+io/utils.c
+io/spinor_write_stdout.c
+io/spinor_write_info.c
+io/utils_write_checksum.c
+io/utils_write_header.c
+io/eospinor_read.c
+io/utils_write_first_message.c
+io/params_construct_InverterInfo.c
+io/utils_parse_checksum_xml.c
+io/utils_construct_writer.c
+io/sw_write_stdout.c
+io/spinor_write_propagator_type.c
+io/gauge_write_binary.c
+io/spinor_write.c
+io/utils_write_xlf_xml.c
+io/params_construct_propagatorFormat.c
+io/gauge_read_binary.c
+io/dml.c
+io/spinor_write_binary.c
+io/utils_destruct_reader.c
+io/utils_close_writer_record.c
+io/eospinor_write.c
+io/gauge_write_luscher_binary.c
+io/params_construct_sourceFormat.c)
+
+list(APPEND INIT_SRC_C init/init_dirac_halfspinor.c
+     init/init_geometry_indices.c
+     init/init_openmp.c
+     init/init_gauge_field.c
+     init/init_parallel.c
+     init/init_chi_spinor_field.c
+     init/init_gauge_fg.c
+     init/init_spinor_field.c
+     init/init_global_states.c
+     init/init_bispinor_field.c
+     init/init_gauge_tmp.c
+     init/init_critical_globals.c
+     init/init_omp_accumulators.c
+     init/init_jacobi_field.c
+     init/init_stout_smear_vars.c
+     init/init_moment_field.c)
+
+list(APPEND SOLVER_SRC_C
+solver/bicg_complex.c
+solver/dfl_projector.c
+solver/eigenvalues_Jacobi.c
+solver/gcr.c
+solver/gmres_precon.c
+solver/chrono_guess.c
+solver/gcr4complex.c
+solver/jdher.c
+solver/gcr4complex_body.c
+solver/gmres_dr.c
+solver/fgmres4complex_body.c
+solver/cg_her_bi.c
+solver/solver_field.c
+solver/quicksort.c
+solver/bicgstab2.c
+solver/cgs_real.c
+solver/M_plus_block_psi_body.c
+solver/little_mg_precon_body.c
+solver/cg_her_su3vect.c
+solver/little_project_eo_body.c
+solver/monomial_solve.c
+solver/cr.c
+solver/gram-schmidt.c
+solver/solver_types.c
+solver/mode_number.c
+solver/cg_her.c
+solver/jdher_bi.c
+solver/mrblk_body.c
+solver/eigcg.c
+solver/jdher_su3vect.c
+solver/poly_precon.c
+solver/Msap.c
+solver/fgmres.c
+solver/dirac_operator_eigenvectors.c
+solver/incr_eigcg.c
+solver/index_jd.c
+solver/sumr.c
+solver/cgne4complex.c
+solver/eigenvalues_bi.c
+solver/gmres.c
+solver/lu_solve.c
+solver/diagonalise_general_matrix.c
+solver/mcr.c
+solver/bicgstabell.c
+solver/rg_mixed_cg_her.c
+solver/mixed_cg_her.c
+solver/mixed_cg_mms_tm_nd.c
+solver/rg_mixed_cg_her_nd.c
+solver/spectral_proj.c
+solver/restart_X.c
+solver/generate_dfl_subspace.c
+solver/eigenvalues.c
+solver/mcr4complex.c
+solver/mr4complex.c
+solver/bicgstab_complex.c
+solver/cg_mms_tm_nd.c
+solver/mr.c
+solver/cg_her_nd.c
+solver/bicgstab_complex_bi.c
+solver/sub_low_ev.c
+solver/ortho.c
+solver/pcg_her.c
+solver/fgmres4complex.c
+solver/cg_mms_tm.c
+solver/init_guess.c)
+
+list(APPEND LINALG_SRC_C linalg/assign_mul_bra_add_mul_r.c
+     linalg/mul_r_gamma5.c
+     linalg/convert_eo_to_lexic.c
+     linalg/print_spinor.c
+     linalg/assign_add_mul_body.c
+     linalg/mul_diff_mul_r.c
+     linalg/square_norm_32.c
+     linalg/mul.c
+     linalg/mul_r.c
+     linalg/mul_gamma5.c
+     linalg/ratio.c
+     linalg/square_norm.c
+     linalg/mul_diff_mul.c
+     linalg/square_and_minmax.c
+     linalg/add.c
+     linalg/assign_add_mul_add_mul_r.c
+     linalg/comp_decomp.c
+     linalg/mul_add_mul.c
+     linalg/diff_32.c
+     linalg/assign_add_mul.c
+     linalg/addto_32.c
+     linalg/assign_mul_add_mul_add_mul_add_mul_r.c
+     linalg/assign_add_mul_r.c
+     linalg/diff.c
+     linalg/assign_mul_add_mul_r.c
+     linalg/scalar_prod_r.c
+     linalg/assign_to_32.c
+     linalg/assign_add_mul_add_mul.c
+     linalg/mul_diff_r.c
+     linalg/assign_mul_add_r_and_square.c
+     linalg/assign_mul_add_mul_r_32.c
+     linalg/assign_mul_add_mul.c
+     linalg/assign_mul_add_mul_add_mul_r.c
+     linalg/scalar_prod_r_32.c
+     linalg/assign_mul_add_r.c
+     linalg/assign_mul_add_r_32.c
+     linalg/scalar_prod_su3spinor.c
+     linalg/convert_even_to_lexic.c
+     linalg/mul_r_32.c
+     linalg/assign_add_mul_r_add_mul.c
+     linalg/convert_odd_to_lexic.c
+     linalg/diff_and_square_norm.c
+     linalg/scalar_prod_i.c
+     linalg/mul_add_mul_r.c
+     linalg/assign_diff_mul.c
+     linalg/assign_mul_bra_add_mul_ket_add_r.c
+     linalg/set_even_to_zero.c
+     linalg/assign_mul_add.c
+     linalg/square_and_prod_r.c
+     linalg/scalar_prod_body.c
+     linalg/assign_mul_bra_add_mul_ket_add.c
+     linalg/assign_add_mul_r_32.c
+     linalg/scalar_prod.c
+     linalg/mattimesvec.c
+     linalg/assign.c
+     linalg/print_spinor_similar_components.c)
+
+list(APPEND RATIONAL_SRC_C rational/zolotarev.c
+     rational/rational.c
+     rational/elliptic.c)
+
+list(APPEND OPERATOR_SRC_C operator/clover_invert.c
+     operator/hopping_body_dbl.c
+     operator/tm_operators_nd_32.c
+     operator/hopping_sse_dbl.c
+     operator/halfspinor_body.c
+     operator/Block_D_psi_body.c
+     operator/mul_one_pm_imu_sub_mul_body.c
+     operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
+     operator/assign_mul_one_sw_pm_imu_inv_block_body.c
+     operator/clover_accumulate_deriv.c
+     operator/Hopping_Matrix.c
+     operator/hopping_bg_dbl.c
+     operator/tm_operators.c
+     operator/tm_times_Hopping_Matrix.c
+     operator/clovertm_operators_32.c
+     operator/hopping_sgl.c
+     operator/Dov_proj.c
+     operator/clover_deriv.c
+     operator/halfspinor_bg_dbl.c
+     operator/clover_det.c
+     operator/clover_leaf.c
+     operator/D_psi_body.c
+     operator/clovertm_operators.c
+     operator/hopping_sse_sgl.c
+     operator/halfspinor_sse_dbl.c
+     operator/Dov_psi.c
+     operator/tm_operators_nd.c
+     operator/tm_sub_Hopping_Matrix.c
+     operator/Hopping_Matrix_nocom.c
+     operator/clover_term.c
+     operator/halfspinor_bgq_dbl.c
+     operator/Hopping_Matrix_32_nocom.c
+     operator/D_psi.c
+     operator/tm_operators_32.c
+     operator/Hopping_Matrix_32.c
+     operator/halfspinor_body_32.c
+     operator/mul_one_pm_imu_inv_body.c)
+
+list(APPEND SMEARING_SRC_C smearing/hex_stout_exclude_two.c
+     smearing/hex_hex_smear.c
+     smearing/utils_print_su3.c
+     smearing/hyp_APE_project_exclude_none.c
+     smearing/hyp_hyp_staples_exclude_one.c
+     smearing/hyp_APE_project_exclude_one.c
+     smearing/hex_stout_exclude_one.c
+     smearing/hyp_hyp_staples_exclude_two.c
+     smearing/hex_stout_exclude_none.c
+     smearing/stout_stout_smear.c
+     smearing/hyp_hyp_smear.c
+     smearing/hyp_APE_project_exclude_two.c
+     smearing/utils_project_herm.c
+     smearing/utils_reunitarize.c
+     smearing/utils_generic_staples.c
+     smearing/hyp_hyp_staples_exclude_none.c
+     smearing/ape_ape_smear.c
+     smearing/uils_print_config_to_screen.c
+     smearing/utils_project_antiherm.c
+     smearing/utils_print_config_to_screen.c
+     smearing/utils_reunitarize_MILC.c)
+
+list(APPEND BUFFER_SRC_C
+     buffers/gauge_return_gauge_field.c
+     buffers/gauge_get_gauge_field.c
+     buffers/gauge_finalize_gauge_buffers.c
+     buffers/gauge_initialize_gauge_buffers.c
+     buffers/gauge.c
+     buffers/gauge_free_unused_gauge_buffers.c
+     buffers/gauge_get_gauge_field_array.c
+     buffers/utils_generic_exchange.c
+     buffers/gauge_allocate_gauge_buffers.c
+     buffers/gauge_return_gauge_field_array.c)
+
+list(APPEND MONOMIAL_SRC_C
+     monomial/detratio_monomial.c
+     monomial/sf_gauge_monomial.c
+     monomial/poly_monomial.c
+     monomial/cloverdetratio_monomial.c
+     monomial/ndrat_monomial.c
+     monomial/cloverdet_monomial.c
+     monomial/clover_trlog_monomial.c
+     monomial/cloverndpoly_monomial.c
+     monomial/monitor_forces.c
+     monomial/ndpoly_monomial.c
+     monomial/det_monomial.c
+     monomial/monomial.c
+     monomial/cloverdetratio_rwmonomial.c
+     monomial/gauge_monomial.c
+     monomial/clovernd_trlog_monomial.c
+     monomial/ratcor_monomial.c
+     monomial/nddetratio_monomial.c
+     monomial/rat_monomial.c
+     monomial/ndratcor_monomial.c
+     monomial/moment_energy.c)
+
+list(APPEND EXCHANGE_SRC_C xchange/xchange_lexicfield.c
+xchange/xchange_2fields.c
+xchange/xchange_gauge.c
+xchange/xchange_halffield.c
+xchange/xchange_jacobi.c
+xchange/little_field_gather_body.c
+xchange/little_field_gather.c
+xchange/xchange_deri.c
+xchange/xchange_field.c
+xchange/xchange_field_tslice.c)
+
+list(APPEND MEAS_SRC_C
+meas/pion_norm.c
+meas/correlators.c
+meas/polyakov_loop.c
+meas/measurements.c
+meas/oriented_plaquettes.c
+meas/gradient_flow.c
+meas/measure_clover_field_strength_observables.c)
+
+list(APPEND SF_SRC_C sf/sf_calc_action.c
+     sf/sf_get_rectangle_staples.c
+     sf/sf_get_staples.c
+     sf/sf_observables.c
+     sf/sf_utils.c
+     )
+
+list(APPEND MAIN_SRC_C
+measure_gauge_action.c
+start.c
+deriv_Sb.c
+reweighting_factor_nd.c
+ranlxs.c
+source_generation.c
+read_input.c
+invert_doublet_eo.c
+geometry_eo.c
+getopt.c
+offline_measurement.c
+tm_debug_printf.c
+chebyshev_polynomial_nd.c
+invert_eo.c
+little_D.c
+get_rectangle_staples.c
+gen_sources.c
+rnd_gauge_trafo.c
+test_lemon.c
+LapH_ev.c
+benchmark.c
+measure_rectangles.c
+check_locallity.c
+invert.c
+deriv_Sb_D_psi.c
+deriv_mg_tune.c
+mpi_init.c
+update_momenta_fg.c
+gamma.c
+matrix_utils.c
+reweighting_factor.c
+update_tm.c
+jacobi.c
+invert_overlap.c
+phmc.c
+get_staples.c
+clenshaw_coef.c
+block.c
+spinor_fft.c
+boundary.c
+little_D_body.c
+X_psi.c
+prepare_source.c
+DDalphaAMG_interface.c
+update_backward_gauge.c
+invert_clover_eo.c
+gettime.c
+hmc_tm.c
+update_momenta.c
+sighandler.c
+compare_derivative.c
+ranlxd.c
+DirectPut.c
+aligned_malloc.c
+fatal_error.c
+operator.c
+cu/cu.c
+chebyshev_polynomial.c
+qphix_test_Dslash.c
+expo.c
+overrelaxation.c
+Ptilde_nd.c
+update_gauge.c
+hopping_test.c
+integrator.c
+P_M_eta.c)
+
+if (TMLQCD_USE_QPHIX)
+list(APPEND MAIN_SRC_C qphix_interface.cpp)
+endif()
+
+if (TMLQCD_USE_QUDA)
+list(APPEND MAIN_SRC_C quda_interface.c)
+endif()
+
+list(APPEND ALL_SRC ${MAIN_SRC_C} ${SF_SRC_C} ${EXCHANGE_SRC_C} ${MEAS_SRC_C} ${MONOMIAL_SRC_C} ${BUFFER_SRC_C} ${SMEARING_SRC_C} ${OPERATOR_SRC_C} ${RATIONAL_SRC_C} ${LINALG_SRC_C} ${IO_SRC_C} ${INIT_SRC_C} ${SOLVER_SRC_C})
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+# COMPILE_FLAGS is accepted by every FindFLEX release (CMake 4.0 merely adds
+# OPTIONS as its successor), so no version switch is needed here.
+flex_target(tmlqcd_input_read input_read.l input_read.c
+            COMPILE_FLAGS "-Ca -Ptmlqcd")
+
+# "::" is only legal in ALIAS/IMPORTED target names: build `hmc`, expose it as tmlqcd::hmc.
+add_library(hmc ${ALL_SRC} ${FLEX_tmlqcd_input_read_OUTPUTS})
+add_library(tmlqcd::hmc ALIAS hmc)
+set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
+# Link dependencies; multi-item generator expressions are quoted and ";"-separated.
+target_link_libraries(hmc PUBLIC
+                      $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
+                      $<$<BOOL:${TMLQCD_USE_LIME}>:tmlqcd::clime>
+                      $<$<BOOL:${TMLQCD_USE_LEMON}>:tmlqcd::lemon>
+                      $<$<BOOL:${TMLQCD_USE_QPHIX}>:tmlqcd::qphix>
+                      $<$<BOOL:${TMLQCD_USE_FFTW}>:tmlqcd::fftw3>
+                      "$<$<BOOL:${TMLQCD_USE_MPI}>:MPI::MPI_C;MPI::MPI_CXX>"
+                      $<$<BOOL:${TMLQCD_USE_QUDA}>:quda::quda>
+                      "$<$<BOOL:${TMLQCD_USE_CUDA}>:CUDA::cufft;CUDA::cufftw;CUDA::cublas;CUDA::cudart;CUDA::cuda_driver>"
+                      "$<$<BOOL:${TMLQCD_USE_HIP}>:hip::hipfft;roc::hipblas;hip::host>"
+                      ${LAPACK_LIBRARIES}
+                      ${BLAS_LIBRARIES}
+                      "$<$<BOOL:${TMLQCD_USE_OPENMP}>:OpenMP::OpenMP_C;OpenMP::OpenMP_CXX>"
+                      m)
+
+target_compile_definitions(hmc PRIVATE
+                           "$<$<BOOL:${TMLQCD_USE_HIP}>:${TMLQCD_GPU_PLATFORM_DFLAGS}>"
+                           )
+
+target_include_directories(hmc PUBLIC $<INSTALL_INTERFACE:include>
+                           PRIVATE init io linalg meas monomial operator profiling rational sf smearing solver util xchange wrapper)
diff --git a/io/Makefile.in b/io/Makefile.in
deleted file mode 100644
index 41b5b78ce..000000000
--- a/io/Makefile.in
+++ /dev/null
@@ -1,135 +0,0 @@
-
-srcdir = @srcdir@
-top_builddir =  @top_builddir@
-abs_top_builddir = @abs_top_builddir@
-top_srcdir = @top_srcdir@
-abs_top_srcdir = @abs_top_srcdir@
-subdir = io
-builddir = @builddir@
-
-CFLAGS = @CFLAGS@
-DEPFLAGS = @DEPFLAGS@
-LDFLAGS = @LDFLAGS@
-DEFS = @DEFS@
-OPTARGS = @OPTARGS@
-
-AR = @AR@
-RANLIB = @RANLIB@
-CC = @CC@
-CCDEP = @CCDEP@
-CCLD = $(CC)
-LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@
-LEX = @LEX@
-AUTOCONF = @AUTOCONF@
-DEFS = @DEFS@
-
-LEMON_AVAILABLE = @LEMON_AVAILABLE@
-
-INCLUDES = @INCLUDES@
-LDADD =
-COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS}
-
-LIBRARIES = libio
-
-libio_TARGETS = utils_engineering \
-		utils_parse_checksum_xml \
-		utils_write_message \
-		utils_read_message \
-		gauge_write_binary \
-		gauge_read_binary \
-		gauge_read \
-		gauge_write \
-		utils_write_xlf \
-		utils_write_xlf_xml \
-		utils_write_ildg_format \
-		utils_write_header \
-		utils_write_checksum \
-		utils_write_inverter_info \
-		utils_kill_with_error \
-		utils_construct_reader \
-		utils_destruct_reader \
-		utils_construct_writer \
-		utils_destruct_writer \
-		utils_close_writer_record \
-		utils_close_reader_record \
-		utils_write_first_message \
-		utils_parse_propagator_type \
-		utils_parse_ildgformat_xml \
-		params_construct_ildgFormat \
-		params_construct_propagatorFormat \
-		params_construct_sourceFormat \
-		params_construct_xlfInfo \
-		params_construct_InverterInfo \
-		spinor_write \
-		spinor_read \
-		spinor_write_binary \
-		spinor_read_binary \
-		spinor_write_info \
-		spinor_write_source_format \
-		spinor_write_propagator_format \
-		spinor_write_propagator_type \
-		utils DML_crc32 dml \
-		eospinor_write \
-		eospinor_read \
-		io_cm \
-		deri_write_stdout spinor_write_stdout sw_write_stdout \
-		gauge_write_luscher_binary
-
-libio_OBJECTS = $(addsuffix .o, ${libio_TARGETS})
-
-# default rule
-
-all: Makefile dep libio.a
-
-# rules for debugging
-debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@
-debug all-debug: all
-
-# rules for profiling information
-profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@
-profile all-profile: all
-
-
-#include dep rules
-
-
--include $(addsuffix .d,${libio_TARGETS})
-
-include ${top_srcdir}/Makefile.global
-
-# rule to compile objects
-
-%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/include/tmlqcd_config_internal.h
-	$(COMPILE) -c $<
-
-
-# rule to make libio
-libio.a: ${libio_OBJECTS} Makefile
-	@rm -f libio.a
-	@${AR} cru libio.a $(libio_OBJECTS)
-	@$(RANLIB) libio.a
-	@cp libio.a ${top_builddir}/lib/libio.a
-
-# rule to generate .d files
-
-$(addsuffix .d,$(libio_TARGETS)): %.d: ${srcdir}/%.c Makefile
-	@$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@
-
-# rule to make dependencies
-
-dep: ${addsuffix .d, ${libio_TARGETS}}
-
-# rules to clean
-
-compile-clean: Makefile
-	rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d
-
-clean: compile-clean
-	rm -f $(addsuffix .a, ${LIBRARIES})
-	rm -f ../lib/libio.a
-
-distclean: clean
-	rm -f Makefile
-
-
-.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
new file mode 100644
index 000000000..29c9c1d8a
--- /dev/null
+++ b/src/bin/CMakeLists.txt
@@ -0,0 +1,19 @@
+list(APPEND tmlqcd_prog "benchmark;deriv_mg_tune;hmc_tm;offline_measurement")
+
+include_directories(
+  $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/src/lib/include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<$<BOOL:${TMLQCD_USE_LEMON}>:${TMLQCD_LEMON_INCLUDE_DIRS}>
+  ${TMLQCD_CLIME_INCLUDE_DIRS}) # variable names as set by cmake/FindLemon.cmake and cmake/FindCLime.cmake
+
+foreach(_prog ${tmlqcd_prog})
+  add_executable(${_prog} "${_prog}.c")
+
+  target_link_libraries(${_prog} PUBLIC hmc)
+  set_target_properties(
+    ${_prog}
+    PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+               POSITION_INDEPENDENT_CODE ON
+               LINKER_LANGUAGE "CXX")
+endforeach()
diff --git a/LapH_ev.c b/src/bin/LapH_ev.c
similarity index 100%
rename from LapH_ev.c
rename to src/bin/LapH_ev.c
diff --git a/benchmark.c b/src/bin/benchmark.c
similarity index 100%
rename from benchmark.c
rename to src/bin/benchmark.c
diff --git a/check_locallity.c b/src/bin/check_locallity.c
similarity index 99%
rename from check_locallity.c
rename to src/bin/check_locallity.c
index 9ed46daee..52ea21209 100644
--- a/check_locallity.c
+++ b/src/bin/check_locallity.c
@@ -17,10 +17,8 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 
-#include "lime.h"
-#ifdef HAVE_CONFIG_H
+#include <lime.h>
 #include <tmlqcd_config.h>
-#endif
 #include <math.h>
 #include <signal.h>
 #include <stdio.h>
diff --git a/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
similarity index 100%
rename from deriv_mg_tune.c
rename to src/bin/deriv_mg_tune.c
diff --git a/gen_sources.c b/src/bin/gen_sources.c
similarity index 100%
rename from gen_sources.c
rename to src/bin/gen_sources.c
diff --git a/hmc_tm.c b/src/bin/hmc_tm.c
similarity index 100%
rename from hmc_tm.c
rename to src/bin/hmc_tm.c
diff --git a/hopping_test.c b/src/bin/hopping_test.c
similarity index 100%
rename from hopping_test.c
rename to src/bin/hopping_test.c
diff --git a/invert.c b/src/bin/invert.c
similarity index 100%
rename from invert.c
rename to src/bin/invert.c
diff --git a/util/main_ildg2uk.c b/src/bin/main_ildg2uk.c
similarity index 100%
rename from util/main_ildg2uk.c
rename to src/bin/main_ildg2uk.c
diff --git a/offline_measurement.c b/src/bin/offline_measurement.c
similarity index 100%
rename from offline_measurement.c
rename to src/bin/offline_measurement.c
diff --git a/qphix_test_Dslash.c b/src/bin/qphix_test_Dslash.c
similarity index 100%
rename from qphix_test_Dslash.c
rename to src/bin/qphix_test_Dslash.c
diff --git a/test/scalar_prod_r_test.c b/src/bin/scalar_prod_r_test.c
similarity index 100%
rename from test/scalar_prod_r_test.c
rename to src/bin/scalar_prod_r_test.c
diff --git a/test/test_eigenvalues.c b/src/bin/test_eigenvalues.c
similarity index 100%
rename from test/test_eigenvalues.c
rename to src/bin/test_eigenvalues.c
diff --git a/test_lemon.c b/src/bin/test_lemon.c
similarity index 100%
rename from test_lemon.c
rename to src/bin/test_lemon.c
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
new file mode 100644
index 000000000..4ace6c997
--- /dev/null
+++ b/src/lib/CMakeLists.txt
@@ -0,0 +1,457 @@
+list(
+  APPEND
+  IO_SRC_C
+  io/utils_write_inverter_info.c
+  io/gauge_read.c
+  io/utils_write_xlf.c
+  io/utils_construct_reader.c
+  io/params_construct_xlfInfo.c
+  io/utils_kill_with_error.c
+  io/DML_crc32.c
+  io/spinor_write_source_format.c
+  io/deri_write_stdout.c
+  io/spinor_write_propagator_format.c
+  io/utils_engineering.c
+  io/utils_parse_propagator_type.c
+  io/io_cm.c
+  io/utils_parse_ildgformat_xml.c
+  io/utils_read_message.c
+  io/utils_write_ildg_format.c
+  io/utils_destruct_writer.c
+  io/gauge_write.c
+  io/utils_write_message.c
+  io/params_construct_ildgFormat.c
+  io/spinor_read.c
+  io/utils_close_reader_record.c
+  io/spinor_read_binary.c
+  io/utils.c
+  io/spinor_write_stdout.c
+  io/spinor_write_info.c
+  io/utils_write_checksum.c
+  io/utils_write_header.c
+  io/eospinor_read.c
+  io/utils_write_first_message.c
+  io/params_construct_InverterInfo.c
+  io/utils_parse_checksum_xml.c
+  io/utils_construct_writer.c
+  io/sw_write_stdout.c
+  io/spinor_write_propagator_type.c
+  io/gauge_write_binary.c
+  io/spinor_write.c
+  io/utils_write_xlf_xml.c
+  io/params_construct_propagatorFormat.c
+  io/gauge_read_binary.c
+  io/dml.c
+  io/spinor_write_binary.c
+  io/utils_destruct_reader.c
+  io/utils_close_writer_record.c
+  io/eospinor_write.c
+  io/gauge_write_luscher_binary.c
+  io/params_construct_sourceFormat.c)
+
+list(
+  APPEND
+  INIT_SRC_C
+  init/init_dirac_halfspinor.c
+  init/init_geometry_indices.c
+  init/init_openmp.c
+  init/init_gauge_field.c
+  init/init_parallel.c
+  init/init_chi_spinor_field.c
+  init/init_gauge_fg.c
+  init/init_spinor_field.c
+  init/init_global_states.c
+  init/init_bispinor_field.c
+  init/init_gauge_tmp.c
+  init/init_critical_globals.c
+  init/init_omp_accumulators.c
+  # init/init_stout_smear_vars.c
+  init/init_moment_field.c)
+
+list(
+  APPEND
+  SOLVER_SRC_C
+  solver/bicg_complex.c
+  solver/dfl_projector.c
+  solver/gcr.c
+  # solver/gmres_precon.c
+  solver/chrono_guess.c
+  solver/gcr4complex.c
+  solver/jdher.c
+  # solver/gcr4complex_body.c
+  solver/gmres_dr.c
+  # solver/fgmres4complex_body.c
+  solver/cg_her_bi.c
+  solver/solver_field.c
+  solver/quicksort.c
+  solver/bicgstab2.c
+  solver/cgs_real.c
+  # solver/M_plus_block_psi_body.c solver/little_mg_precon_body.c
+  # solver/little_project_eo_body.c
+  solver/monomial_solve.c
+  solver/cr.c
+  solver/gram-schmidt.c
+  solver/solver_types.c
+  solver/cg_her.c
+  solver/jdher_bi.c
+  # solver/mrblk_body.c
+  solver/eigcg.c
+  solver/poly_precon.c
+  solver/Msap.c
+  solver/fgmres.c
+  solver/dirac_operator_eigenvectors.c
+  solver/incr_eigcg.c
+  solver/index_jd.c
+  solver/sumr.c
+  solver/cgne4complex.c
+  solver/eigenvalues_bi.c
+  solver/gmres.c
+  solver/lu_solve.c
+  solver/diagonalise_general_matrix.c
+  solver/mcr.c
+  solver/bicgstabell.c
+  solver/rg_mixed_cg_her.c
+  solver/mixed_cg_her.c
+  solver/mixed_cg_mms_tm_nd.c
+  solver/rg_mixed_cg_her_nd.c
+  solver/restart_X.c
+  solver/generate_dfl_subspace.c
+  solver/eigenvalues.c
+  solver/mcr4complex.c
+  solver/mr4complex.c
+  solver/bicgstab_complex.c
+  solver/cg_mms_tm_nd.c
+  solver/mr.c
+  solver/cg_her_nd.c
+  solver/bicgstab_complex_bi.c
+  solver/sub_low_ev.c
+  solver/ortho.c
+  solver/pcg_her.c
+  solver/fgmres4complex.c
+  solver/cg_mms_tm.c
+  solver/init_guess.c)
+
+list(
+  APPEND
+  LINALG_SRC_C
+  linalg/assign_mul_bra_add_mul_r.c
+  linalg/mul_r_gamma5.c
+  linalg/convert_eo_to_lexic.c
+  linalg/print_spinor.c
+  # linalg/assign_add_mul_body.c
+  linalg/mul_diff_mul_r.c
+  linalg/square_norm_32.c
+  linalg/mul.c
+  linalg/mul_r.c
+  linalg/mul_gamma5.c
+  linalg/ratio.c
+  linalg/square_norm.c
+  linalg/mul_diff_mul.c
+  linalg/square_and_minmax.c
+  linalg/add.c
+  linalg/assign_add_mul_add_mul_r.c
+  linalg/comp_decomp.c
+  linalg/mul_add_mul.c
+  linalg/diff_32.c
+  linalg/assign_add_mul.c
+  linalg/addto_32.c
+  linalg/assign_mul_add_mul_add_mul_add_mul_r.c
+  linalg/assign_add_mul_r.c
+  linalg/diff.c
+  linalg/assign_mul_add_mul_r.c
+  linalg/scalar_prod_r.c
+  linalg/assign_to_32.c
+  linalg/assign_add_mul_add_mul.c
+  linalg/mul_diff_r.c
+  linalg/assign_mul_add_r_and_square.c
+  linalg/assign_mul_add_mul_r_32.c
+  linalg/assign_mul_add_mul.c
+  linalg/assign_mul_add_mul_add_mul_r.c
+  linalg/scalar_prod_r_32.c
+  linalg/assign_mul_add_r.c
+  linalg/assign_mul_add_r_32.c
+  linalg/assign_add_mul_r_32.c
+  linalg/convert_even_to_lexic.c
+  linalg/mul_r_32.c
+  linalg/assign_add_mul_r_add_mul.c
+  linalg/convert_odd_to_lexic.c
+  linalg/diff_and_square_norm.c
+  linalg/scalar_prod_i.c
+  linalg/mul_add_mul_r.c
+  linalg/assign_diff_mul.c
+  linalg/assign_mul_bra_add_mul_ket_add_r.c
+  linalg/set_even_to_zero.c
+  linalg/assign_mul_add.c
+  linalg/square_and_prod_r.c
+  # linalg/scalar_prod_body.c
+  linalg/assign_mul_bra_add_mul_ket_add.c
+  linalg/assign_add_mul_r_32.c
+  linalg/scalar_prod.c
+  linalg/mattimesvec.c
+  linalg/assign.c
+  linalg/print_spinor_similar_components.c)
+
+list(APPEND RATIONAL_SRC_C rational/zolotarev.c rational/rational.c
+     rational/elliptic.c)
+
+list(
+  APPEND
+  OPERATOR_SRC_C
+  operator/clover_invert.c
+  # operator/hopping_body_dbl.c
+  operator/tm_operators_nd_32.c
+  # operator/halfspinor_body.c operator/Block_D_psi_body.c
+  # operator/mul_one_pm_imu_sub_mul_body.c
+  # operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
+  # operator/assign_mul_one_sw_pm_imu_inv_block_body.c
+  operator/clover_accumulate_deriv.c
+  operator/Hopping_Matrix.c
+  operator/tm_operators.c
+  operator/tm_times_Hopping_Matrix.c
+  operator/clovertm_operators_32.c
+  # operator/hopping_sgl.c
+  operator/Dov_proj.c
+  operator/clover_deriv.c
+  operator/clover_det.c
+  operator/clover_leaf.c
+  # operator/D_psi_body.c
+  operator/clovertm_operators.c
+  operator/Dov_psi.c
+  operator/tm_operators_nd.c
+  operator/tm_sub_Hopping_Matrix.c
+  operator/Hopping_Matrix_nocom.c
+  operator/clover_term.c
+  operator/Hopping_Matrix_32_nocom.c
+  operator/D_psi.c
+  operator/tm_operators_32.c
+  operator/Hopping_Matrix_32.c)
+# operator/halfspinor_body_32.c operator/mul_one_pm_imu_inv_body.c)
+
+# Sources from smearing/ collected in SMEARING_SRC_C.
+list(APPEND SMEARING_SRC_C
+  smearing/hex_stout_exclude_two.c
+  smearing/hex_hex_smear.c
+  smearing/utils_print_su3.c
+  smearing/hyp_APE_project_exclude_none.c
+  smearing/hyp_hyp_staples_exclude_one.c
+  smearing/hyp_APE_project_exclude_one.c
+  smearing/hex_stout_exclude_one.c
+  smearing/hyp_hyp_staples_exclude_two.c
+  smearing/hex_stout_exclude_none.c
+  smearing/stout_stout_smear.c
+  smearing/hyp_hyp_smear.c
+  smearing/hyp_APE_project_exclude_two.c
+  smearing/utils_project_herm.c
+  smearing/utils_reunitarize.c
+  smearing/utils_generic_staples.c
+  smearing/hyp_hyp_staples_exclude_none.c
+  smearing/ape_ape_smear.c
+  smearing/uils_print_config_to_screen.c
+  smearing/utils_project_antiherm.c)
+# Left out: smearing/utils_print_config_to_screen.c smearing/utils_reunitarize_MILC.c
+# NOTE(review): "uils_print_config_to_screen.c" above looks like a typo of the left-out name - confirm.
+
+# Sources from buffers/ collected in BUFFER_SRC_C; the list is merged into
+# ALL_SRC further down.
+list(APPEND BUFFER_SRC_C
+  buffers/gauge_return_gauge_field.c
+  buffers/gauge_get_gauge_field.c
+  buffers/gauge_finalize_gauge_buffers.c
+  buffers/gauge_initialize_gauge_buffers.c
+  buffers/gauge.c
+  buffers/gauge_free_unused_gauge_buffers.c
+  buffers/gauge_get_gauge_field_array.c
+  buffers/utils_generic_exchange.c
+  buffers/gauge_allocate_gauge_buffers.c
+  buffers/gauge_return_gauge_field_array.c)
+
+# Sources from monomial/ collected in MONOMIAL_SRC_C; the list is merged into
+# ALL_SRC further down.
+list(APPEND MONOMIAL_SRC_C
+  monomial/detratio_monomial.c
+  monomial/poly_monomial.c
+  monomial/cloverdetratio_monomial.c
+  monomial/ndrat_monomial.c
+  monomial/cloverdet_monomial.c
+  monomial/clover_trlog_monomial.c
+  monomial/cloverndpoly_monomial.c
+  monomial/monitor_forces.c
+  monomial/ndpoly_monomial.c
+  monomial/det_monomial.c
+  monomial/monomial.c
+  monomial/cloverdetratio_rwmonomial.c
+  monomial/gauge_monomial.c
+  monomial/clovernd_trlog_monomial.c
+  monomial/ratcor_monomial.c
+  monomial/nddetratio_monomial.c
+  monomial/rat_monomial.c
+  monomial/ndratcor_monomial.c
+  monomial/moment_energy.c)
+
+# Sources from xchange/ collected in EXCHANGE_SRC_C. Entries written as
+# comments are not compiled as separate translation units by this list.
+list(APPEND EXCHANGE_SRC_C
+  xchange/xchange_lexicfield.c
+  xchange/xchange_2fields.c
+  xchange/xchange_gauge.c
+  xchange/xchange_halffield.c
+  # xchange/xchange_jacobi.c xchange/little_field_gather_body.c
+  xchange/little_field_gather.c
+  xchange/xchange_deri.c
+  xchange/xchange_field.c)
+# Also left out: xchange/xchange_field_tslice.c
+
+# Sources from meas/ collected in MEAS_SRC_C; the list is merged into ALL_SRC
+# further down.
+list(APPEND MEAS_SRC_C
+  meas/pion_norm.c
+  meas/correlators.c
+  meas/polyakov_loop.c
+  meas/measurements.c
+  meas/oriented_plaquettes.c
+  meas/gradient_flow.c
+  meas/measure_clover_field_strength_observables.c)
+
+# Sources located directly in this directory, collected in MAIN_SRC_C.
+# Entries written as comments are left out of the library.
+list(APPEND MAIN_SRC_C
+  # cu/cu.c
+  measure_gauge_action.c
+  start.c
+  deriv_Sb.c
+  reweighting_factor_nd.c
+  ranlxs.c
+  source_generation.c
+  invert_doublet_eo.c
+  geometry_eo.c
+  getopt.c
+  tm_debug_printf.c
+  chebyshev_polynomial_nd.c
+  invert_eo.c
+  little_D.c
+  get_rectangle_staples.c
+  rnd_gauge_trafo.c
+  measure_rectangles.c
+  # invert.c
+  deriv_Sb_D_psi.c
+  mpi_init.c
+  update_momenta_fg.c
+  gamma.c
+  matrix_utils.c
+  reweighting_factor.c
+  update_tm.c
+  invert_overlap.c
+  phmc.c
+  get_staples.c
+  clenshaw_coef.c
+  block.c
+  spinor_fft.c
+  boundary.c
+  prepare_source.c
+  DDalphaAMG_interface.c
+  update_backward_gauge.c
+  invert_clover_eo.c
+  gettime.c
+  update_momenta.c
+  sighandler.c
+  compare_derivative.c
+  ranlxd.c
+  aligned_malloc.c
+  fatal_error.c
+  operator.c
+  # cu/cu.c chebyshev_polynomial.c qphix_test_Dslash.c
+  expo.c
+  overrelaxation.c
+  Ptilde_nd.c
+  update_gauge.c
+  # hopping_test.c
+  integrator.c)
+
+list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c test/overlaptests.c)
+# NOTE(review): guards here test TMLQCD_USE_*, the hmc link step tests TM_USE_* - confirm both get set.
+if(TMLQCD_USE_QPHIX)
+  list(APPEND MAIN_SRC_C QphiX/qphix_interface.cpp)
+endif()
+
+if(TMLQCD_USE_QUDA)
+  list(APPEND MAIN_SRC_C quda_interface.c)
+endif()
+
+# Merge every per-directory source list into ALL_SRC, which feeds the hmc
+# library target. Lists not defined at this point expand to nothing.
+list(APPEND ALL_SRC
+  ${MAIN_SRC_C}
+  ${EXCHANGE_SRC_C}
+  ${MONOMIAL_SRC_C}
+  ${BUFFER_SRC_C}
+  ${SMEARING_SRC_C}
+  ${OPERATOR_SRC_C}
+  ${RATIONAL_SRC_C}
+  ${LINALG_SRC_C}
+  ${IO_SRC_C}
+  ${INIT_SRC_C}
+  ${SOLVER_SRC_C}
+  ${TEST_SRC_C}
+  ${MEAS_SRC_C})
+
+# Directory-scope include paths: build tree, src/lib/include, this directory, Lemon (optional), c-lime.
+include_directories($<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/src/lib/include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<$<BOOL:${TMLQCD_USE_LEMON}>:${TMLQCD_LEMON_INCLUDE_DIRS}>
+  ${TMLQCD_CLIME_INCLUDE_DIRS})
+
+# COMPILE_FLAGS takes one space-separated string; OPTIONS (CMake >= 4.0) takes a list of flags.
+if(CMAKE_MAJOR_VERSION LESS 4)
+  flex_target(tmlqcd_input_read read_input.l read_input.c
+              COMPILE_FLAGS "-Ca -Ptmlqcd")
+else()
+  flex_target(tmlqcd_input_read read_input.l read_input.c OPTIONS
+              -Ca -Ptmlqcd) # one argument per flag; a quoted "-Ca -Ptmlqcd" would be a single flag
+endif()
+
+# Build the hmc library from all collected sources plus the flex-generated
+# scanner. No library type is named: add_library() then produces a SHARED
+# library when BUILD_SHARED_LIBS is true and a STATIC one otherwise.
+# VERSION / SOVERSION set the version properties of the resulting library.
+
+add_library(hmc
+  ${ALL_SRC}
+  ${FLEX_tmlqcd_input_read_OUTPUTS})
+
+set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
+
+# Libraries hmc links against; PUBLIC, so they propagate to targets linking hmc.
+# Each optional library has its own complete $<...> so no expression spans several arguments.
+target_link_libraries(hmc PUBLIC
+  $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
+  $<$<BOOL:${TM_USE_LEMON}>:tmlqcd::clime> # NOTE(review): io/utils.ih includes <lime.h> unconditionally - confirm gating
+  $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
+  $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
+  $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
+  $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C>
+  $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_CXX>
+  $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
+  $<$<BOOL:${TM_USE_CUDA}>:CUDA::cufft>
+  $<$<BOOL:${TM_USE_CUDA}>:CUDA::cufftw>
+  $<$<BOOL:${TM_USE_CUDA}>:CUDA::cublas>
+  $<$<BOOL:${TM_USE_CUDA}>:CUDA::cudart>
+  $<$<BOOL:${TM_USE_CUDA}>:CUDA::cuda_driver>
+  $<$<BOOL:${TM_USE_HIP}>:hip::hipfft>
+  $<$<BOOL:${TM_USE_HIP}>:roc::hipblas>
+  $<$<BOOL:${TM_USE_HIP}>:hip::host>
+  ${LAPACK_LIBRARIES}
+  ${BLAS_LIBRARIES}
+  $<$<BOOL:${TM_USE_OPENMP}>:OpenMP::OpenMP_C>
+  $<$<BOOL:${TM_USE_OPENMP}>:OpenMP::OpenMP_CXX>
+  m)
+
+# PUBLIC: HAVE_CONFIG_H is defined for hmc and for every target linking it; GPU flags only with HIP.
+target_compile_definitions(hmc PUBLIC HAVE_CONFIG_H
+  $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
+
+# Include paths exported by hmc: "include" once installed; this source
+# directory and its binary directory while building in-tree.
+target_include_directories(hmc PUBLIC $<INSTALL_INTERFACE:include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
diff --git a/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
similarity index 100%
rename from DDalphaAMG_interface.c
rename to src/lib/DDalphaAMG_interface.c
diff --git a/DDalphaAMG_interface.h b/src/lib/DDalphaAMG_interface.h
similarity index 100%
rename from DDalphaAMG_interface.h
rename to src/lib/DDalphaAMG_interface.h
diff --git a/Ptilde_nd.c b/src/lib/Ptilde_nd.c
similarity index 100%
rename from Ptilde_nd.c
rename to src/lib/Ptilde_nd.c
diff --git a/Ptilde_nd.h b/src/lib/Ptilde_nd.h
similarity index 100%
rename from Ptilde_nd.h
rename to src/lib/Ptilde_nd.h
diff --git a/aligned_malloc.c b/src/lib/aligned_malloc.c
similarity index 100%
rename from aligned_malloc.c
rename to src/lib/aligned_malloc.c
diff --git a/aligned_malloc.h b/src/lib/aligned_malloc.h
similarity index 100%
rename from aligned_malloc.h
rename to src/lib/aligned_malloc.h
diff --git a/block.c b/src/lib/block.c
similarity index 100%
rename from block.c
rename to src/lib/block.c
diff --git a/block.h b/src/lib/block.h
similarity index 100%
rename from block.h
rename to src/lib/block.h
diff --git a/boundary.c b/src/lib/boundary.c
similarity index 100%
rename from boundary.c
rename to src/lib/boundary.c
diff --git a/boundary.h b/src/lib/boundary.h
similarity index 100%
rename from boundary.h
rename to src/lib/boundary.h
diff --git a/buffers/Makefile.in b/src/lib/buffers/Makefile.in
similarity index 100%
rename from buffers/Makefile.in
rename to src/lib/buffers/Makefile.in
diff --git a/buffers/gauge.c b/src/lib/buffers/gauge.c
similarity index 100%
rename from buffers/gauge.c
rename to src/lib/buffers/gauge.c
diff --git a/buffers/gauge.h b/src/lib/buffers/gauge.h
similarity index 100%
rename from buffers/gauge.h
rename to src/lib/buffers/gauge.h
diff --git a/buffers/gauge.ih b/src/lib/buffers/gauge.ih
similarity index 100%
rename from buffers/gauge.ih
rename to src/lib/buffers/gauge.ih
diff --git a/buffers/gauge_allocate_gauge_buffers.c b/src/lib/buffers/gauge_allocate_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_allocate_gauge_buffers.c
rename to src/lib/buffers/gauge_allocate_gauge_buffers.c
diff --git a/buffers/gauge_finalize_gauge_buffers.c b/src/lib/buffers/gauge_finalize_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_finalize_gauge_buffers.c
rename to src/lib/buffers/gauge_finalize_gauge_buffers.c
diff --git a/buffers/gauge_free_unused_gauge_buffers.c b/src/lib/buffers/gauge_free_unused_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_free_unused_gauge_buffers.c
rename to src/lib/buffers/gauge_free_unused_gauge_buffers.c
diff --git a/buffers/gauge_get_gauge_field.c b/src/lib/buffers/gauge_get_gauge_field.c
similarity index 100%
rename from buffers/gauge_get_gauge_field.c
rename to src/lib/buffers/gauge_get_gauge_field.c
diff --git a/buffers/gauge_get_gauge_field_array.c b/src/lib/buffers/gauge_get_gauge_field_array.c
similarity index 100%
rename from buffers/gauge_get_gauge_field_array.c
rename to src/lib/buffers/gauge_get_gauge_field_array.c
diff --git a/buffers/gauge_initialize_gauge_buffers.c b/src/lib/buffers/gauge_initialize_gauge_buffers.c
similarity index 100%
rename from buffers/gauge_initialize_gauge_buffers.c
rename to src/lib/buffers/gauge_initialize_gauge_buffers.c
diff --git a/buffers/gauge_return_gauge_field.c b/src/lib/buffers/gauge_return_gauge_field.c
similarity index 100%
rename from buffers/gauge_return_gauge_field.c
rename to src/lib/buffers/gauge_return_gauge_field.c
diff --git a/buffers/gauge_return_gauge_field_array.c b/src/lib/buffers/gauge_return_gauge_field_array.c
similarity index 100%
rename from buffers/gauge_return_gauge_field_array.c
rename to src/lib/buffers/gauge_return_gauge_field_array.c
diff --git a/buffers/utils.h b/src/lib/buffers/utils.h
similarity index 100%
rename from buffers/utils.h
rename to src/lib/buffers/utils.h
diff --git a/buffers/utils.ih b/src/lib/buffers/utils.ih
similarity index 100%
rename from buffers/utils.ih
rename to src/lib/buffers/utils.ih
diff --git a/buffers/utils_generic_exchange.blocking.inc b/src/lib/buffers/utils_generic_exchange.blocking.inc
similarity index 100%
rename from buffers/utils_generic_exchange.blocking.inc
rename to src/lib/buffers/utils_generic_exchange.blocking.inc
diff --git a/buffers/utils_generic_exchange.c b/src/lib/buffers/utils_generic_exchange.c
similarity index 100%
rename from buffers/utils_generic_exchange.c
rename to src/lib/buffers/utils_generic_exchange.c
diff --git a/buffers/utils_generic_exchange.nonblocking.inc b/src/lib/buffers/utils_generic_exchange.nonblocking.inc
similarity index 100%
rename from buffers/utils_generic_exchange.nonblocking.inc
rename to src/lib/buffers/utils_generic_exchange.nonblocking.inc
diff --git a/chebyshev_polynomial.c b/src/lib/chebyshev_polynomial.c
similarity index 98%
rename from chebyshev_polynomial.c
rename to src/lib/chebyshev_polynomial.c
index 501937b80..f67055cda 100644
--- a/chebyshev_polynomial.c
+++ b/src/lib/chebyshev_polynomial.c
@@ -280,11 +280,11 @@ void degree_of_polynomial(const int repro) {
           printf("||auxc_3||=%e\n",temp); */
 
     diff(&auxs[0], &auxs[0], &aux3s[0], VOLUME / 2);
-    temp = square_norm(&auxs[0], VOLUME / 2) / square_norm(&aux3s[0], VOLUME / 2, 1) / 4.0;
+    temp = square_norm(&auxs[0], VOLUME / 2, 1) / square_norm(&aux3s[0], VOLUME / 2, 1) / 4.0;
     if (g_proc_id == g_stdio_proc) {
       printf("difference=%e\n", temp);
       diff(&auxc[0], &auxc[0], &aux3c[0], VOLUME / 2);
-      temp = square_norm(&auxc[0], VOLUME / 2) / square_norm(&aux3c[0], VOLUME / 2, 1) / 4.0;
+      temp = square_norm(&auxc[0], VOLUME / 2, 1) / square_norm(&aux3c[0], VOLUME / 2, 1) / 4.0;
       printf("difference=%e\n", temp);
     }
     if (temp < stopeps) break;
diff --git a/chebyshev_polynomial.h b/src/lib/chebyshev_polynomial.h
similarity index 100%
rename from chebyshev_polynomial.h
rename to src/lib/chebyshev_polynomial.h
diff --git a/chebyshev_polynomial_nd.c b/src/lib/chebyshev_polynomial_nd.c
similarity index 100%
rename from chebyshev_polynomial_nd.c
rename to src/lib/chebyshev_polynomial_nd.c
diff --git a/chebyshev_polynomial_nd.h b/src/lib/chebyshev_polynomial_nd.h
similarity index 100%
rename from chebyshev_polynomial_nd.h
rename to src/lib/chebyshev_polynomial_nd.h
diff --git a/clenshaw_coef.c b/src/lib/clenshaw_coef.c
similarity index 100%
rename from clenshaw_coef.c
rename to src/lib/clenshaw_coef.c
diff --git a/clenshaw_coef.h b/src/lib/clenshaw_coef.h
similarity index 100%
rename from clenshaw_coef.h
rename to src/lib/clenshaw_coef.h
diff --git a/compare_derivative.c b/src/lib/compare_derivative.c
similarity index 100%
rename from compare_derivative.c
rename to src/lib/compare_derivative.c
diff --git a/compare_derivative.h b/src/lib/compare_derivative.h
similarity index 100%
rename from compare_derivative.h
rename to src/lib/compare_derivative.h
diff --git a/cu/COPYING b/src/lib/cu/COPYING
similarity index 100%
rename from cu/COPYING
rename to src/lib/cu/COPYING
diff --git a/cu/COPYING.LESSER b/src/lib/cu/COPYING.LESSER
similarity index 100%
rename from cu/COPYING.LESSER
rename to src/lib/cu/COPYING.LESSER
diff --git a/cu/Makefile.in b/src/lib/cu/Makefile.in
similarity index 100%
rename from cu/Makefile.in
rename to src/lib/cu/Makefile.in
diff --git a/cu/check-regressions b/src/lib/cu/check-regressions
similarity index 100%
rename from cu/check-regressions
rename to src/lib/cu/check-regressions
diff --git a/cu/cu.c b/src/lib/cu/cu.c
similarity index 100%
rename from cu/cu.c
rename to src/lib/cu/cu.c
diff --git a/cu/cu.h b/src/lib/cu/cu.h
similarity index 100%
rename from cu/cu.h
rename to src/lib/cu/cu.h
diff --git a/default_input_values.h b/src/lib/default_input_values.h
similarity index 100%
rename from default_input_values.h
rename to src/lib/default_input_values.h
diff --git a/deriv_Sb.c b/src/lib/deriv_Sb.c
similarity index 100%
rename from deriv_Sb.c
rename to src/lib/deriv_Sb.c
diff --git a/deriv_Sb.h b/src/lib/deriv_Sb.h
similarity index 100%
rename from deriv_Sb.h
rename to src/lib/deriv_Sb.h
diff --git a/deriv_Sb_D_psi.c b/src/lib/deriv_Sb_D_psi.c
similarity index 100%
rename from deriv_Sb_D_psi.c
rename to src/lib/deriv_Sb_D_psi.c
diff --git a/deriv_Sb_D_psi.h b/src/lib/deriv_Sb_D_psi.h
similarity index 100%
rename from deriv_Sb_D_psi.h
rename to src/lib/deriv_Sb_D_psi.h
diff --git a/expo.c b/src/lib/expo.c
similarity index 100%
rename from expo.c
rename to src/lib/expo.c
diff --git a/expo.h b/src/lib/expo.h
similarity index 100%
rename from expo.h
rename to src/lib/expo.h
diff --git a/fatal_error.c b/src/lib/fatal_error.c
similarity index 100%
rename from fatal_error.c
rename to src/lib/fatal_error.c
diff --git a/fatal_error.h b/src/lib/fatal_error.h
similarity index 100%
rename from fatal_error.h
rename to src/lib/fatal_error.h
diff --git a/gamma.c b/src/lib/gamma.c
similarity index 100%
rename from gamma.c
rename to src/lib/gamma.c
diff --git a/gamma.h b/src/lib/gamma.h
similarity index 100%
rename from gamma.h
rename to src/lib/gamma.h
diff --git a/geometry_eo.c b/src/lib/geometry_eo.c
similarity index 100%
rename from geometry_eo.c
rename to src/lib/geometry_eo.c
diff --git a/geometry_eo.h b/src/lib/geometry_eo.h
similarity index 100%
rename from geometry_eo.h
rename to src/lib/geometry_eo.h
diff --git a/get_rectangle_staples.c b/src/lib/get_rectangle_staples.c
similarity index 100%
rename from get_rectangle_staples.c
rename to src/lib/get_rectangle_staples.c
diff --git a/get_rectangle_staples.h b/src/lib/get_rectangle_staples.h
similarity index 100%
rename from get_rectangle_staples.h
rename to src/lib/get_rectangle_staples.h
diff --git a/get_staples.c b/src/lib/get_staples.c
similarity index 100%
rename from get_staples.c
rename to src/lib/get_staples.c
diff --git a/get_staples.h b/src/lib/get_staples.h
similarity index 100%
rename from get_staples.h
rename to src/lib/get_staples.h
diff --git a/getopt.c b/src/lib/getopt.c
similarity index 100%
rename from getopt.c
rename to src/lib/getopt.c
diff --git a/getopt.h b/src/lib/getopt.h
similarity index 100%
rename from getopt.h
rename to src/lib/getopt.h
diff --git a/gettime.c b/src/lib/gettime.c
similarity index 100%
rename from gettime.c
rename to src/lib/gettime.c
diff --git a/gettime.h b/src/lib/gettime.h
similarity index 100%
rename from gettime.h
rename to src/lib/gettime.h
diff --git a/global.h b/src/lib/global.h
similarity index 100%
rename from global.h
rename to src/lib/global.h
diff --git a/hamiltonian_field.h b/src/lib/hamiltonian_field.h
similarity index 100%
rename from hamiltonian_field.h
rename to src/lib/hamiltonian_field.h
diff --git a/include/tmLQCD.h b/src/lib/include/tmLQCD.h
similarity index 100%
rename from include/tmLQCD.h
rename to src/lib/include/tmLQCD.h
diff --git a/include/tmlqcd_config.h b/src/lib/include/tmlqcd_config.h
similarity index 100%
rename from include/tmlqcd_config.h
rename to src/lib/include/tmlqcd_config.h
diff --git a/init/Makefile.in b/src/lib/init/Makefile.in
similarity index 100%
rename from init/Makefile.in
rename to src/lib/init/Makefile.in
diff --git a/init/init.h b/src/lib/init/init.h
similarity index 100%
rename from init/init.h
rename to src/lib/init/init.h
diff --git a/init/init_bispinor_field.c b/src/lib/init/init_bispinor_field.c
similarity index 100%
rename from init/init_bispinor_field.c
rename to src/lib/init/init_bispinor_field.c
diff --git a/init/init_bispinor_field.h b/src/lib/init/init_bispinor_field.h
similarity index 100%
rename from init/init_bispinor_field.h
rename to src/lib/init/init_bispinor_field.h
diff --git a/init/init_chi_spinor_field.c b/src/lib/init/init_chi_spinor_field.c
similarity index 100%
rename from init/init_chi_spinor_field.c
rename to src/lib/init/init_chi_spinor_field.c
diff --git a/init/init_chi_spinor_field.h b/src/lib/init/init_chi_spinor_field.h
similarity index 100%
rename from init/init_chi_spinor_field.h
rename to src/lib/init/init_chi_spinor_field.h
diff --git a/init/init_critical_globals.c b/src/lib/init/init_critical_globals.c
similarity index 100%
rename from init/init_critical_globals.c
rename to src/lib/init/init_critical_globals.c
diff --git a/init/init_critical_globals.h b/src/lib/init/init_critical_globals.h
similarity index 100%
rename from init/init_critical_globals.h
rename to src/lib/init/init_critical_globals.h
diff --git a/init/init_dirac_halfspinor.c b/src/lib/init/init_dirac_halfspinor.c
similarity index 100%
rename from init/init_dirac_halfspinor.c
rename to src/lib/init/init_dirac_halfspinor.c
diff --git a/init/init_dirac_halfspinor.h b/src/lib/init/init_dirac_halfspinor.h
similarity index 100%
rename from init/init_dirac_halfspinor.h
rename to src/lib/init/init_dirac_halfspinor.h
diff --git a/init/init_gauge_fg.c b/src/lib/init/init_gauge_fg.c
similarity index 100%
rename from init/init_gauge_fg.c
rename to src/lib/init/init_gauge_fg.c
diff --git a/init/init_gauge_fg.h b/src/lib/init/init_gauge_fg.h
similarity index 100%
rename from init/init_gauge_fg.h
rename to src/lib/init/init_gauge_fg.h
diff --git a/init/init_gauge_field.c b/src/lib/init/init_gauge_field.c
similarity index 100%
rename from init/init_gauge_field.c
rename to src/lib/init/init_gauge_field.c
diff --git a/init/init_gauge_field.h b/src/lib/init/init_gauge_field.h
similarity index 100%
rename from init/init_gauge_field.h
rename to src/lib/init/init_gauge_field.h
diff --git a/init/init_gauge_tmp.c b/src/lib/init/init_gauge_tmp.c
similarity index 100%
rename from init/init_gauge_tmp.c
rename to src/lib/init/init_gauge_tmp.c
diff --git a/init/init_gauge_tmp.h b/src/lib/init/init_gauge_tmp.h
similarity index 100%
rename from init/init_gauge_tmp.h
rename to src/lib/init/init_gauge_tmp.h
diff --git a/init/init_geometry_indices.c b/src/lib/init/init_geometry_indices.c
similarity index 100%
rename from init/init_geometry_indices.c
rename to src/lib/init/init_geometry_indices.c
diff --git a/init/init_geometry_indices.h b/src/lib/init/init_geometry_indices.h
similarity index 100%
rename from init/init_geometry_indices.h
rename to src/lib/init/init_geometry_indices.h
diff --git a/init/init_global_states.c b/src/lib/init/init_global_states.c
similarity index 100%
rename from init/init_global_states.c
rename to src/lib/init/init_global_states.c
diff --git a/init/init_global_states.h b/src/lib/init/init_global_states.h
similarity index 100%
rename from init/init_global_states.h
rename to src/lib/init/init_global_states.h
diff --git a/init/init_moment_field.c b/src/lib/init/init_moment_field.c
similarity index 100%
rename from init/init_moment_field.c
rename to src/lib/init/init_moment_field.c
diff --git a/init/init_moment_field.h b/src/lib/init/init_moment_field.h
similarity index 100%
rename from init/init_moment_field.h
rename to src/lib/init/init_moment_field.h
diff --git a/init/init_omp_accumulators.c b/src/lib/init/init_omp_accumulators.c
similarity index 100%
rename from init/init_omp_accumulators.c
rename to src/lib/init/init_omp_accumulators.c
diff --git a/init/init_omp_accumulators.h b/src/lib/init/init_omp_accumulators.h
similarity index 100%
rename from init/init_omp_accumulators.h
rename to src/lib/init/init_omp_accumulators.h
diff --git a/init/init_openmp.c b/src/lib/init/init_openmp.c
similarity index 100%
rename from init/init_openmp.c
rename to src/lib/init/init_openmp.c
diff --git a/init/init_openmp.h b/src/lib/init/init_openmp.h
similarity index 100%
rename from init/init_openmp.h
rename to src/lib/init/init_openmp.h
diff --git a/init/init_parallel.c b/src/lib/init/init_parallel.c
similarity index 100%
rename from init/init_parallel.c
rename to src/lib/init/init_parallel.c
diff --git a/init/init_parallel.h b/src/lib/init/init_parallel.h
similarity index 100%
rename from init/init_parallel.h
rename to src/lib/init/init_parallel.h
diff --git a/init/init_spinor_field.c b/src/lib/init/init_spinor_field.c
similarity index 100%
rename from init/init_spinor_field.c
rename to src/lib/init/init_spinor_field.c
diff --git a/init/init_spinor_field.h b/src/lib/init/init_spinor_field.h
similarity index 100%
rename from init/init_spinor_field.h
rename to src/lib/init/init_spinor_field.h
diff --git a/init/init_stout_smear_vars.c b/src/lib/init/init_stout_smear_vars.c
similarity index 100%
rename from init/init_stout_smear_vars.c
rename to src/lib/init/init_stout_smear_vars.c
diff --git a/init/init_stout_smear_vars.h b/src/lib/init/init_stout_smear_vars.h
similarity index 100%
rename from init/init_stout_smear_vars.h
rename to src/lib/init/init_stout_smear_vars.h
diff --git a/integrator.c b/src/lib/integrator.c
similarity index 100%
rename from integrator.c
rename to src/lib/integrator.c
diff --git a/integrator.h b/src/lib/integrator.h
similarity index 100%
rename from integrator.h
rename to src/lib/integrator.h
diff --git a/invert_clover_eo.c b/src/lib/invert_clover_eo.c
similarity index 100%
rename from invert_clover_eo.c
rename to src/lib/invert_clover_eo.c
diff --git a/invert_clover_eo.h b/src/lib/invert_clover_eo.h
similarity index 100%
rename from invert_clover_eo.h
rename to src/lib/invert_clover_eo.h
diff --git a/invert_doublet_eo.c b/src/lib/invert_doublet_eo.c
similarity index 100%
rename from invert_doublet_eo.c
rename to src/lib/invert_doublet_eo.c
diff --git a/invert_doublet_eo.h b/src/lib/invert_doublet_eo.h
similarity index 100%
rename from invert_doublet_eo.h
rename to src/lib/invert_doublet_eo.h
diff --git a/invert_eo.c b/src/lib/invert_eo.c
similarity index 99%
rename from invert_eo.c
rename to src/lib/invert_eo.c
index 25ee4a297..997cab021 100644
--- a/invert_eo.c
+++ b/src/lib/invert_eo.c
@@ -34,6 +34,7 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
+
 #include <stdlib.h>
 #include "gamma.h"
 #include "global.h"
diff --git a/invert_eo.h b/src/lib/invert_eo.h
similarity index 100%
rename from invert_eo.h
rename to src/lib/invert_eo.h
diff --git a/invert_overlap.c b/src/lib/invert_overlap.c
similarity index 100%
rename from invert_overlap.c
rename to src/lib/invert_overlap.c
diff --git a/invert_overlap.h b/src/lib/invert_overlap.h
similarity index 100%
rename from invert_overlap.h
rename to src/lib/invert_overlap.h
diff --git a/io/DML_crc32.c b/src/lib/io/DML_crc32.c
similarity index 100%
rename from io/DML_crc32.c
rename to src/lib/io/DML_crc32.c
diff --git a/io/deri_write_stdout.c b/src/lib/io/deri_write_stdout.c
similarity index 100%
rename from io/deri_write_stdout.c
rename to src/lib/io/deri_write_stdout.c
diff --git a/io/deri_write_stdout.h b/src/lib/io/deri_write_stdout.h
similarity index 100%
rename from io/deri_write_stdout.h
rename to src/lib/io/deri_write_stdout.h
diff --git a/io/dml.c b/src/lib/io/dml.c
similarity index 100%
rename from io/dml.c
rename to src/lib/io/dml.c
diff --git a/io/dml.h b/src/lib/io/dml.h
similarity index 100%
rename from io/dml.h
rename to src/lib/io/dml.h
diff --git a/io/eospinor.h b/src/lib/io/eospinor.h
similarity index 100%
rename from io/eospinor.h
rename to src/lib/io/eospinor.h
diff --git a/io/eospinor.ih b/src/lib/io/eospinor.ih
similarity index 100%
rename from io/eospinor.ih
rename to src/lib/io/eospinor.ih
diff --git a/io/eospinor_read.c b/src/lib/io/eospinor_read.c
similarity index 100%
rename from io/eospinor_read.c
rename to src/lib/io/eospinor_read.c
diff --git a/io/eospinor_write.c b/src/lib/io/eospinor_write.c
similarity index 100%
rename from io/eospinor_write.c
rename to src/lib/io/eospinor_write.c
diff --git a/io/gauge.h b/src/lib/io/gauge.h
similarity index 100%
rename from io/gauge.h
rename to src/lib/io/gauge.h
diff --git a/io/gauge.ih b/src/lib/io/gauge.ih
similarity index 100%
rename from io/gauge.ih
rename to src/lib/io/gauge.ih
diff --git a/io/gauge_read.c b/src/lib/io/gauge_read.c
similarity index 100%
rename from io/gauge_read.c
rename to src/lib/io/gauge_read.c
diff --git a/io/gauge_read_binary.c b/src/lib/io/gauge_read_binary.c
similarity index 100%
rename from io/gauge_read_binary.c
rename to src/lib/io/gauge_read_binary.c
diff --git a/io/gauge_write.c b/src/lib/io/gauge_write.c
similarity index 100%
rename from io/gauge_write.c
rename to src/lib/io/gauge_write.c
diff --git a/io/gauge_write_binary.c b/src/lib/io/gauge_write_binary.c
similarity index 100%
rename from io/gauge_write_binary.c
rename to src/lib/io/gauge_write_binary.c
diff --git a/io/gauge_write_luscher_binary.c b/src/lib/io/gauge_write_luscher_binary.c
similarity index 100%
rename from io/gauge_write_luscher_binary.c
rename to src/lib/io/gauge_write_luscher_binary.c
diff --git a/io/gauge_write_luscher_binary.h b/src/lib/io/gauge_write_luscher_binary.h
similarity index 100%
rename from io/gauge_write_luscher_binary.h
rename to src/lib/io/gauge_write_luscher_binary.h
diff --git a/io/io_cm.c b/src/lib/io/io_cm.c
similarity index 100%
rename from io/io_cm.c
rename to src/lib/io/io_cm.c
diff --git a/io/io_cm.h b/src/lib/io/io_cm.h
similarity index 100%
rename from io/io_cm.h
rename to src/lib/io/io_cm.h
diff --git a/io/params.h b/src/lib/io/params.h
similarity index 100%
rename from io/params.h
rename to src/lib/io/params.h
diff --git a/io/params.ih b/src/lib/io/params.ih
similarity index 100%
rename from io/params.ih
rename to src/lib/io/params.ih
diff --git a/io/params_construct_InverterInfo.c b/src/lib/io/params_construct_InverterInfo.c
similarity index 100%
rename from io/params_construct_InverterInfo.c
rename to src/lib/io/params_construct_InverterInfo.c
diff --git a/io/params_construct_ildgFormat.c b/src/lib/io/params_construct_ildgFormat.c
similarity index 100%
rename from io/params_construct_ildgFormat.c
rename to src/lib/io/params_construct_ildgFormat.c
diff --git a/io/params_construct_propagatorFormat.c b/src/lib/io/params_construct_propagatorFormat.c
similarity index 100%
rename from io/params_construct_propagatorFormat.c
rename to src/lib/io/params_construct_propagatorFormat.c
diff --git a/io/params_construct_sourceFormat.c b/src/lib/io/params_construct_sourceFormat.c
similarity index 100%
rename from io/params_construct_sourceFormat.c
rename to src/lib/io/params_construct_sourceFormat.c
diff --git a/io/params_construct_xlfInfo.c b/src/lib/io/params_construct_xlfInfo.c
similarity index 100%
rename from io/params_construct_xlfInfo.c
rename to src/lib/io/params_construct_xlfInfo.c
diff --git a/io/selector.h b/src/lib/io/selector.h
similarity index 100%
rename from io/selector.h
rename to src/lib/io/selector.h
diff --git a/io/spinor.h b/src/lib/io/spinor.h
similarity index 100%
rename from io/spinor.h
rename to src/lib/io/spinor.h
diff --git a/io/spinor.ih b/src/lib/io/spinor.ih
similarity index 100%
rename from io/spinor.ih
rename to src/lib/io/spinor.ih
diff --git a/io/spinor_read.c b/src/lib/io/spinor_read.c
similarity index 100%
rename from io/spinor_read.c
rename to src/lib/io/spinor_read.c
diff --git a/io/spinor_read_binary.c b/src/lib/io/spinor_read_binary.c
similarity index 100%
rename from io/spinor_read_binary.c
rename to src/lib/io/spinor_read_binary.c
diff --git a/io/spinor_write.c b/src/lib/io/spinor_write.c
similarity index 100%
rename from io/spinor_write.c
rename to src/lib/io/spinor_write.c
diff --git a/io/spinor_write_binary.c b/src/lib/io/spinor_write_binary.c
similarity index 100%
rename from io/spinor_write_binary.c
rename to src/lib/io/spinor_write_binary.c
diff --git a/io/spinor_write_info.c b/src/lib/io/spinor_write_info.c
similarity index 100%
rename from io/spinor_write_info.c
rename to src/lib/io/spinor_write_info.c
diff --git a/io/spinor_write_propagator_format.c b/src/lib/io/spinor_write_propagator_format.c
similarity index 100%
rename from io/spinor_write_propagator_format.c
rename to src/lib/io/spinor_write_propagator_format.c
diff --git a/io/spinor_write_propagator_type.c b/src/lib/io/spinor_write_propagator_type.c
similarity index 100%
rename from io/spinor_write_propagator_type.c
rename to src/lib/io/spinor_write_propagator_type.c
diff --git a/io/spinor_write_source_format.c b/src/lib/io/spinor_write_source_format.c
similarity index 100%
rename from io/spinor_write_source_format.c
rename to src/lib/io/spinor_write_source_format.c
diff --git a/io/spinor_write_stdout.c b/src/lib/io/spinor_write_stdout.c
similarity index 100%
rename from io/spinor_write_stdout.c
rename to src/lib/io/spinor_write_stdout.c
diff --git a/io/spinor_write_stdout.h b/src/lib/io/spinor_write_stdout.h
similarity index 100%
rename from io/spinor_write_stdout.h
rename to src/lib/io/spinor_write_stdout.h
diff --git a/io/sw_write_stdout.c b/src/lib/io/sw_write_stdout.c
similarity index 100%
rename from io/sw_write_stdout.c
rename to src/lib/io/sw_write_stdout.c
diff --git a/io/sw_write_stdout.h b/src/lib/io/sw_write_stdout.h
similarity index 100%
rename from io/sw_write_stdout.h
rename to src/lib/io/sw_write_stdout.h
diff --git a/io/utils.c b/src/lib/io/utils.c
similarity index 100%
rename from io/utils.c
rename to src/lib/io/utils.c
diff --git a/io/utils.h b/src/lib/io/utils.h
similarity index 99%
rename from io/utils.h
rename to src/lib/io/utils.h
index afcca1553..85e98a5e2 100644
--- a/io/utils.h
+++ b/src/lib/io/utils.h
@@ -20,9 +20,7 @@
 #ifndef _UTILS_H
 #define _UTILS_H
 
-#ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
-#endif
 
 #include "io/dml.h"
 #include "io/params.h"
diff --git a/io/utils.ih b/src/lib/io/utils.ih
similarity index 96%
rename from io/utils.ih
rename to src/lib/io/utils.ih
index 073bd64b5..dd963b5b9 100644
--- a/io/utils.ih
+++ b/src/lib/io/utils.ih
@@ -18,9 +18,7 @@
 ***********************************************************************/
 
 #include <lime.h>
-#ifdef HAVE_CONFIG_H
 #include "tmlqcd_config.h"
-#endif
 
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,7 +27,7 @@
 #include <endian.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#ifdef MPI
+#ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
 #include <unistd.h>
diff --git a/io/utils_close_reader_record.c b/src/lib/io/utils_close_reader_record.c
similarity index 100%
rename from io/utils_close_reader_record.c
rename to src/lib/io/utils_close_reader_record.c
diff --git a/io/utils_close_writer_record.c b/src/lib/io/utils_close_writer_record.c
similarity index 100%
rename from io/utils_close_writer_record.c
rename to src/lib/io/utils_close_writer_record.c
diff --git a/io/utils_construct_reader.c b/src/lib/io/utils_construct_reader.c
similarity index 97%
rename from io/utils_construct_reader.c
rename to src/lib/io/utils_construct_reader.c
index 085206786..2714455b2 100644
--- a/io/utils_construct_reader.c
+++ b/src/lib/io/utils_construct_reader.c
@@ -1,5 +1,11 @@
 #include "utils.ih"
 
+/* MPI_Comm comes from <mpi.h>, which utils.ih includes only when
+ * TM_USE_MPI is defined, so the declaration must be guarded too. */
+#ifdef TM_USE_MPI
+extern MPI_Comm g_cart_grid;
+#endif
+
 void construct_reader(READER **reader, char *filename) {
   LIME_FILE *fh = NULL;
   int status = 0;
diff --git a/io/utils_construct_writer.c b/src/lib/io/utils_construct_writer.c
similarity index 100%
rename from io/utils_construct_writer.c
rename to src/lib/io/utils_construct_writer.c
diff --git a/io/utils_destruct_reader.c b/src/lib/io/utils_destruct_reader.c
similarity index 100%
rename from io/utils_destruct_reader.c
rename to src/lib/io/utils_destruct_reader.c
diff --git a/io/utils_destruct_writer.c b/src/lib/io/utils_destruct_writer.c
similarity index 100%
rename from io/utils_destruct_writer.c
rename to src/lib/io/utils_destruct_writer.c
diff --git a/io/utils_engineering.c b/src/lib/io/utils_engineering.c
similarity index 100%
rename from io/utils_engineering.c
rename to src/lib/io/utils_engineering.c
diff --git a/io/utils_kill_with_error.c b/src/lib/io/utils_kill_with_error.c
similarity index 100%
rename from io/utils_kill_with_error.c
rename to src/lib/io/utils_kill_with_error.c
diff --git a/io/utils_parse_checksum_xml.c b/src/lib/io/utils_parse_checksum_xml.c
similarity index 100%
rename from io/utils_parse_checksum_xml.c
rename to src/lib/io/utils_parse_checksum_xml.c
diff --git a/io/utils_parse_ildgformat_xml.c b/src/lib/io/utils_parse_ildgformat_xml.c
similarity index 100%
rename from io/utils_parse_ildgformat_xml.c
rename to src/lib/io/utils_parse_ildgformat_xml.c
diff --git a/io/utils_parse_propagator_type.c b/src/lib/io/utils_parse_propagator_type.c
similarity index 100%
rename from io/utils_parse_propagator_type.c
rename to src/lib/io/utils_parse_propagator_type.c
diff --git a/io/utils_read_message.c b/src/lib/io/utils_read_message.c
similarity index 100%
rename from io/utils_read_message.c
rename to src/lib/io/utils_read_message.c
diff --git a/io/utils_write_checksum.c b/src/lib/io/utils_write_checksum.c
similarity index 100%
rename from io/utils_write_checksum.c
rename to src/lib/io/utils_write_checksum.c
diff --git a/io/utils_write_first_message.c b/src/lib/io/utils_write_first_message.c
similarity index 100%
rename from io/utils_write_first_message.c
rename to src/lib/io/utils_write_first_message.c
diff --git a/io/utils_write_header.c b/src/lib/io/utils_write_header.c
similarity index 100%
rename from io/utils_write_header.c
rename to src/lib/io/utils_write_header.c
diff --git a/io/utils_write_ildg_format.c b/src/lib/io/utils_write_ildg_format.c
similarity index 100%
rename from io/utils_write_ildg_format.c
rename to src/lib/io/utils_write_ildg_format.c
diff --git a/io/utils_write_inverter_info.c b/src/lib/io/utils_write_inverter_info.c
similarity index 100%
rename from io/utils_write_inverter_info.c
rename to src/lib/io/utils_write_inverter_info.c
diff --git a/io/utils_write_message.c b/src/lib/io/utils_write_message.c
similarity index 100%
rename from io/utils_write_message.c
rename to src/lib/io/utils_write_message.c
diff --git a/io/utils_write_xlf.c b/src/lib/io/utils_write_xlf.c
similarity index 100%
rename from io/utils_write_xlf.c
rename to src/lib/io/utils_write_xlf.c
diff --git a/io/utils_write_xlf_xml.c b/src/lib/io/utils_write_xlf_xml.c
similarity index 100%
rename from io/utils_write_xlf_xml.c
rename to src/lib/io/utils_write_xlf_xml.c
diff --git a/kahan_summation.h b/src/lib/kahan_summation.h
similarity index 100%
rename from kahan_summation.h
rename to src/lib/kahan_summation.h
diff --git a/linalg/Makefile.in b/src/lib/linalg/Makefile.in
similarity index 100%
rename from linalg/Makefile.in
rename to src/lib/linalg/Makefile.in
diff --git a/linalg/add.c b/src/lib/linalg/add.c
similarity index 100%
rename from linalg/add.c
rename to src/lib/linalg/add.c
diff --git a/linalg/add.h b/src/lib/linalg/add.h
similarity index 100%
rename from linalg/add.h
rename to src/lib/linalg/add.h
diff --git a/linalg/addto_32.c b/src/lib/linalg/addto_32.c
similarity index 100%
rename from linalg/addto_32.c
rename to src/lib/linalg/addto_32.c
diff --git a/linalg/addto_32.h b/src/lib/linalg/addto_32.h
similarity index 100%
rename from linalg/addto_32.h
rename to src/lib/linalg/addto_32.h
diff --git a/linalg/assign.c b/src/lib/linalg/assign.c
similarity index 100%
rename from linalg/assign.c
rename to src/lib/linalg/assign.c
diff --git a/linalg/assign.h b/src/lib/linalg/assign.h
similarity index 100%
rename from linalg/assign.h
rename to src/lib/linalg/assign.h
diff --git a/linalg/assign_add_mul.c b/src/lib/linalg/assign_add_mul.c
similarity index 100%
rename from linalg/assign_add_mul.c
rename to src/lib/linalg/assign_add_mul.c
diff --git a/linalg/assign_add_mul.h b/src/lib/linalg/assign_add_mul.h
similarity index 100%
rename from linalg/assign_add_mul.h
rename to src/lib/linalg/assign_add_mul.h
diff --git a/linalg/assign_add_mul_add_mul.c b/src/lib/linalg/assign_add_mul_add_mul.c
similarity index 100%
rename from linalg/assign_add_mul_add_mul.c
rename to src/lib/linalg/assign_add_mul_add_mul.c
diff --git a/linalg/assign_add_mul_add_mul.h b/src/lib/linalg/assign_add_mul_add_mul.h
similarity index 100%
rename from linalg/assign_add_mul_add_mul.h
rename to src/lib/linalg/assign_add_mul_add_mul.h
diff --git a/linalg/assign_add_mul_add_mul_r.c b/src/lib/linalg/assign_add_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_add_mul_add_mul_r.c
rename to src/lib/linalg/assign_add_mul_add_mul_r.c
diff --git a/linalg/assign_add_mul_add_mul_r.h b/src/lib/linalg/assign_add_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_add_mul_add_mul_r.h
rename to src/lib/linalg/assign_add_mul_add_mul_r.h
diff --git a/linalg/assign_add_mul_body.c b/src/lib/linalg/assign_add_mul_body.c
similarity index 100%
rename from linalg/assign_add_mul_body.c
rename to src/lib/linalg/assign_add_mul_body.c
diff --git a/linalg/assign_add_mul_r.c b/src/lib/linalg/assign_add_mul_r.c
similarity index 100%
rename from linalg/assign_add_mul_r.c
rename to src/lib/linalg/assign_add_mul_r.c
diff --git a/linalg/assign_add_mul_r.h b/src/lib/linalg/assign_add_mul_r.h
similarity index 100%
rename from linalg/assign_add_mul_r.h
rename to src/lib/linalg/assign_add_mul_r.h
diff --git a/linalg/assign_add_mul_r_32.c b/src/lib/linalg/assign_add_mul_r_32.c
similarity index 93%
rename from linalg/assign_add_mul_r_32.c
rename to src/lib/linalg/assign_add_mul_r_32.c
index 8df54858b..9f6b1a72f 100644
--- a/linalg/assign_add_mul_r_32.c
+++ b/src/lib/linalg/assign_add_mul_r_32.c
@@ -28,16 +28,13 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
-#ifdef TM_USE_OMP
-#include <omp.h>
-#endif
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "assign_add_mul_r_32.h"
 #include "su3.h"
 
-inline void assign_add_mul_r_32_orphaned(spinor32 *const R, spinor32 *const S, const float c,
+void assign_add_mul_r_32_orphaned(spinor32 *const R, spinor32 *const S, const float c,
                                          const int N) {
 #ifdef TM_USE_OMP
 #pragma omp parallel for
diff --git a/linalg/assign_add_mul_r_32.h b/src/lib/linalg/assign_add_mul_r_32.h
similarity index 100%
rename from linalg/assign_add_mul_r_32.h
rename to src/lib/linalg/assign_add_mul_r_32.h
diff --git a/linalg/assign_add_mul_r_add_mul.c b/src/lib/linalg/assign_add_mul_r_add_mul.c
similarity index 100%
rename from linalg/assign_add_mul_r_add_mul.c
rename to src/lib/linalg/assign_add_mul_r_add_mul.c
diff --git a/linalg/assign_add_mul_r_add_mul.h b/src/lib/linalg/assign_add_mul_r_add_mul.h
similarity index 100%
rename from linalg/assign_add_mul_r_add_mul.h
rename to src/lib/linalg/assign_add_mul_r_add_mul.h
diff --git a/linalg/assign_diff_mul.c b/src/lib/linalg/assign_diff_mul.c
similarity index 100%
rename from linalg/assign_diff_mul.c
rename to src/lib/linalg/assign_diff_mul.c
diff --git a/linalg/assign_diff_mul.h b/src/lib/linalg/assign_diff_mul.h
similarity index 100%
rename from linalg/assign_diff_mul.h
rename to src/lib/linalg/assign_diff_mul.h
diff --git a/linalg/assign_mul_add.c b/src/lib/linalg/assign_mul_add.c
similarity index 100%
rename from linalg/assign_mul_add.c
rename to src/lib/linalg/assign_mul_add.c
diff --git a/linalg/assign_mul_add.h b/src/lib/linalg/assign_mul_add.h
similarity index 100%
rename from linalg/assign_mul_add.h
rename to src/lib/linalg/assign_mul_add.h
diff --git a/linalg/assign_mul_add_mul.c b/src/lib/linalg/assign_mul_add_mul.c
similarity index 100%
rename from linalg/assign_mul_add_mul.c
rename to src/lib/linalg/assign_mul_add_mul.c
diff --git a/linalg/assign_mul_add_mul.h b/src/lib/linalg/assign_mul_add_mul.h
similarity index 100%
rename from linalg/assign_mul_add_mul.h
rename to src/lib/linalg/assign_mul_add_mul.h
diff --git a/linalg/assign_mul_add_mul_add_mul_add_mul_r.c b/src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_add_mul_r.c
rename to src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.c
diff --git a/linalg/assign_mul_add_mul_add_mul_add_mul_r.h b/src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_add_mul_r.h
rename to src/lib/linalg/assign_mul_add_mul_add_mul_add_mul_r.h
diff --git a/linalg/assign_mul_add_mul_add_mul_r.c b/src/lib/linalg/assign_mul_add_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_r.c
rename to src/lib/linalg/assign_mul_add_mul_add_mul_r.c
diff --git a/linalg/assign_mul_add_mul_add_mul_r.h b/src/lib/linalg/assign_mul_add_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_add_mul_add_mul_r.h
rename to src/lib/linalg/assign_mul_add_mul_add_mul_r.h
diff --git a/linalg/assign_mul_add_mul_r.c b/src/lib/linalg/assign_mul_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_add_mul_r.c
rename to src/lib/linalg/assign_mul_add_mul_r.c
diff --git a/linalg/assign_mul_add_mul_r.h b/src/lib/linalg/assign_mul_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_add_mul_r.h
rename to src/lib/linalg/assign_mul_add_mul_r.h
diff --git a/linalg/assign_mul_add_mul_r_32.c b/src/lib/linalg/assign_mul_add_mul_r_32.c
similarity index 100%
rename from linalg/assign_mul_add_mul_r_32.c
rename to src/lib/linalg/assign_mul_add_mul_r_32.c
diff --git a/linalg/assign_mul_add_mul_r_32.h b/src/lib/linalg/assign_mul_add_mul_r_32.h
similarity index 100%
rename from linalg/assign_mul_add_mul_r_32.h
rename to src/lib/linalg/assign_mul_add_mul_r_32.h
diff --git a/linalg/assign_mul_add_r.c b/src/lib/linalg/assign_mul_add_r.c
similarity index 100%
rename from linalg/assign_mul_add_r.c
rename to src/lib/linalg/assign_mul_add_r.c
diff --git a/linalg/assign_mul_add_r.h b/src/lib/linalg/assign_mul_add_r.h
similarity index 100%
rename from linalg/assign_mul_add_r.h
rename to src/lib/linalg/assign_mul_add_r.h
diff --git a/linalg/assign_mul_add_r_32.c b/src/lib/linalg/assign_mul_add_r_32.c
similarity index 100%
rename from linalg/assign_mul_add_r_32.c
rename to src/lib/linalg/assign_mul_add_r_32.c
diff --git a/linalg/assign_mul_add_r_32.h b/src/lib/linalg/assign_mul_add_r_32.h
similarity index 100%
rename from linalg/assign_mul_add_r_32.h
rename to src/lib/linalg/assign_mul_add_r_32.h
diff --git a/linalg/assign_mul_add_r_and_square.c b/src/lib/linalg/assign_mul_add_r_and_square.c
similarity index 100%
rename from linalg/assign_mul_add_r_and_square.c
rename to src/lib/linalg/assign_mul_add_r_and_square.c
diff --git a/linalg/assign_mul_add_r_and_square.h b/src/lib/linalg/assign_mul_add_r_and_square.h
similarity index 100%
rename from linalg/assign_mul_add_r_and_square.h
rename to src/lib/linalg/assign_mul_add_r_and_square.h
diff --git a/linalg/assign_mul_bra_add_mul_ket_add.c b/src/lib/linalg/assign_mul_bra_add_mul_ket_add.c
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add.c
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add.c
diff --git a/linalg/assign_mul_bra_add_mul_ket_add.h b/src/lib/linalg/assign_mul_bra_add_mul_ket_add.h
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add.h
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add.h
diff --git a/linalg/assign_mul_bra_add_mul_ket_add_r.c b/src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.c
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add_r.c
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.c
diff --git a/linalg/assign_mul_bra_add_mul_ket_add_r.h b/src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.h
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_ket_add_r.h
rename to src/lib/linalg/assign_mul_bra_add_mul_ket_add_r.h
diff --git a/linalg/assign_mul_bra_add_mul_r.c b/src/lib/linalg/assign_mul_bra_add_mul_r.c
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_r.c
rename to src/lib/linalg/assign_mul_bra_add_mul_r.c
diff --git a/linalg/assign_mul_bra_add_mul_r.h b/src/lib/linalg/assign_mul_bra_add_mul_r.h
similarity index 100%
rename from linalg/assign_mul_bra_add_mul_r.h
rename to src/lib/linalg/assign_mul_bra_add_mul_r.h
diff --git a/linalg/assign_to_32.c b/src/lib/linalg/assign_to_32.c
similarity index 100%
rename from linalg/assign_to_32.c
rename to src/lib/linalg/assign_to_32.c
diff --git a/linalg/assign_to_32.h b/src/lib/linalg/assign_to_32.h
similarity index 100%
rename from linalg/assign_to_32.h
rename to src/lib/linalg/assign_to_32.h
diff --git a/linalg/blas.h b/src/lib/linalg/blas.h
similarity index 100%
rename from linalg/blas.h
rename to src/lib/linalg/blas.h
diff --git a/linalg/comp_decomp.c b/src/lib/linalg/comp_decomp.c
similarity index 100%
rename from linalg/comp_decomp.c
rename to src/lib/linalg/comp_decomp.c
diff --git a/linalg/comp_decomp.h b/src/lib/linalg/comp_decomp.h
similarity index 100%
rename from linalg/comp_decomp.h
rename to src/lib/linalg/comp_decomp.h
diff --git a/linalg/convert_eo_to_lexic.c b/src/lib/linalg/convert_eo_to_lexic.c
similarity index 100%
rename from linalg/convert_eo_to_lexic.c
rename to src/lib/linalg/convert_eo_to_lexic.c
diff --git a/linalg/convert_eo_to_lexic.h b/src/lib/linalg/convert_eo_to_lexic.h
similarity index 100%
rename from linalg/convert_eo_to_lexic.h
rename to src/lib/linalg/convert_eo_to_lexic.h
diff --git a/linalg/convert_even_to_lexic.c b/src/lib/linalg/convert_even_to_lexic.c
similarity index 100%
rename from linalg/convert_even_to_lexic.c
rename to src/lib/linalg/convert_even_to_lexic.c
diff --git a/linalg/convert_even_to_lexic.h b/src/lib/linalg/convert_even_to_lexic.h
similarity index 100%
rename from linalg/convert_even_to_lexic.h
rename to src/lib/linalg/convert_even_to_lexic.h
diff --git a/linalg/convert_odd_to_lexic.c b/src/lib/linalg/convert_odd_to_lexic.c
similarity index 100%
rename from linalg/convert_odd_to_lexic.c
rename to src/lib/linalg/convert_odd_to_lexic.c
diff --git a/linalg/convert_odd_to_lexic.h b/src/lib/linalg/convert_odd_to_lexic.h
similarity index 100%
rename from linalg/convert_odd_to_lexic.h
rename to src/lib/linalg/convert_odd_to_lexic.h
diff --git a/linalg/diff.c b/src/lib/linalg/diff.c
similarity index 100%
rename from linalg/diff.c
rename to src/lib/linalg/diff.c
diff --git a/linalg/diff.h b/src/lib/linalg/diff.h
similarity index 100%
rename from linalg/diff.h
rename to src/lib/linalg/diff.h
diff --git a/linalg/diff_32.c b/src/lib/linalg/diff_32.c
similarity index 100%
rename from linalg/diff_32.c
rename to src/lib/linalg/diff_32.c
diff --git a/linalg/diff_32.h b/src/lib/linalg/diff_32.h
similarity index 100%
rename from linalg/diff_32.h
rename to src/lib/linalg/diff_32.h
diff --git a/linalg/diff_and_square_norm.c b/src/lib/linalg/diff_and_square_norm.c
similarity index 100%
rename from linalg/diff_and_square_norm.c
rename to src/lib/linalg/diff_and_square_norm.c
diff --git a/linalg/diff_and_square_norm.h b/src/lib/linalg/diff_and_square_norm.h
similarity index 100%
rename from linalg/diff_and_square_norm.h
rename to src/lib/linalg/diff_and_square_norm.h
diff --git a/linalg/fortran.h b/src/lib/linalg/fortran.h
similarity index 100%
rename from linalg/fortran.h
rename to src/lib/linalg/fortran.h
diff --git a/linalg/lapack.h b/src/lib/linalg/lapack.h
similarity index 100%
rename from linalg/lapack.h
rename to src/lib/linalg/lapack.h
diff --git a/linalg/map_to_blas.h b/src/lib/linalg/map_to_blas.h
similarity index 100%
rename from linalg/map_to_blas.h
rename to src/lib/linalg/map_to_blas.h
diff --git a/linalg/mattimesvec.c b/src/lib/linalg/mattimesvec.c
similarity index 100%
rename from linalg/mattimesvec.c
rename to src/lib/linalg/mattimesvec.c
diff --git a/linalg/mattimesvec.h b/src/lib/linalg/mattimesvec.h
similarity index 100%
rename from linalg/mattimesvec.h
rename to src/lib/linalg/mattimesvec.h
diff --git a/linalg/mul.c b/src/lib/linalg/mul.c
similarity index 100%
rename from linalg/mul.c
rename to src/lib/linalg/mul.c
diff --git a/linalg/mul.h b/src/lib/linalg/mul.h
similarity index 100%
rename from linalg/mul.h
rename to src/lib/linalg/mul.h
diff --git a/linalg/mul_add_mul.c b/src/lib/linalg/mul_add_mul.c
similarity index 100%
rename from linalg/mul_add_mul.c
rename to src/lib/linalg/mul_add_mul.c
diff --git a/linalg/mul_add_mul.h b/src/lib/linalg/mul_add_mul.h
similarity index 100%
rename from linalg/mul_add_mul.h
rename to src/lib/linalg/mul_add_mul.h
diff --git a/linalg/mul_add_mul_r.c b/src/lib/linalg/mul_add_mul_r.c
similarity index 100%
rename from linalg/mul_add_mul_r.c
rename to src/lib/linalg/mul_add_mul_r.c
diff --git a/linalg/mul_add_mul_r.h b/src/lib/linalg/mul_add_mul_r.h
similarity index 100%
rename from linalg/mul_add_mul_r.h
rename to src/lib/linalg/mul_add_mul_r.h
diff --git a/linalg/mul_diff_mul.c b/src/lib/linalg/mul_diff_mul.c
similarity index 100%
rename from linalg/mul_diff_mul.c
rename to src/lib/linalg/mul_diff_mul.c
diff --git a/linalg/mul_diff_mul.h b/src/lib/linalg/mul_diff_mul.h
similarity index 100%
rename from linalg/mul_diff_mul.h
rename to src/lib/linalg/mul_diff_mul.h
diff --git a/linalg/mul_diff_mul_r.c b/src/lib/linalg/mul_diff_mul_r.c
similarity index 100%
rename from linalg/mul_diff_mul_r.c
rename to src/lib/linalg/mul_diff_mul_r.c
diff --git a/linalg/mul_diff_mul_r.h b/src/lib/linalg/mul_diff_mul_r.h
similarity index 100%
rename from linalg/mul_diff_mul_r.h
rename to src/lib/linalg/mul_diff_mul_r.h
diff --git a/linalg/mul_diff_r.c b/src/lib/linalg/mul_diff_r.c
similarity index 100%
rename from linalg/mul_diff_r.c
rename to src/lib/linalg/mul_diff_r.c
diff --git a/linalg/mul_diff_r.h b/src/lib/linalg/mul_diff_r.h
similarity index 100%
rename from linalg/mul_diff_r.h
rename to src/lib/linalg/mul_diff_r.h
diff --git a/linalg/mul_gamma5.c b/src/lib/linalg/mul_gamma5.c
similarity index 100%
rename from linalg/mul_gamma5.c
rename to src/lib/linalg/mul_gamma5.c
diff --git a/linalg/mul_gamma5.h b/src/lib/linalg/mul_gamma5.h
similarity index 100%
rename from linalg/mul_gamma5.h
rename to src/lib/linalg/mul_gamma5.h
diff --git a/linalg/mul_r.c b/src/lib/linalg/mul_r.c
similarity index 100%
rename from linalg/mul_r.c
rename to src/lib/linalg/mul_r.c
diff --git a/linalg/mul_r.h b/src/lib/linalg/mul_r.h
similarity index 100%
rename from linalg/mul_r.h
rename to src/lib/linalg/mul_r.h
diff --git a/linalg/mul_r_32.c b/src/lib/linalg/mul_r_32.c
similarity index 100%
rename from linalg/mul_r_32.c
rename to src/lib/linalg/mul_r_32.c
diff --git a/linalg/mul_r_32.h b/src/lib/linalg/mul_r_32.h
similarity index 100%
rename from linalg/mul_r_32.h
rename to src/lib/linalg/mul_r_32.h
diff --git a/linalg/mul_r_gamma5.c b/src/lib/linalg/mul_r_gamma5.c
similarity index 100%
rename from linalg/mul_r_gamma5.c
rename to src/lib/linalg/mul_r_gamma5.c
diff --git a/linalg/mul_r_gamma5.h b/src/lib/linalg/mul_r_gamma5.h
similarity index 100%
rename from linalg/mul_r_gamma5.h
rename to src/lib/linalg/mul_r_gamma5.h
diff --git a/linalg/print_spinor.c b/src/lib/linalg/print_spinor.c
similarity index 100%
rename from linalg/print_spinor.c
rename to src/lib/linalg/print_spinor.c
diff --git a/linalg/print_spinor.h b/src/lib/linalg/print_spinor.h
similarity index 100%
rename from linalg/print_spinor.h
rename to src/lib/linalg/print_spinor.h
diff --git a/linalg/print_spinor_similar_components.c b/src/lib/linalg/print_spinor_similar_components.c
similarity index 100%
rename from linalg/print_spinor_similar_components.c
rename to src/lib/linalg/print_spinor_similar_components.c
diff --git a/linalg/print_spinor_similar_components.h b/src/lib/linalg/print_spinor_similar_components.h
similarity index 100%
rename from linalg/print_spinor_similar_components.h
rename to src/lib/linalg/print_spinor_similar_components.h
diff --git a/linalg/ratio.c b/src/lib/linalg/ratio.c
similarity index 100%
rename from linalg/ratio.c
rename to src/lib/linalg/ratio.c
diff --git a/linalg/ratio.h b/src/lib/linalg/ratio.h
similarity index 100%
rename from linalg/ratio.h
rename to src/lib/linalg/ratio.h
diff --git a/linalg/scalar_prod.c b/src/lib/linalg/scalar_prod.c
similarity index 100%
rename from linalg/scalar_prod.c
rename to src/lib/linalg/scalar_prod.c
diff --git a/linalg/scalar_prod.h b/src/lib/linalg/scalar_prod.h
similarity index 100%
rename from linalg/scalar_prod.h
rename to src/lib/linalg/scalar_prod.h
diff --git a/linalg/scalar_prod_body.c b/src/lib/linalg/scalar_prod_body.c
similarity index 100%
rename from linalg/scalar_prod_body.c
rename to src/lib/linalg/scalar_prod_body.c
diff --git a/linalg/scalar_prod_i.c b/src/lib/linalg/scalar_prod_i.c
similarity index 100%
rename from linalg/scalar_prod_i.c
rename to src/lib/linalg/scalar_prod_i.c
diff --git a/linalg/scalar_prod_i.h b/src/lib/linalg/scalar_prod_i.h
similarity index 100%
rename from linalg/scalar_prod_i.h
rename to src/lib/linalg/scalar_prod_i.h
diff --git a/linalg/scalar_prod_r.c b/src/lib/linalg/scalar_prod_r.c
similarity index 100%
rename from linalg/scalar_prod_r.c
rename to src/lib/linalg/scalar_prod_r.c
diff --git a/linalg/scalar_prod_r.h b/src/lib/linalg/scalar_prod_r.h
similarity index 100%
rename from linalg/scalar_prod_r.h
rename to src/lib/linalg/scalar_prod_r.h
diff --git a/linalg/scalar_prod_r_32.c b/src/lib/linalg/scalar_prod_r_32.c
similarity index 100%
rename from linalg/scalar_prod_r_32.c
rename to src/lib/linalg/scalar_prod_r_32.c
diff --git a/linalg/scalar_prod_r_32.h b/src/lib/linalg/scalar_prod_r_32.h
similarity index 100%
rename from linalg/scalar_prod_r_32.h
rename to src/lib/linalg/scalar_prod_r_32.h
diff --git a/linalg/set_even_to_zero.c b/src/lib/linalg/set_even_to_zero.c
similarity index 100%
rename from linalg/set_even_to_zero.c
rename to src/lib/linalg/set_even_to_zero.c
diff --git a/linalg/set_even_to_zero.h b/src/lib/linalg/set_even_to_zero.h
similarity index 100%
rename from linalg/set_even_to_zero.h
rename to src/lib/linalg/set_even_to_zero.h
diff --git a/linalg/square_and_minmax.c b/src/lib/linalg/square_and_minmax.c
similarity index 100%
rename from linalg/square_and_minmax.c
rename to src/lib/linalg/square_and_minmax.c
diff --git a/linalg/square_and_minmax.h b/src/lib/linalg/square_and_minmax.h
similarity index 100%
rename from linalg/square_and_minmax.h
rename to src/lib/linalg/square_and_minmax.h
diff --git a/linalg/square_and_prod_r.c b/src/lib/linalg/square_and_prod_r.c
similarity index 100%
rename from linalg/square_and_prod_r.c
rename to src/lib/linalg/square_and_prod_r.c
diff --git a/linalg/square_and_prod_r.h b/src/lib/linalg/square_and_prod_r.h
similarity index 100%
rename from linalg/square_and_prod_r.h
rename to src/lib/linalg/square_and_prod_r.h
diff --git a/linalg/square_norm.c b/src/lib/linalg/square_norm.c
similarity index 100%
rename from linalg/square_norm.c
rename to src/lib/linalg/square_norm.c
diff --git a/linalg/square_norm.h b/src/lib/linalg/square_norm.h
similarity index 100%
rename from linalg/square_norm.h
rename to src/lib/linalg/square_norm.h
diff --git a/linalg/square_norm_32.c b/src/lib/linalg/square_norm_32.c
similarity index 100%
rename from linalg/square_norm_32.c
rename to src/lib/linalg/square_norm_32.c
diff --git a/linalg/square_norm_32.h b/src/lib/linalg/square_norm_32.h
similarity index 100%
rename from linalg/square_norm_32.h
rename to src/lib/linalg/square_norm_32.h
diff --git a/linalg_eo.h b/src/lib/linalg_eo.h
similarity index 100%
rename from linalg_eo.h
rename to src/lib/linalg_eo.h
diff --git a/little_D.c b/src/lib/little_D.c
similarity index 100%
rename from little_D.c
rename to src/lib/little_D.c
diff --git a/little_D.h b/src/lib/little_D.h
similarity index 100%
rename from little_D.h
rename to src/lib/little_D.h
diff --git a/little_D_body.c b/src/lib/little_D_body.c
similarity index 100%
rename from little_D_body.c
rename to src/lib/little_D_body.c
diff --git a/matrix_utils.c b/src/lib/matrix_utils.c
similarity index 100%
rename from matrix_utils.c
rename to src/lib/matrix_utils.c
diff --git a/matrix_utils.h b/src/lib/matrix_utils.h
similarity index 100%
rename from matrix_utils.h
rename to src/lib/matrix_utils.h
diff --git a/meas/Makefile.in b/src/lib/meas/Makefile.in
similarity index 100%
rename from meas/Makefile.in
rename to src/lib/meas/Makefile.in
diff --git a/meas/correlators.c b/src/lib/meas/correlators.c
similarity index 100%
rename from meas/correlators.c
rename to src/lib/meas/correlators.c
diff --git a/meas/correlators.h b/src/lib/meas/correlators.h
similarity index 100%
rename from meas/correlators.h
rename to src/lib/meas/correlators.h
diff --git a/meas/field_strength_types.h b/src/lib/meas/field_strength_types.h
similarity index 100%
rename from meas/field_strength_types.h
rename to src/lib/meas/field_strength_types.h
diff --git a/meas/gradient_flow.c b/src/lib/meas/gradient_flow.c
similarity index 100%
rename from meas/gradient_flow.c
rename to src/lib/meas/gradient_flow.c
diff --git a/meas/gradient_flow.h b/src/lib/meas/gradient_flow.h
similarity index 100%
rename from meas/gradient_flow.h
rename to src/lib/meas/gradient_flow.h
diff --git a/meas/measure_clover_field_strength_observables.c b/src/lib/meas/measure_clover_field_strength_observables.c
similarity index 100%
rename from meas/measure_clover_field_strength_observables.c
rename to src/lib/meas/measure_clover_field_strength_observables.c
diff --git a/meas/measure_clover_field_strength_observables.h b/src/lib/meas/measure_clover_field_strength_observables.h
similarity index 100%
rename from meas/measure_clover_field_strength_observables.h
rename to src/lib/meas/measure_clover_field_strength_observables.h
diff --git a/meas/measurements.c b/src/lib/meas/measurements.c
similarity index 100%
rename from meas/measurements.c
rename to src/lib/meas/measurements.c
diff --git a/meas/measurements.h b/src/lib/meas/measurements.h
similarity index 100%
rename from meas/measurements.h
rename to src/lib/meas/measurements.h
diff --git a/meas/oriented_plaquettes.c b/src/lib/meas/oriented_plaquettes.c
similarity index 100%
rename from meas/oriented_plaquettes.c
rename to src/lib/meas/oriented_plaquettes.c
diff --git a/meas/oriented_plaquettes.h b/src/lib/meas/oriented_plaquettes.h
similarity index 100%
rename from meas/oriented_plaquettes.h
rename to src/lib/meas/oriented_plaquettes.h
diff --git a/meas/pion_norm.c b/src/lib/meas/pion_norm.c
similarity index 100%
rename from meas/pion_norm.c
rename to src/lib/meas/pion_norm.c
diff --git a/meas/pion_norm.h b/src/lib/meas/pion_norm.h
similarity index 100%
rename from meas/pion_norm.h
rename to src/lib/meas/pion_norm.h
diff --git a/meas/polyakov_loop.c b/src/lib/meas/polyakov_loop.c
similarity index 100%
rename from meas/polyakov_loop.c
rename to src/lib/meas/polyakov_loop.c
diff --git a/meas/polyakov_loop.h b/src/lib/meas/polyakov_loop.h
similarity index 100%
rename from meas/polyakov_loop.h
rename to src/lib/meas/polyakov_loop.h
diff --git a/measure_gauge_action.c b/src/lib/measure_gauge_action.c
similarity index 100%
rename from measure_gauge_action.c
rename to src/lib/measure_gauge_action.c
diff --git a/measure_gauge_action.h b/src/lib/measure_gauge_action.h
similarity index 100%
rename from measure_gauge_action.h
rename to src/lib/measure_gauge_action.h
diff --git a/measure_rectangles.c b/src/lib/measure_rectangles.c
similarity index 100%
rename from measure_rectangles.c
rename to src/lib/measure_rectangles.c
diff --git a/measure_rectangles.h b/src/lib/measure_rectangles.h
similarity index 100%
rename from measure_rectangles.h
rename to src/lib/measure_rectangles.h
diff --git a/misc_types.h b/src/lib/misc_types.h
similarity index 99%
rename from misc_types.h
rename to src/lib/misc_types.h
index 91ceda0a8..fee62159f 100644
--- a/misc_types.h
+++ b/src/lib/misc_types.h
@@ -101,7 +101,7 @@ typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = QMP_THREAD_SINGLE,
   TM_MPI_THREAD_MULTIPLE = QMP_THREAD_MULTIPLE
 } tm_mpi_thread_level_t;
-#elif TM_USE_MPI
+#elif defined(TM_USE_MPI) 
 typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = MPI_THREAD_SERIALIZED,
   TM_MPI_THREAD_MULTIPLE = MPI_THREAD_MULTIPLE
diff --git a/monomial/Makefile.in b/src/lib/monomial/Makefile.in
similarity index 100%
rename from monomial/Makefile.in
rename to src/lib/monomial/Makefile.in
diff --git a/monomial/clover_trlog_monomial.c b/src/lib/monomial/clover_trlog_monomial.c
similarity index 100%
rename from monomial/clover_trlog_monomial.c
rename to src/lib/monomial/clover_trlog_monomial.c
diff --git a/monomial/clover_trlog_monomial.h b/src/lib/monomial/clover_trlog_monomial.h
similarity index 100%
rename from monomial/clover_trlog_monomial.h
rename to src/lib/monomial/clover_trlog_monomial.h
diff --git a/monomial/cloverdet_monomial.c b/src/lib/monomial/cloverdet_monomial.c
similarity index 100%
rename from monomial/cloverdet_monomial.c
rename to src/lib/monomial/cloverdet_monomial.c
diff --git a/monomial/cloverdet_monomial.h b/src/lib/monomial/cloverdet_monomial.h
similarity index 100%
rename from monomial/cloverdet_monomial.h
rename to src/lib/monomial/cloverdet_monomial.h
diff --git a/monomial/cloverdetratio_monomial.c b/src/lib/monomial/cloverdetratio_monomial.c
similarity index 100%
rename from monomial/cloverdetratio_monomial.c
rename to src/lib/monomial/cloverdetratio_monomial.c
diff --git a/monomial/cloverdetratio_monomial.h b/src/lib/monomial/cloverdetratio_monomial.h
similarity index 100%
rename from monomial/cloverdetratio_monomial.h
rename to src/lib/monomial/cloverdetratio_monomial.h
diff --git a/monomial/cloverdetratio_rwmonomial.c b/src/lib/monomial/cloverdetratio_rwmonomial.c
similarity index 100%
rename from monomial/cloverdetratio_rwmonomial.c
rename to src/lib/monomial/cloverdetratio_rwmonomial.c
diff --git a/monomial/cloverdetratio_rwmonomial.h b/src/lib/monomial/cloverdetratio_rwmonomial.h
similarity index 100%
rename from monomial/cloverdetratio_rwmonomial.h
rename to src/lib/monomial/cloverdetratio_rwmonomial.h
diff --git a/monomial/clovernd_trlog_monomial.c b/src/lib/monomial/clovernd_trlog_monomial.c
similarity index 100%
rename from monomial/clovernd_trlog_monomial.c
rename to src/lib/monomial/clovernd_trlog_monomial.c
diff --git a/monomial/clovernd_trlog_monomial.h b/src/lib/monomial/clovernd_trlog_monomial.h
similarity index 100%
rename from monomial/clovernd_trlog_monomial.h
rename to src/lib/monomial/clovernd_trlog_monomial.h
diff --git a/monomial/cloverndpoly_monomial.c b/src/lib/monomial/cloverndpoly_monomial.c
similarity index 100%
rename from monomial/cloverndpoly_monomial.c
rename to src/lib/monomial/cloverndpoly_monomial.c
diff --git a/monomial/cloverndpoly_monomial.h b/src/lib/monomial/cloverndpoly_monomial.h
similarity index 100%
rename from monomial/cloverndpoly_monomial.h
rename to src/lib/monomial/cloverndpoly_monomial.h
diff --git a/monomial/det_monomial.c b/src/lib/monomial/det_monomial.c
similarity index 100%
rename from monomial/det_monomial.c
rename to src/lib/monomial/det_monomial.c
diff --git a/monomial/det_monomial.h b/src/lib/monomial/det_monomial.h
similarity index 100%
rename from monomial/det_monomial.h
rename to src/lib/monomial/det_monomial.h
diff --git a/monomial/detratio_monomial.c b/src/lib/monomial/detratio_monomial.c
similarity index 100%
rename from monomial/detratio_monomial.c
rename to src/lib/monomial/detratio_monomial.c
diff --git a/monomial/detratio_monomial.h b/src/lib/monomial/detratio_monomial.h
similarity index 100%
rename from monomial/detratio_monomial.h
rename to src/lib/monomial/detratio_monomial.h
diff --git a/monomial/gauge_monomial.c b/src/lib/monomial/gauge_monomial.c
similarity index 100%
rename from monomial/gauge_monomial.c
rename to src/lib/monomial/gauge_monomial.c
diff --git a/monomial/gauge_monomial.h b/src/lib/monomial/gauge_monomial.h
similarity index 100%
rename from monomial/gauge_monomial.h
rename to src/lib/monomial/gauge_monomial.h
diff --git a/monomial/moment_energy.c b/src/lib/monomial/moment_energy.c
similarity index 100%
rename from monomial/moment_energy.c
rename to src/lib/monomial/moment_energy.c
diff --git a/monomial/moment_energy.h b/src/lib/monomial/moment_energy.h
similarity index 100%
rename from monomial/moment_energy.h
rename to src/lib/monomial/moment_energy.h
diff --git a/monomial/monitor_forces.c b/src/lib/monomial/monitor_forces.c
similarity index 100%
rename from monomial/monitor_forces.c
rename to src/lib/monomial/monitor_forces.c
diff --git a/monomial/monitor_forces.h b/src/lib/monomial/monitor_forces.h
similarity index 100%
rename from monomial/monitor_forces.h
rename to src/lib/monomial/monitor_forces.h
diff --git a/monomial/monomial.c b/src/lib/monomial/monomial.c
similarity index 100%
rename from monomial/monomial.c
rename to src/lib/monomial/monomial.c
diff --git a/monomial/monomial.h b/src/lib/monomial/monomial.h
similarity index 100%
rename from monomial/monomial.h
rename to src/lib/monomial/monomial.h
diff --git a/monomial/nddetratio_monomial.c b/src/lib/monomial/nddetratio_monomial.c
similarity index 100%
rename from monomial/nddetratio_monomial.c
rename to src/lib/monomial/nddetratio_monomial.c
diff --git a/monomial/nddetratio_monomial.h b/src/lib/monomial/nddetratio_monomial.h
similarity index 100%
rename from monomial/nddetratio_monomial.h
rename to src/lib/monomial/nddetratio_monomial.h
diff --git a/monomial/ndpoly_monomial.c b/src/lib/monomial/ndpoly_monomial.c
similarity index 100%
rename from monomial/ndpoly_monomial.c
rename to src/lib/monomial/ndpoly_monomial.c
diff --git a/monomial/ndpoly_monomial.h b/src/lib/monomial/ndpoly_monomial.h
similarity index 100%
rename from monomial/ndpoly_monomial.h
rename to src/lib/monomial/ndpoly_monomial.h
diff --git a/monomial/ndrat_monomial.c b/src/lib/monomial/ndrat_monomial.c
similarity index 100%
rename from monomial/ndrat_monomial.c
rename to src/lib/monomial/ndrat_monomial.c
diff --git a/monomial/ndrat_monomial.h b/src/lib/monomial/ndrat_monomial.h
similarity index 100%
rename from monomial/ndrat_monomial.h
rename to src/lib/monomial/ndrat_monomial.h
diff --git a/monomial/ndratcor_monomial.c b/src/lib/monomial/ndratcor_monomial.c
similarity index 100%
rename from monomial/ndratcor_monomial.c
rename to src/lib/monomial/ndratcor_monomial.c
diff --git a/monomial/ndratcor_monomial.h b/src/lib/monomial/ndratcor_monomial.h
similarity index 100%
rename from monomial/ndratcor_monomial.h
rename to src/lib/monomial/ndratcor_monomial.h
diff --git a/monomial/poly_monomial.c b/src/lib/monomial/poly_monomial.c
similarity index 100%
rename from monomial/poly_monomial.c
rename to src/lib/monomial/poly_monomial.c
diff --git a/monomial/poly_monomial.h b/src/lib/monomial/poly_monomial.h
similarity index 100%
rename from monomial/poly_monomial.h
rename to src/lib/monomial/poly_monomial.h
diff --git a/monomial/rat_monomial.c b/src/lib/monomial/rat_monomial.c
similarity index 100%
rename from monomial/rat_monomial.c
rename to src/lib/monomial/rat_monomial.c
diff --git a/monomial/rat_monomial.h b/src/lib/monomial/rat_monomial.h
similarity index 100%
rename from monomial/rat_monomial.h
rename to src/lib/monomial/rat_monomial.h
diff --git a/monomial/ratcor_monomial.c b/src/lib/monomial/ratcor_monomial.c
similarity index 100%
rename from monomial/ratcor_monomial.c
rename to src/lib/monomial/ratcor_monomial.c
diff --git a/monomial/ratcor_monomial.h b/src/lib/monomial/ratcor_monomial.h
similarity index 100%
rename from monomial/ratcor_monomial.h
rename to src/lib/monomial/ratcor_monomial.h
diff --git a/mpi_init.c b/src/lib/mpi_init.c
similarity index 100%
rename from mpi_init.c
rename to src/lib/mpi_init.c
diff --git a/mpi_init.h b/src/lib/mpi_init.h
similarity index 100%
rename from mpi_init.h
rename to src/lib/mpi_init.h
diff --git a/omp_accumulator.h b/src/lib/omp_accumulator.h
similarity index 100%
rename from omp_accumulator.h
rename to src/lib/omp_accumulator.h
diff --git a/operator.c b/src/lib/operator.c
similarity index 100%
rename from operator.c
rename to src/lib/operator.c
diff --git a/operator.h b/src/lib/operator.h
similarity index 100%
rename from operator.h
rename to src/lib/operator.h
diff --git a/operator/Block_D_psi_body.c b/src/lib/operator/Block_D_psi_body.c
similarity index 100%
rename from operator/Block_D_psi_body.c
rename to src/lib/operator/Block_D_psi_body.c
diff --git a/operator/D_psi.c b/src/lib/operator/D_psi.c
similarity index 100%
rename from operator/D_psi.c
rename to src/lib/operator/D_psi.c
diff --git a/operator/D_psi.h b/src/lib/operator/D_psi.h
similarity index 100%
rename from operator/D_psi.h
rename to src/lib/operator/D_psi.h
diff --git a/operator/D_psi_body.c b/src/lib/operator/D_psi_body.c
similarity index 100%
rename from operator/D_psi_body.c
rename to src/lib/operator/D_psi_body.c
diff --git a/operator/Dov_proj.c b/src/lib/operator/Dov_proj.c
similarity index 100%
rename from operator/Dov_proj.c
rename to src/lib/operator/Dov_proj.c
diff --git a/operator/Dov_proj.h b/src/lib/operator/Dov_proj.h
similarity index 100%
rename from operator/Dov_proj.h
rename to src/lib/operator/Dov_proj.h
diff --git a/operator/Dov_psi.c b/src/lib/operator/Dov_psi.c
similarity index 100%
rename from operator/Dov_psi.c
rename to src/lib/operator/Dov_psi.c
diff --git a/operator/Dov_psi.h b/src/lib/operator/Dov_psi.h
similarity index 100%
rename from operator/Dov_psi.h
rename to src/lib/operator/Dov_psi.h
diff --git a/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
similarity index 100%
rename from operator/Hopping_Matrix.c
rename to src/lib/operator/Hopping_Matrix.c
diff --git a/operator/Hopping_Matrix.h b/src/lib/operator/Hopping_Matrix.h
similarity index 100%
rename from operator/Hopping_Matrix.h
rename to src/lib/operator/Hopping_Matrix.h
diff --git a/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
similarity index 100%
rename from operator/Hopping_Matrix_32.c
rename to src/lib/operator/Hopping_Matrix_32.c
diff --git a/operator/Hopping_Matrix_32.h b/src/lib/operator/Hopping_Matrix_32.h
similarity index 100%
rename from operator/Hopping_Matrix_32.h
rename to src/lib/operator/Hopping_Matrix_32.h
diff --git a/operator/Hopping_Matrix_32_nocom.c b/src/lib/operator/Hopping_Matrix_32_nocom.c
similarity index 100%
rename from operator/Hopping_Matrix_32_nocom.c
rename to src/lib/operator/Hopping_Matrix_32_nocom.c
diff --git a/operator/Hopping_Matrix_nocom.c b/src/lib/operator/Hopping_Matrix_nocom.c
similarity index 100%
rename from operator/Hopping_Matrix_nocom.c
rename to src/lib/operator/Hopping_Matrix_nocom.c
diff --git a/operator/Hopping_Matrix_nocom.h b/src/lib/operator/Hopping_Matrix_nocom.h
similarity index 100%
rename from operator/Hopping_Matrix_nocom.h
rename to src/lib/operator/Hopping_Matrix_nocom.h
diff --git a/operator/Makefile.in b/src/lib/operator/Makefile.in
similarity index 100%
rename from operator/Makefile.in
rename to src/lib/operator/Makefile.in
diff --git a/operator/assign_mul_one_sw_pm_imu_inv_block_body.c b/src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
similarity index 100%
rename from operator/assign_mul_one_sw_pm_imu_inv_block_body.c
rename to src/lib/operator/assign_mul_one_sw_pm_imu_inv_block_body.c
diff --git a/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c b/src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
similarity index 100%
rename from operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
rename to src/lib/operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
diff --git a/operator/clover_accumulate_deriv.c b/src/lib/operator/clover_accumulate_deriv.c
similarity index 100%
rename from operator/clover_accumulate_deriv.c
rename to src/lib/operator/clover_accumulate_deriv.c
diff --git a/operator/clover_deriv.c b/src/lib/operator/clover_deriv.c
similarity index 100%
rename from operator/clover_deriv.c
rename to src/lib/operator/clover_deriv.c
diff --git a/operator/clover_det.c b/src/lib/operator/clover_det.c
similarity index 100%
rename from operator/clover_det.c
rename to src/lib/operator/clover_det.c
diff --git a/operator/clover_inline.h b/src/lib/operator/clover_inline.h
similarity index 100%
rename from operator/clover_inline.h
rename to src/lib/operator/clover_inline.h
diff --git a/operator/clover_invert.c b/src/lib/operator/clover_invert.c
similarity index 100%
rename from operator/clover_invert.c
rename to src/lib/operator/clover_invert.c
diff --git a/operator/clover_leaf.c b/src/lib/operator/clover_leaf.c
similarity index 100%
rename from operator/clover_leaf.c
rename to src/lib/operator/clover_leaf.c
diff --git a/operator/clover_leaf.h b/src/lib/operator/clover_leaf.h
similarity index 100%
rename from operator/clover_leaf.h
rename to src/lib/operator/clover_leaf.h
diff --git a/operator/clover_term.c b/src/lib/operator/clover_term.c
similarity index 100%
rename from operator/clover_term.c
rename to src/lib/operator/clover_term.c
diff --git a/operator/clovertm_operators.c b/src/lib/operator/clovertm_operators.c
similarity index 100%
rename from operator/clovertm_operators.c
rename to src/lib/operator/clovertm_operators.c
diff --git a/operator/clovertm_operators.h b/src/lib/operator/clovertm_operators.h
similarity index 100%
rename from operator/clovertm_operators.h
rename to src/lib/operator/clovertm_operators.h
diff --git a/operator/clovertm_operators_32.c b/src/lib/operator/clovertm_operators_32.c
similarity index 100%
rename from operator/clovertm_operators_32.c
rename to src/lib/operator/clovertm_operators_32.c
diff --git a/operator/clovertm_operators_32.h b/src/lib/operator/clovertm_operators_32.h
similarity index 100%
rename from operator/clovertm_operators_32.h
rename to src/lib/operator/clovertm_operators_32.h
diff --git a/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
similarity index 100%
rename from operator/halfspinor_body.c
rename to src/lib/operator/halfspinor_body.c
diff --git a/operator/halfspinor_body_32.c b/src/lib/operator/halfspinor_body_32.c
similarity index 100%
rename from operator/halfspinor_body_32.c
rename to src/lib/operator/halfspinor_body_32.c
diff --git a/operator/halfspinor_hopping.h b/src/lib/operator/halfspinor_hopping.h
similarity index 100%
rename from operator/halfspinor_hopping.h
rename to src/lib/operator/halfspinor_hopping.h
diff --git a/operator/halfspinor_hopping_32.h b/src/lib/operator/halfspinor_hopping_32.h
similarity index 100%
rename from operator/halfspinor_hopping_32.h
rename to src/lib/operator/halfspinor_hopping_32.h
diff --git a/operator/hopping_bg_dbl.c b/src/lib/operator/hopping_bg_dbl.c
similarity index 100%
rename from operator/hopping_bg_dbl.c
rename to src/lib/operator/hopping_bg_dbl.c
diff --git a/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
similarity index 100%
rename from operator/hopping_body_dbl.c
rename to src/lib/operator/hopping_body_dbl.c
diff --git a/operator/hopping_sgl.c b/src/lib/operator/hopping_sgl.c
similarity index 100%
rename from operator/hopping_sgl.c
rename to src/lib/operator/hopping_sgl.c
diff --git a/operator/mul_one_pm_imu_inv_body.c b/src/lib/operator/mul_one_pm_imu_inv_body.c
similarity index 100%
rename from operator/mul_one_pm_imu_inv_body.c
rename to src/lib/operator/mul_one_pm_imu_inv_body.c
diff --git a/operator/mul_one_pm_imu_sub_mul_body.c b/src/lib/operator/mul_one_pm_imu_sub_mul_body.c
similarity index 100%
rename from operator/mul_one_pm_imu_sub_mul_body.c
rename to src/lib/operator/mul_one_pm_imu_sub_mul_body.c
diff --git a/operator/tm_operators.c b/src/lib/operator/tm_operators.c
similarity index 100%
rename from operator/tm_operators.c
rename to src/lib/operator/tm_operators.c
diff --git a/operator/tm_operators.h b/src/lib/operator/tm_operators.h
similarity index 100%
rename from operator/tm_operators.h
rename to src/lib/operator/tm_operators.h
diff --git a/operator/tm_operators_32.c b/src/lib/operator/tm_operators_32.c
similarity index 100%
rename from operator/tm_operators_32.c
rename to src/lib/operator/tm_operators_32.c
diff --git a/operator/tm_operators_32.h b/src/lib/operator/tm_operators_32.h
similarity index 100%
rename from operator/tm_operators_32.h
rename to src/lib/operator/tm_operators_32.h
diff --git a/operator/tm_operators_nd.c b/src/lib/operator/tm_operators_nd.c
similarity index 100%
rename from operator/tm_operators_nd.c
rename to src/lib/operator/tm_operators_nd.c
diff --git a/operator/tm_operators_nd.h b/src/lib/operator/tm_operators_nd.h
similarity index 100%
rename from operator/tm_operators_nd.h
rename to src/lib/operator/tm_operators_nd.h
diff --git a/operator/tm_operators_nd_32.c b/src/lib/operator/tm_operators_nd_32.c
similarity index 100%
rename from operator/tm_operators_nd_32.c
rename to src/lib/operator/tm_operators_nd_32.c
diff --git a/operator/tm_operators_nd_32.h b/src/lib/operator/tm_operators_nd_32.h
similarity index 100%
rename from operator/tm_operators_nd_32.h
rename to src/lib/operator/tm_operators_nd_32.h
diff --git a/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
similarity index 100%
rename from operator/tm_sub_Hopping_Matrix.c
rename to src/lib/operator/tm_sub_Hopping_Matrix.c
diff --git a/operator/tm_sub_Hopping_Matrix.h b/src/lib/operator/tm_sub_Hopping_Matrix.h
similarity index 100%
rename from operator/tm_sub_Hopping_Matrix.h
rename to src/lib/operator/tm_sub_Hopping_Matrix.h
diff --git a/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
similarity index 100%
rename from operator/tm_times_Hopping_Matrix.c
rename to src/lib/operator/tm_times_Hopping_Matrix.c
diff --git a/operator/tm_times_Hopping_Matrix.h b/src/lib/operator/tm_times_Hopping_Matrix.h
similarity index 100%
rename from operator/tm_times_Hopping_Matrix.h
rename to src/lib/operator/tm_times_Hopping_Matrix.h
diff --git a/operator_types.h b/src/lib/operator_types.h
similarity index 100%
rename from operator_types.h
rename to src/lib/operator_types.h
diff --git a/overrelaxation.c b/src/lib/overrelaxation.c
similarity index 99%
rename from overrelaxation.c
rename to src/lib/overrelaxation.c
index 2c2e486f7..91d95fa30 100644
--- a/overrelaxation.c
+++ b/src/lib/overrelaxation.c
@@ -205,7 +205,7 @@ void overrel_sweep() {
   static su3 v;
   for (mu = 0; mu < 4; mu++) {
     for (ix = 0; ix < VOLUME; ix++) {
-      get_staples(&v, ix, mu, g_gauge_field);
+      get_staples(&v, ix, mu, (const su3 **)g_gauge_field);
       flip_subgroup(ix, mu, v, 1);
       flip_subgroup(ix, mu, v, 2);
       flip_subgroup(ix, mu, v, 3);
diff --git a/overrelaxation.h b/src/lib/overrelaxation.h
similarity index 100%
rename from overrelaxation.h
rename to src/lib/overrelaxation.h
diff --git a/parallel_io.h b/src/lib/parallel_io.h
similarity index 100%
rename from parallel_io.h
rename to src/lib/parallel_io.h
diff --git a/phmc.c b/src/lib/phmc.c
similarity index 100%
rename from phmc.c
rename to src/lib/phmc.c
diff --git a/phmc.h b/src/lib/phmc.h
similarity index 100%
rename from phmc.h
rename to src/lib/phmc.h
diff --git a/prepare_source.c b/src/lib/prepare_source.c
similarity index 100%
rename from prepare_source.c
rename to src/lib/prepare_source.c
diff --git a/prepare_source.h b/src/lib/prepare_source.h
similarity index 100%
rename from prepare_source.h
rename to src/lib/prepare_source.h
diff --git a/profiling/hmc/Readme.md b/src/lib/profiling/hmc/Readme.md
similarity index 100%
rename from profiling/hmc/Readme.md
rename to src/lib/profiling/hmc/Readme.md
diff --git a/profiling/hmc/example_profile.pdf b/src/lib/profiling/hmc/example_profile.pdf
similarity index 100%
rename from profiling/hmc/example_profile.pdf
rename to src/lib/profiling/hmc/example_profile.pdf
diff --git a/profiling/hmc/profile.Rmd b/src/lib/profiling/hmc/profile.Rmd
similarity index 100%
rename from profiling/hmc/profile.Rmd
rename to src/lib/profiling/hmc/profile.Rmd
diff --git a/profiling/hmc/timing.R b/src/lib/profiling/hmc/timing.R
similarity index 100%
rename from profiling/hmc/timing.R
rename to src/lib/profiling/hmc/timing.R
diff --git a/profiling/hmc_mk2/.gitignore b/src/lib/profiling/hmc_mk2/.gitignore
similarity index 100%
rename from profiling/hmc_mk2/.gitignore
rename to src/lib/profiling/hmc_mk2/.gitignore
diff --git a/profiling/hmc_mk2/README.md b/src/lib/profiling/hmc_mk2/README.md
similarity index 100%
rename from profiling/hmc_mk2/README.md
rename to src/lib/profiling/hmc_mk2/README.md
diff --git a/profiling/hmc_mk2/logs/example_log.out b/src/lib/profiling/hmc_mk2/logs/example_log.out
similarity index 100%
rename from profiling/hmc_mk2/logs/example_log.out
rename to src/lib/profiling/hmc_mk2/logs/example_log.out
diff --git a/profiling/hmc_mk2/make_profile.R b/src/lib/profiling/hmc_mk2/make_profile.R
similarity index 100%
rename from profiling/hmc_mk2/make_profile.R
rename to src/lib/profiling/hmc_mk2/make_profile.R
diff --git a/profiling/hmc_mk2/profile.Rmd b/src/lib/profiling/hmc_mk2/profile.Rmd
similarity index 100%
rename from profiling/hmc_mk2/profile.Rmd
rename to src/lib/profiling/hmc_mk2/profile.Rmd
diff --git a/src/lib/qphix/qphix_base_classes.hpp b/src/lib/qphix/qphix_base_classes.hpp
new file mode 100644
index 000000000..26015e3a2
--- /dev/null
+++ b/src/lib/qphix/qphix_base_classes.hpp
@@ -0,0 +1,771 @@
+// Copyright © 2017 Martin Ueding <dev@martin-ueding.de>
+// Licensed under the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause).
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// This file should be deprecated or updated to provide any functionality
+// not covered by QPhiX itself.
+
+/**
+  \file Additions to QPhiX that are only needed for tmLQCD.
+
+  In the original QPhiX, there are only Wilson fermions and Wilson clover
+  fermions. The Dslash operators have a different call signature (the latter
+  requiring a clover term), so there is no common base class. With the addition
+  of Wilson twisted mass (Mario) and Wilson twisted clover (Peter), there are
+  now two instances of the Dslash that have the same signature. In order to
+  write a more general even-odd source preparation and solution reconstruction
+  code, a common base class for non-clover and clover is desired. In order to
+  leave the QPhiX code untouched (for now), this code lives here in tmLQCD.
+  */
+
+#pragma once
+
+#include <qphix/blas_new_c.h>
+#include <qphix/clover_dslash_def.h>
+#include <qphix/dslash_def.h>
+#include <qphix/geometry.h>
+#include <qphix/tm_clov_dslash_def.h>
+#include <qphix/tm_dslash_def.h>
+
+#include <cassert>
+
+namespace tmlqcd {
+
+namespace {
+size_t constexpr re = 0;  // index of the real part in `[re]`/`[im]` array dimensions
+size_t constexpr im = 1;  // index of the imaginary part
+int const n_blas_simt = 1;  // passed as last argument to QPhiX::zeroSpinor in clover_product
+
+// The even checkerboard is given by ((x + y + z + t) & 1) == 0, i.e. cb0 is even.
+int constexpr cb_even = 0;
+int constexpr cb_odd = 1;
+}
+
+/**
+  Complex multiplication accumulate: `(a, b)` and `(c, d)` are the (real, imaginary) parts of
+  the two factors, `r_out` and `i_out` are updated in place.
+  Computes \f$ (r + \mathrm i i) += (a + \mathrm i b) * (c + \mathrm i d) \f$.
+  */
+template <typename FT>
+void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, FT const &d) {
+  r_out += a * c - b * d;  // real part of the product
+  i_out += a * d + b * c;  // imaginary part of the product
+}
+
+/**
+  Wrapper for the clover multiplication function.
+
+  The `struct` is needed in order to allow for partial template specialization in the `Clover`
+  parameter. The primary template only declares `multiply`; the specializations define it.
+
+  \tparam Clover Type of clover block to use, must be a type from Geometry such that there exists a
+  specialization for it.
+  */
+template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
+struct InnerCloverProduct {
+  /**
+  Multiplies the clover term for a single lattice site to a spinor.
+
+  This function is intended to be used in a loop over all lattice sites. The caller is expected
+  to have worked out all the correct indices. There are template specializations for the two
+  different types of clover term that are used in QPhiX.
+
+  \param[out] out Output spinor block. It is assumed to be zeroed properly, the function will just
+  accumulate values into that output variable. Use \ref QPhiX::zeroSpinor for that.
+  \param[in] in Input spinor block.
+  \param[in] clover Single clover block that contains the lattice site of the spinor.
+  \param[in] xi SIMD index for the arrays with length `soalen`, as in the spinors.
+  \param[in] veclen_idx SIMD index for the arrays with length `veclen`, as in the clover term.
+  */
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &in,
+      Clover const &clover, int const xi, int const veclen_idx);
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+struct InnerCloverProduct<FT, veclen, soalen, compress12,
+                          typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock> {
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock const &clov_block,
+      int const xi, int const veclen_idx) {
+    // The clover term is block-diagonal in spin. Therefore we need
+    // to iterate over the two blocks of spin.
+    for (auto s_block : {0, 1}) {
+      // Extract the diagonal and triangular parts.
+      auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2;
+      auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2;
+      // Input two-spinor component.
+      for (auto two_s_in : {0, 1}) {
+        // Reconstruct four spinor index.
+        auto const four_s_in = 2 * s_block + two_s_in;
+        // Output two-spinor component.
+        for (auto two_s_out : {0, 1}) {
+          // Reconstruct four spinor index.
+          auto const four_s_out = 2 * s_block + two_s_out;
+          // Input color.
+          for (auto c_in : {0, 1, 2}) {
+            // Spin-color index (0, ..., 5).
+            auto const sc_in = 3 * two_s_in + c_in;
+            // Output color.
+            for (auto c_out : {0, 1, 2}) {
+              // Spin-color index (0, ..., 5).
+              auto const sc_out = 3 * two_s_out + c_out;
+
+              // See `qphix-codegen` file `dslash_common.cc`, function `clover_term`, for the
+              // index manipulations done here. The diagonal (`diag_in`) is treated as real; the
+              // 15 off-diagonal entries (`off_diag_in`) are used as stored for sc_out > sc_in
+              // and complex-conjugated for sc_out < sc_in.
+
+              // Using separate loops over the actual indices is probably faster than the
+              // branching in the innermost loop. The three branches below handle the diagonal,
+              // the conjugated (sc_out < sc_in) and the directly stored (sc_out > sc_in) case.
+
+              if (sc_out == sc_in) {
+                cplx_mul_acc(spinor_out[c_out][four_s_out][re][xi],
+                             spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx],
+                             QPhiX::rep<FT,double>(0.0), spinor_in[c_in][four_s_in][re][xi],
+                             spinor_in[c_in][four_s_in][im][xi]);
+              } else if (sc_out < sc_in) {
+                auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
+                cplx_mul_acc(
+                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                    off_diag_in[idx15][re][veclen_idx],
+                    // Negate via double; QPhiX::half presumably lacks unary minus (TODO confirm).
+                    QPhiX::rep<FT,double>(-QPhiX::rep<double,FT>(off_diag_in[idx15][im][veclen_idx])),
+                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+              } else {
+                auto const idx15 = sc_out * (sc_out - 1) / 2 + sc_in;
+                cplx_mul_acc(
+                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                    off_diag_in[idx15][re][veclen_idx], off_diag_in[idx15][im][veclen_idx],
+                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+struct InnerCloverProduct<
+    FT, veclen, soalen, compress12,
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock> {
+  static void multiply(
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock const &clov_block,
+      int const xi, int const veclen_idx) {
+    // The clover term is block-diagonal in spin. Therefore we need
+    // to iterate over the two blocks of spin.
+    for (auto s_block : {0, 1}) {
+      // Clover matrix of the current spin block (`block1` or `block2`), indexed [sc_out][sc_in].
+      auto const &block_in = s_block == 0 ? clov_block.block1 : clov_block.block2;
+      // Input two-spinor component.
+      for (auto two_s_in : {0, 1}) {
+        // Reconstruct four spinor index.
+        auto const four_s_in = 2 * s_block + two_s_in;
+        // Output two-spinor component.
+        for (auto two_s_out : {0, 1}) {
+          // Reconstruct four spinor index.
+          auto const four_s_out = 2 * s_block + two_s_out;
+          // Input color.
+          for (auto c_in : {0, 1, 2}) {
+            // Spin-color index (0, ..., 5).
+            auto const sc_in = 3 * two_s_in + c_in;
+            // Output color.
+            for (auto c_out : {0, 1, 2}) {
+              // Spin-color index (0, ..., 5).
+              auto const sc_out = 3 * two_s_out + c_out;
+              // Every matrix entry is read directly; no conjugation or index folding is done.
+              cplx_mul_acc(
+                  spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
+                  block_in[sc_out][sc_in][re][veclen_idx], block_in[sc_out][sc_in][im][veclen_idx],
+                  spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/**
+  Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
+
+  Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
+
+  If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
+  t)` coordinates to block indices. Also it will check that each block is accessed the proper number
+  of times, that is `soalen` for spinors and `veclen` for clover blocks.
+
+  \param[out] out Output spinor; it is zeroed before the product is accumulated into it
+  \param[in] in Input spinor
+  \param[in] clover Clover block
+  \param[in] geom Geometry object holding the dimension of clover and spinor
+  */
+template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
+void clover_product(
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
+    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
+    Clover *clover, ::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
+  ::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
+
+#ifdef PRINT_MAPPING
+  std::vector<int> spin_touches(geom.getPxyz() * geom.Nt(), 0);
+  std::vector<int> clover_touches(geom.getPxyz() * geom.Nt() * soalen / veclen, 0);
+
+  std::cout << std::setw(3) << "x" << std::setw(3) << "y" << std::setw(3) << "z" << std::setw(3)
+            << "t"
+            << ":" << std::setw(5) << "spin" << std::setw(5) << "clov"
+            << "\n";
+#endif
+
+  // Iterate through all the blocks.
+  for (int t = 0; t < geom.Nt(); ++t) {
+    for (int z = 0; z < geom.Nz(); ++z) {
+      // First element in the current XY plane at desired Z and T; it does not depend on x or y.
+      auto const xyBase = t * geom.getPxyz() + z * geom.getPxy();
+      for (int y = 0; y < geom.Ny(); ++y) {
+        for (int x = 0; x < geom.Nxh(); ++x) {
+          // Index of the SoA along the X direction.
+          auto const xb = x / soalen;
+          // Index within the SoA.
+          auto const xi = x % soalen;
+          // Global spin block index.
+          auto const spin_block_idx = xb + geom.Nxh() / soalen * y + xyBase;
+          // Global clover/gauge block index.
+          auto const clov_block_idx =
+              xb + (y / geom.nGY()) * geom.Nxh() / soalen + xyBase / geom.nGY();
+          // Index of the SoA structure within the current tile: `geom.nGY()` consecutive rows in
+          // y share one clover/gauge block.
+          auto const tile = y % geom.nGY();
+          // Vector index for clover/gauge. The SoA index only runs to
+          // `soalen`, this index needs to run to `veclen`, that is across the
+          // various SoA within the tile.
+          auto const veclen_idx = soalen * tile + xi;
+
+#ifdef PRINT_MAPPING
+          ++spin_touches[spin_block_idx];
+          ++clover_touches[clov_block_idx];
+
+          std::cout << std::setw(3) << x << std::setw(3) << y << std::setw(3) << z << std::setw(3)
+                    << t << ":" << std::setw(5) << spin_block_idx << std::setw(5) << clov_block_idx
+                    << "\n";
+#endif
+
+          assert(xi + xb * soalen == x);
+
+          // References to the objects at desired block.
+          auto const &clov_block = clover[clov_block_idx];
+          auto const &spinor_in = in[spin_block_idx];
+          auto &spinor_out = out[spin_block_idx];
+
+          InnerCloverProduct<FT, veclen, soalen, compress12, Clover>::multiply(
+              spinor_out, spinor_in, clov_block, xi, veclen_idx);
+        }
+      }
+    }
+  }
+
+#ifdef PRINT_MAPPING
+  std::cout << std::flush;
+
+  // Make sure that each block got touched the correct number of times.
+  for (std::vector<int>::size_type i = 0; i != spin_touches.size(); ++i) {
+    if (spin_touches[i] != soalen) {
+      std::cout << "Spin missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
+                << spin_touches[i] << " times instead of " << soalen << "\n";
+    }
+  }
+
+  for (std::vector<int>::size_type i = 0; i != clover_touches.size(); ++i) {
+    if (clover_touches[i] != veclen) {
+      std::cout << "Clover missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
+                << clover_touches[i] << " times instead of " << veclen << "\n";
+    }
+  }
+
+  std::cout << std::flush;
+#endif
+}
+
+/**
+  Abstract base class for all single-flavor Dslash variants.
+
+  There are four Dslash operators which are implemented in QPhiX:
+
+  - Wilson
+  - Wilson clover
+  - Wilson twisted mass
+  - Wilson clover with twisted mass
+
+  Each of these has the actual Dslash operation and a so-called “achimbdpsi” operation. These act
+  on four-spinors given a gauge field. This base class provides a uniform interface to all four
+  kinds.
+
+  This code should eventually be migrated into the QPhiX repository. Currently these classes are
+  mere delegators. In the QPhiX repository, the actual classes there should be used as concrete
+  classes.
+  */
+template <typename FT, int veclen, int soalen, bool compress12>
+class Dslash {
+ public:
+  typedef ::QPhiX::Geometry<FT, veclen, soalen, compress12> Geom;
+  typedef typename Geom::FourSpinorBlock Spinor;
+  typedef typename Geom::SU3MatrixBlock SU3MatrixBlock;
+
+  explicit Dslash(Geom *geom, double const t_boundary_, double const aniso_coeff_S_,
+                  double const aniso_coeff_T_, double const mass_, bool use_tbc_[4] = nullptr,
+                  double tbc_phases_[4][2] = nullptr)
+      : geom(geom), t_boundary(t_boundary_), aniso_coeff_S(aniso_coeff_S_),
+        aniso_coeff_T(aniso_coeff_T_), mass(mass_) {}
+
+  /// Virtual so that concrete operators can be destroyed safely through a `Dslash` pointer.
+  virtual ~Dslash() = default;
+
+  /**
+    Computes \f$ \psi_\mathrm o = A_\mathrm{oo} \chi_\mathrm o \f$.
+
+    The actual definition of the matrix \f$ A_\mathrm{oo} \f$ is
+    implementation dependent and can be the mass factor \f$ \alpha = 4 + m
+    \f$ for plain Wilson or something more complicated for twisted mass.
+
+    \param[out] out Output spinor \f$ \psi \f$.
+    \param[in] in Input spinor \f$ \chi \f$.
+    */
+  virtual void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) = 0;
+
+  /**
+    Computes \f$ \psi_\mathrm e = A_\mathrm{ee}^{-1} \chi_\mathrm e \f$.
+
+    \param[out] out Output spinor \f$ \psi \f$.
+    \param[in] in Input spinor \f$ \chi \f$.
+    */
+  virtual void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                         int const cb) = 0;
+
+  /**
+    Forwarder for the `dslash`.
+
+    This will call the `dslash` function of the respective QPhiX dslash class. There is a subtle
+    difference between the Wilson and all other cases. The Wilson dslash is just the hopping matrix,
+    just the operator \f$ D \f$. For every other case (clover, twisted mass, twisted mass clover),
+    the `dslash` member function will compute \f$ A^{-1} D \f$. In the Wilson case, this \f$ A =
+    \alpha = 4 + m = 1/(2 \kappa) \f$. Since that is _not_ included in the Wilson `dslash`, you will
+    obtain different results when using WilsonDslash::dslash and WilsonTMDslash::dslash with \f$
+    \mu = 0 \f$.
+
+    \todo Make this member function `const`. For this the member function in
+    QPhiX that is called internally must be marked `const` as well.
+    */
+  virtual void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                      int const isign, int const cb) = 0;
+
+  /**
+    Always plain Wilson dslash.
+
+    In contrast to the \ref dslash member function which just forwards the implementation of QPhiX,
+    this will always give you the “naked” plain Wilson dslash without any factors of \f$ A^{-1} \f$
+    applied.
+    */
+  virtual void plain_dslash(Spinor *const res, const Spinor *const psi,
+                            const SU3MatrixBlock *const u, int const isign, int const cb) {
+    // XXX Perhaps rather implement this with an instance of the WilsonDslash instead?
+
+    auto tmp = QPhiX::makeFourSpinorHandle(*geom);
+    dslash(tmp.get(), psi, u, isign, cb);
+    A_chi(res, tmp.get(), isign, cb);
+  }
+
+  /**
+    Always “dressed” dslash.
+
+    This computes \f$ A^{-1} D \f$ for all variants. In the Wilson case, this will give \f$
+    \alpha^{-1} D \f$.
+    */
+  virtual void A_inv_dslash(Spinor *const res, const Spinor *const psi,
+                            const SU3MatrixBlock *const u, int const isign, int const cb) {
+    dslash(res, psi, u, isign, cb);
+  }
+
+  /**
+    Forwarder for the `achimbdpsi`.
+
+    \todo Make this member function `const`. For this the member function in QPhiX that is called
+    internally must be marked `const` as well.
+    */
+  virtual void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                          const SU3MatrixBlock *const u, double const alpha, double const beta,
+                          int const isign, int const cb) = 0;
+
+  /**
+    Prepares the sources on the odd checkerboard.
+
+    This computes
+    \f[
+        \tilde b_o = \frac 12 D_{oe} M_{ee}^{-1} b_e + b_o \,.
+    \f]
+
+    \param[out] tilde_b_odd Prepared source
+    \param[in] b_even Source (right hand side) on the even lattice sites
+    \param[in] b_odd Source on the odd lattice sites
+    \param[in] u Gauge field on the odd lattice sites
+    */
+  virtual void prepare_source(Spinor *const tilde_b_odd, Spinor const *const b_even,
+                              Spinor const *const b_odd, SU3MatrixBlock const *const u);
+
+  /**
+    Reconstructs the solution on the even lattice sites.
+
+    This computes
+    \f[
+        x_e = M_{ee}^{-1} \left( b_e + \frac 12 D_{eo} x_o \right) \,.
+    \f]
+
+    \param[out] x_even Solution on the even lattice sites
+    \param[in] b_even Source (right hand side) on the even lattice sites
+    \param[in] x_odd Solution on the odd lattice sites
+    \param[in] u Gauge field on the even lattice sites
+    */
+  virtual void reconstruct_solution(Spinor *const x_even, Spinor const *const b_even,
+                                    Spinor const *const x_odd, SU3MatrixBlock const *const u);
+
+  Geom *getGeometry() const { return geom; }
+
+ private:
+  Geom *const geom;
+
+  double const t_boundary;
+  double const aniso_coeff_S;
+  double const aniso_coeff_T;
+  double const mass;
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+
+  WilsonDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
+               double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
+               bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {}
+
+  /// Applies the mass factor alpha = 4 + m; neither the sign nor the checkerboard is used.
+  void A_chi(Spinor *const out, Spinor const *const in, int const /*isign*/,
+             int const /*cb*/) override {
+    ::QPhiX::axy(mass_factor_alpha, in, out, upstream_dslash.getGeometry(), 1);
+  }
+
+  /// Applies 1 / alpha; neither the sign nor the checkerboard is used.
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const /*isign*/,
+                 int const /*cb*/) override {
+    ::QPhiX::axy(1.0 / mass_factor_alpha, in, out, upstream_dslash.getGeometry(), 1);
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, isign, cb);  // forwarded unchanged to QPhiX
+  }
+
+  void plain_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                    int const isign, int const cb) override {
+    this->dslash(res, psi, u, isign, cb);  // same operation as `dslash` for this class
+  }
+
+  void A_inv_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+                    int const isign, int const cb) override {
+    auto hopped = QPhiX::makeFourSpinorHandle(upstream_dslash.getGeometry());
+    this->dslash(hopped.get(), psi, u, isign, cb);
+    this->A_inv_chi(res, hopped.get(), isign, cb);
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::Dslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonTMDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+
+  WilsonTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
+                 double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
+                 double const twisted_mass_, bool use_tbc_[4] = nullptr,
+                 double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, mass_, twisted_mass_,
+                        use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),                 // alpha = 4 + m
+        mass_factor_beta(0.25),                         // fixed value
+        derived_mu(twisted_mass_ / mass_factor_alpha),  // mu / alpha
+        derived_mu_inv(mass_factor_alpha /              // alpha / (alpha^2 + mu^2)
+                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {}
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign,
+             int const /*cb*/) override {
+    helper_A_chi(out, in, /*factor_a=*/-derived_mu * isign, /*factor_b=*/mass_factor_alpha);
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                 int const /*cb*/) override {
+    helper_A_chi(out, in, /*factor_a=*/derived_mu * isign, /*factor_b=*/derived_mu_inv);
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, isign, cb);  // forwarded unchanged to QPhiX
+  }
+
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
+  }
+
+ private:
+  /// Site-wise kernel shared by `A_chi` and `A_inv_chi`; it is defined after the class.
+  void helper_A_chi(Spinor *const out, Spinor const *const in, double factor_a, double factor_b);
+
+  ::QPhiX::TMDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+  double const derived_mu;
+  double const derived_mu_inv;
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonClovDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+  using CloverBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock;
+
+  WilsonClovDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
+                   double const t_boundary_, double const aniso_coeff_S_,
+                   double const aniso_coeff_T_, double const mass_,
+                   CloverBlock *const (&clover_)[2], CloverBlock *const (&inv_clover_)[2],
+                   bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {
+    clover[0] = clover_[0];
+    clover[1] = clover_[1];
+    inv_clover[0] = inv_clover_[0];
+    inv_clover[1] = inv_clover_[1];
+  }
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const /*isign*/,
+             int const cb) override {
+    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const /*isign*/,
+                 int const cb) override {
+    clover_product(out, in, inv_clover[cb], upstream_dslash.getGeometry());
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    upstream_dslash.dslash(res, psi, u, inv_clover[cb], isign, cb);
+  }
+
+  // NOTE(review): `alpha` and `beta` are not forwarded; the stored clover term and
+  // `mass_factor_beta` are passed to QPhiX instead -- confirm that callers expect this.
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb], mass_factor_beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::ClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+
+  /**
+    Clover term, one pointer per checkerboard (hence the array dimension).
+
+    `dslash` and `achimbdpsi` must keep the argument list prescribed by the base class, which has
+    no slot for the clover term, so that callers need not distinguish clover from non-clover
+    variants. The term is therefore stored as a member; to use different clover fields a new
+    operator has to be constructed.
+    */
+  CloverBlock *clover[2];
+
+  /// Inverse clover term, stored like \ref clover.
+  CloverBlock *inv_clover[2];
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+class WilsonClovTMDslash : public Dslash<FT, veclen, soalen, compress12> {
+ public:
+  using Spinor = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock;
+  using SU3MatrixBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock;
+  using FullCloverBlock =
+      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock;
+  using CloverBlock = typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock;
+
+  WilsonClovTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
+                     double const t_boundary_, double const aniso_coeff_S_,
+                     double const aniso_coeff_T_, double const mass_, double const twisted_mass_,
+                     CloverBlock *const (&clover_)[2],
+                     FullCloverBlock *const (&inv_clover_)[2][2], bool use_tbc_[4] = nullptr,
+                     double tbc_phases_[4][2] = nullptr)
+      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
+                                               mass_, use_tbc_, tbc_phases_),
+        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
+        mass_factor_alpha(4.0 + mass_),
+        mass_factor_beta(0.25),
+        derived_mu(twisted_mass_ / mass_factor_alpha),
+        derived_mu_inv(mass_factor_alpha /
+                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {
+    for (int cb = 0; cb < 2; ++cb) {
+      clover[cb] = clover_[cb];
+      for (int fl = 0; fl < 2; ++fl) {
+        inv_clover[cb][fl] = inv_clover_[cb][fl];
+      }
+    }
+  }
+
+  void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) override {
+    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
+    // TODO: add twisted mass here
+  }
+
+  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
+                 int const cb) override {
+    // The second field of the checkerboard is used for isign == -1, the first one otherwise.
+    int const fl = (isign == -1) ? 1 : 0;
+    clover_product(out, in, inv_clover[cb][fl], upstream_dslash.getGeometry());
+  }
+
+  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
+              int const isign, int const cb) override {
+    auto const inv_pair = const_cast<const FullCloverBlock **>(inv_clover[cb]);
+    upstream_dslash.dslash(res, psi, u, inv_pair, isign, cb);
+  }
+
+  // NOTE(review): `alpha` and `beta` are not forwarded; the stored clover term and
+  // `mass_factor_beta` are passed to QPhiX instead -- confirm that callers expect this.
+  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
+                  const SU3MatrixBlock *const u, double const alpha, double const beta,
+                  int const isign, int const cb) override {
+    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb],
+                                         mass_factor_beta, isign, cb);
+  }
+
+ private:
+  ::QPhiX::TMClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
+
+  double const mass_factor_alpha;
+  double const mass_factor_beta;
+  double const derived_mu;
+  double const derived_mu_inv;
+
+  CloverBlock *clover[2];
+  /* For twisted clover, each checkerboard has two fields which differ in the sign of the
+   * twisted quark mass. The outer index is the checkerboard index; the inner index can be
+   * thought of as living in flavour space.
+   */
+  FullCloverBlock *inv_clover[2][2];
+};
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void WilsonTMDslash<FT, veclen, soalen, compress12>::helper_A_chi(Spinor *const out,
+                                                                  Spinor const *const in,
+                                                                  double const factor_a,
+                                                                  double const factor_b) {
+  // Site-wise out = factor_b * (1 - i * factor_a * gamma_5) * in, where gamma_5 acts as +1 on
+  // spin components 0, 1 and as -1 on components 2, 3. `out` may be the same field as `in`.
+  auto const nVecs = upstream_dslash.getGeometry().nVecs();
+  auto const Pxy = upstream_dslash.getGeometry().getPxy();
+  auto const Pxyz = upstream_dslash.getGeometry().getPxyz();
+  // One SoA block holds `soalen` consecutive x sites, so each block is visited exactly once.
+  uint64_t const n_x_blocks = (LX / 2 + soalen - 1) / soalen;
+
+  for (uint64_t t = 0; t < T; t++)
+    for (uint64_t z = 0; z < LZ; z++)
+      for (uint64_t y = 0; y < LY; y++)
+        for (uint64_t x_block = 0; x_block < n_x_blocks; x_block++) {
+          uint64_t const qphix_idx = t * Pxyz + z * Pxy + y * nVecs + x_block;
+
+          for (int color = 0; color < 3; ++color) {
+            for (int four_spin = 0; four_spin < 4; ++four_spin) {
+              // Implement the $\gamma_5$ structure.
+              auto const signed_factor_a = factor_a * (four_spin < 2 ? 1.0 : -1.0);
+              auto &out_bcs = out[qphix_idx][color][four_spin];
+              auto const &in_bcs = in[qphix_idx][color][four_spin];
+
+              for (int v = 0; v < soalen; ++v) {
+                // Read both parts before writing so that in-place application stays correct.
+                auto const in_re = in_bcs[re][v];
+                auto const in_im = in_bcs[im][v];
+                out_bcs[re][v] = factor_b * (in_re + signed_factor_a * in_im);
+                out_bcs[im][v] = factor_b * (in_im - signed_factor_a * in_re);
+              }
+            }
+          }
+        }  // volume
+}
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void Dslash<FT, veclen, soalen, compress12>::prepare_source(Spinor *const tilde_b_odd,
+                                                            Spinor const *const b_even,
+                                                            Spinor const *const b_odd,
+                                                            SU3MatrixBlock const *const u) {
+  // Computes tilde_b_odd = 1/2 D_oe M_ee^{-1} b_even + b_odd.
+  auto Mee_be = QPhiX::makeFourSpinorHandle(*geom);
+  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
+                                                            aniso_coeff_T, mass);
+  A_inv_chi(Mee_be.get(), b_even, 1, cb_even);                   // M_ee^{-1} b_even
+  plain_dslash.dslash(tilde_b_odd, Mee_be.get(), u, 1, cb_odd);  // D_oe M_ee^{-1} b_even
+
+  // Add b_odd (not the temporary): assumes aypx(a, x, y) is y = a * y + x -- TODO confirm.
+  // FIXME Perhaps use a variable number of BLAS threads here (last parameter).
+  QPhiX::aypx(0.5, b_odd, tilde_b_odd, *geom, 1);
+}
+
+template <typename FT, int veclen, int soalen, bool compress12>
+void Dslash<FT, veclen, soalen, compress12>::reconstruct_solution(Spinor *const x_even,
+                                                                  Spinor const *const b_even,
+                                                                  Spinor const *const x_odd,
+                                                                  SU3MatrixBlock const *const u) {
+  auto rhs = QPhiX::makeFourSpinorHandle(*geom);
+  // The hopping term is taken from a temporary plain Wilson operator with our parameters.
+  WilsonDslash<FT, veclen, soalen, compress12> hopping(geom, t_boundary, aniso_coeff_S,
+                                                       aniso_coeff_T, mass);
+  hopping.dslash(rhs.get(), x_odd, u, 1, cb_even);  // rhs = D x_odd on the even sites
+  QPhiX::aypx(0.5, b_even, rhs.get(), *geom, 1);    // combines b_even into rhs, in place
+  this->A_inv_chi(x_even, rhs.get(), 1, cb_even);   // x_even = A_inv_chi applied to rhs
+}
+}
diff --git a/src/lib/qphix/qphix_interface.cpp b/src/lib/qphix/qphix_interface.cpp
new file mode 100644
index 000000000..2c61427dd
--- /dev/null
+++ b/src/lib/qphix/qphix_interface.cpp
@@ -0,0 +1,2192 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#include "qphix_interface.h"
+#include "qphix_interface.hpp"
+#include "qphix_interface_utils.hpp"
+#include "qphix_types.h"
+#include "qphix_veclen.h"
+
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+
+extern "C" {
+#ifdef HAVE_CONFIG_H
+#include "tmlqcd_config.h"
+#endif
+#include "boundary.h"
+#include "geometry_eo.h"
+#include "gettime.h"
+#include "global.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "linalg/diff.h"
+#include "linalg/square_norm.h"
+#include "misc_types.h"
+#include "operator/Hopping_Matrix.h"
+#include "operator/clover_leaf.h"
+#include "operator/clovertm_operators.h"
+#include "operator_types.h"
+#include "struct_accessors.h"
+
+// for the normalisation of the heavy doublet when running
+// RHMC
+#include "phmc.h"
+
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver.h"
+#include "solver/solver_field.h"
+#include "solver/solver_params.h"
+#include "solver/solver_types.h"
+#include "start.h"
+#include "xchange/xchange_gauge.h"
+}
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include <qphix/blas_new_c.h>
+#include <qphix/clover.h>
+#include <qphix/inv_dummy_hermtest.h>
+#include <qphix/inv_richardson_multiprec.h>
+#include <qphix/invbicgstab.h>
+#include <qphix/invcg.h>
+#include <qphix/minvcg.h>
+#include <qphix/ndtm_reuse_operator.h>
+#include <qphix/ndtm_reuse_operator_clover.h>
+#include <qphix/print_utils.h>
+#include <qphix/qphix_config.h>
+#include <qphix/twisted_mass.h>
+#include <qphix/twisted_mass_clover.h>
+#include <qphix/wilson.h>
+#include <cfloat>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+using namespace tmlqcd;
+
+tm_QPhiXParams_t qphix_input;
+
+int By;      // copied from params.By in _initQphix
+int Bz;      // copied from params.Bz in _initQphix
+int NCores;  // copied from params.NCores in _initQphix
+int Sy;      // copied from params.Sy in _initQphix
+int Sz;      // copied from params.Sz in _initQphix
+int PadXY;   // copied from params.PadXY in _initQphix
+int PadXYZ;  // copied from params.PadXYZ in _initQphix
+int MinCt;   // copied from params.MinCt in _initQphix
+int N_simt;  // Sy * Sz, computed in _initQphix
+bool compress12;  // true when 12-parameter (two row) gauge compression is selected
+QphixPrec_t qphix_precision;        // `precision_` argument of _initQphix
+QphixPrec_t qphix_inner_precision;  // `inner_precision_` argument of _initQphix
+
+int subLattSize[4];  // local lattice extents {LX, LY, LZ, T}
+int lattSize[4];     // global lattice extents: local extent times process count per direction
+int qmp_geom[4];     // process counts {x, y, z, t} handed to QMP (QPHIX_QMP_COMMS builds only)
+int qmp_tm_map[4];   // dimension permutation handed to QMP together with qmp_geom
+
+// angles for boundary phases, values come from read_input
+extern double X0, X1, X2, X3;
+
+bool use_tbc[4];          // per direction (x, y, z, t): true when the boundary angle is non-zero
+double tbc_phases[4][2];  // per direction (x, y, z, t): two phase components, see _initQphix
+// we always use twisted boundary conditions, which means that we are always
+// periodic in time and any possible anti-periodicity is implemented via
+// the phase
+double constexpr t_boundary = 1.0;
+
+template <typename T>
+struct rsdTarget {  // per-type constant; presumably a residual target -- TODO confirm at use sites
+  static const double value;  // defined through explicit specialisations such as those below
+};
+
+template <>
+const double rsdTarget<QPhiX::half>::value = 1.0e-3;
+
+template <>
+const double rsdTarget<float>::value = 1.0e-8;
+
+/**
+  Copies the tmLQCD lattice layout and the QPhiX parameters into the globals of this file.
+
+  In order: global and local lattice extents, twisted boundary flags and phases, the values
+  from `params`, the gauge compression flag and the two precisions. In builds with
+  `QPHIX_QMP_COMMS` the QMP logical topology is declared on the first call only.
+
+  \param argc Not used in this function
+  \param argv Not used in this function
+  \param params Source of By, Bz, NCores, Sy, Sz, PadXY, PadXYZ and MinCt
+  \param c12 Gauge compression request: 12 selects two row compression, 8 is mapped to 12 with
+             a message, any other value disables compression
+  \param precision_ Stored in `qphix_precision`
+  \param inner_precision_ Stored in `qphix_inner_precision`
+  */
+void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_,
+                QphixPrec_t inner_precision_) {
+  // Remembers across calls whether the QMP logical topology has already been declared.
+  static bool qmp_topo_initialised = false;
+
+  // Process counts and local lattice extents, both in the order x, y, z, t.
+  int const nproc[4] = {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t};
+  int const local_extent[4] = {LX, LY, LZ, T};
+  for (int dim = 0; dim < 4; ++dim) {
+    lattSize[dim] = local_extent[dim] * nproc[dim];  // Global Lattice Size
+    subLattSize[dim] = local_extent[dim];            // Local Lattice Size
+  }
+
+  // extract twisted boundary conditions: entry `dim` (x, y, z, t) is built from the tmLQCD
+  // angle and phase with index 1, 2, 3, 0 respectively
+  double const angle[4] = {X1, X2, X3, X0};
+  double const *const phase[4] = {(double *)(&phase_1), (double *)(&phase_2),
+                                  (double *)(&phase_3), (double *)(&phase_0)};
+  for (int dim = 0; dim < 4; ++dim) {
+    use_tbc[dim] = (fabs(angle[dim]) > DBL_EPSILON);
+    tbc_phases[dim][0] = -phase[dim][0] / g_kappa;
+    tbc_phases[dim][1] = -phase[dim][1] / g_kappa;
+  }
+
+  // the values from `params` are copied unchanged
+  By = params.By;
+  Bz = params.Bz;
+  NCores = params.NCores;
+  Sy = params.Sy;
+  Sz = params.Sz;
+  PadXY = params.PadXY;
+  PadXYZ = params.PadXYZ;
+  MinCt = params.MinCt;
+  N_simt = Sy * Sz;
+
+  // 8-parameter compression is not available: report it and use two row compression
+  if (c12 == 8) {
+    QPhiX::masterPrintf(
+        "# INFO QphiX: 8-parameter gauge compression not supported, using two row compression "
+        "instead!\n");
+    c12 = 12;
+  }
+  compress12 = (c12 == 12);
+  qphix_precision = precision_;
+  qphix_inner_precision = inner_precision_;
+
+#ifdef QPHIX_QMP_COMMS
+  // Declare the logical topology
+  if (!qmp_topo_initialised) {
+    // the QMP topology is the one implied by the number of processes in each
+    // dimension as required by QPHIX ( x fastest to t slowest running )
+    for (int dim = 0; dim < 4; ++dim) {
+      qmp_geom[dim] = nproc[dim];
+    }
+
+    // in order for the topologies to agree between tmLQCD and QPhiX, the dimensions need to be
+    // permuted
+    // since Z is fastest in tmLQCD and X is second-slowest
+    qmp_tm_map[0] = 2;
+    qmp_tm_map[1] = 1;
+    qmp_tm_map[2] = 0;
+    qmp_tm_map[3] = 3;
+    if (QMP_declare_logical_topology_map(qmp_geom, 4, qmp_tm_map, 4) != QMP_SUCCESS) {
+      QMP_error("Failed to declare QMP Logical Topology\n");
+      abort();
+    }
+    // longish test to check if the logical coordinates are correctly mapped
+    if (g_debug_level >= 5) {
+      for (int proc = 0; proc < g_nproc; proc++) {
+        if (proc == g_proc_id) {
+          const int coordinates[4] = {g_proc_coords[1], g_proc_coords[2], g_proc_coords[3],
+                                      g_proc_coords[0]};
+          int id = QMP_get_node_number_from(coordinates);
+          int *qmp_coords = QMP_get_logical_coordinates_from(id);
+          fflush(stdout);
+          printf("QMP id: %3d x:%3d y:%3d z:%3d t:%3d\n", id, qmp_coords[0], qmp_coords[1],
+                 qmp_coords[2], qmp_coords[3]);
+          printf("MPI id: %3d x:%3d y:%3d z:%3d t:%3d\n\n", g_proc_id, g_proc_coords[1],
+                 g_proc_coords[2], g_proc_coords[3], g_proc_coords[0]);
+          free(qmp_coords);
+          fflush(stdout);
+        }
+        // every process reaches exactly one barrier per iteration, whether it printed or not
+        MPI_Barrier(MPI_COMM_WORLD);
+      }
+    }
+    qmp_topo_initialised = true;
+  }
+#endif
+
+#ifdef QPHIX_QPX_SOURCE
+  // NOTE(review): thread_bind, NCores_user, Sy_user and Sz_user are not declared in the visible
+  // part of this file -- confirm they exist before enabling QPHIX_QPX_SOURCE.
+  if (thread_bind) {
+    QPhiX::setThreadAffinity(NCores_user, Sy_user * Sz_user);
+  }
+  QPhiX::reportAffinity();
+#endif
+}
+
+void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_) {
+  _initQphix(argc, argv, params, c12, precision_, precision_);  // inner precision = precision_
+}
+
+// Finalize the QPhiX library (the body is empty, so this is currently a no-op)
+void _endQphix() {}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_clover_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::CloverBlock *qphix_clover, int cb,
+    bool inverse, bool fl_offdiag = false) {
+  const double startTime = gettime();
+  // Fills qphix_clover for checkerboard cb from sw, or from sw_inv when inverse is true.
+  /* the spin-colour clover term in sw_term and the corresponding inverse
+   * in sw_inv are stored in the tmLQCD gamma basis.
+   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
+   * spinor and then apply the same transformation to the output spinor
+   * ( we have V^dagger = V and V*V = 1 )
+   * Thus, in order to translate the clover field, we need to copy
+   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
+   * This way, the clover term will be in the correct gamma basis.
+   *
+   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
+   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
+   * Below, each sAB represents one 3x3 colour matrix
+   *
+   *                +s33 -s32    0    0
+   *  T' = V*T*V =  -s23 +s22    0    0
+   *                   0    0 +s11 -s10
+   *                   0    0 -s01 +s00
+   *
+   * Such that the half-spinor blocks are inverted and within these, the ordering is
+   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
+   * each other and this is preserved by the transformation.
+   *
+   * The QPhiX (Wilson) clover term is stored as 12 reals on the diagonal
+   * in two 6-element vectors, one for each half-spinor spin pair
+   * and two sets of off-diagonal complex components.
+   *
+   * In addition, colour matrices are transposed in QPhiX.
+   *
+   * The tmLQCD clover term is stored as:
+   *
+   *      s00 s01
+   *          s11
+   * T =          s22 s23
+   *                  s33
+   *
+   * with indexing
+   *
+   *     sw[0][0] sw[1][0]
+   *              sw[2][0]
+   *                       sw[0][1] sw[1][1]
+   *                                sw[2][1]
+   *
+   * The inverse has four su3 blocks instead and is indexed
+   *     sw_inv[0][0] sw_inv[1][0]
+   *     sw_inv[3][0] sw_inv[2][0]
+   *                               sw_inv[0][1] sw_inv[1][1]
+   *                               sw_inv[3][1] sw_inv[2][1]
+   *
+   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
+   *
+   * There is a special case for the non-degenerate twisted clover operator. The
+   * flavour-off-diagonal components of the inverse clover term do not have an imaginary part on the
+   * spin-colour diagonal. They can thus be stored as CloverBlock, which is done in the QPhiX
+   * implementation of the ND tmclover operator.
+   *
+   * As a hack, this inverse is prepared by sw_invert_epsbar and placed into the last
+   * VOLUME/2 sites of sw_inv. Reading from there is triggered by the boolean
+   * fl_offdiag.
+   */
+  // cb selects the x-parity of the sites read; fl_offdiag (with inverse) adds VOLUME/2 to the site index.
+  // rescale to get clover term (or its inverse) in the physical normalisation
+  // rather than the kappa normalisation
+  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
+  su3 ***tm_clover = inverse ? sw_inv : sw;
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;  // NOTE(review): Ns is not referenced in this function
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // packer for Wilson clover (real diagonal + complex upper-triangular)
+  /* for the index in the off_diagN arrays, we map to an index in the su3 struct
+   * keeping in mind complex conjugation
+   * The off-diagonal in QPhiX is stored as follows:
+   *
+   * 0 1 3 6 10
+   *   2 4 7 11
+   *     5 8 12
+   *       9 13
+   *         14
+   *
+   * which we are going to map to su3 in blocks
+   *
+   *     0* 1*
+   *        2*
+   *
+   * 3   4  5
+   * 6   7  8
+   * 10 11 12
+   *
+   *   9* 13*
+   *      14*
+   *
+   * where the asterisk indicates complex conjugation. As a linear array then,
+   * these mappings are (offsets counted in doubles from the c00 member):
+   *
+   */
+  const int od_su3_offsets[15] = {Nz,
+                                  2 * Nz,            //     0 1
+                                  Nc * Nz + 2 * Nz,  //       2
+
+                                  0,
+                                  Nz,
+                                  2 * Nz,  // 3  4  5
+                                  Nc * Nz,
+                                  Nc * Nz + Nz,
+                                  Nc * Nz + 2 * Nz,  // 6  7  8
+
+                                  Nz,  //     9
+
+                                  2 * Nc * Nz,
+                                  2 * Nc * Nz + Nz,
+                                  2 * Nc * Nz + 2 * Nz,  // 10 11 12
+
+                                  2 * Nz,
+                                  Nc * Nz + 2 * Nz};  // 13 14
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
+            int64_t xx = (y % ngy) * SOALEN + x_soa;
+            int64_t q_cb_x_coord = x_soa + v * SOALEN;
+            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+
+            // the inverse of the clover term is in even-odd ordering
+            // while the clover term itself is lexicographically ordered
+            // for the special case of the nd tmclover operator, the inverse of the flavour
+            // off-diagonal components is stored in the last VOLUME/2 elements of sw_inv
+            int64_t tm_idx =
+                (inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z]) +
+                ((inverse && fl_offdiag) ? VOLUME / 2 : 0);
+
+            int b_idx;
+
+            // we begin with the diagonal elements in CloverBlock
+            for (int d = 0; d < 6; d++) {
+              // choose the block in sw which corresponds to the block in T'
+              b_idx = d < 3 ? 2 : 0;
+              // read the double at offset (Nc*Nz+Nz)*(d%3) from c00 (the real diagonal entries, assuming su3 is 3x3 row-major complex - TODO confirm)
+              qphix_clover[block].diag1[d][xx] = QPhiX::rep<FT, double>(
+                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                    (Nc * Nz + Nz) * (d % 3)) *
+                  scale);
+
+              qphix_clover[block].diag2[d][xx] = QPhiX::rep<FT, double>(
+                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                    (Nc * Nz + Nz) * (d % 3)) *
+                  scale);
+            }
+
+            b_idx = 2;  // s33 and s11
+            for (int od : {0, 1, 2}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *  // negates the imaginary part (the conjugation marked * above)
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+
+                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+              }
+            }
+
+            b_idx = 1;  // s32 and s10
+            for (int od : {3, 4, 5, 6, 7, 8, 10, 11, 12}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    (-scale));  // sign flip of the spin-off-diagonal block, cf. -s32 / -s10 in T'
+
+                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    (-scale));
+              }
+            }
+
+            b_idx = 0;  // s22 and s00
+            for (int od : {9, 13, 14}) {
+              for (int reim : {0, 1}) {
+                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+
+                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
+                    (reim == 1 ? -1.0 : 1.0) *
+                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
+                      od_su3_offsets[od] + reim) *
+                    scale);
+              }
+            }
+
+          }  // x_soa
+        }  // for(v)
+      }  // for(y)
+    }  // for(z)
+  }  // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf(
+        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (CloverBlock): %f secs\n",
+        diffTime);
+  }
+}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_clover_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FullCloverBlock *qphix_clover[2],
+    int cb, bool inverse) {
+  const double startTime = gettime();
+  // Fills qphix_clover[fl] for fl = 0, 1 on checkerboard cb from sw, or from sw_inv when inverse is true.
+  /* the spin-colour clover term in sw_term and the corresponding inverse
+   * in sw_inv are stored in the tmLQCD gamma basis.
+   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
+   * spinor and then apply the same transformation to the output spinor
+   * ( we have V^dagger = V and V*V = 1 )
+   * Thus, in order to translate the clover field, we need to copy
+   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
+   * This way, the clover term will be in the correct gamma basis.
+   *
+   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
+   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
+   * Below, each sAB represents one 3x3 colour matrix
+   *
+   *                +s33 -s32    0    0
+   *  T' = V*T*V =  -s23 +s22    0    0
+   *                   0    0 +s11 -s10
+   *                   0    0 -s01 +s00
+   *
+   * Such that the half-spinor blocks are inverted and within these, the ordering is
+   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
+   * each other and this is preserved by the transformation.
+   *
+   * The QPhiX (tmclover) clover term and its inverse are stored as a pair of full
+   * 6x6 complex matrices which are multiplied with the spinor in exactly the same way
+   * as in tmLQCD.
+   *
+   * The tmLQCD clover term is stored as:
+   *
+   *      s00 s01
+   *          s11
+   * T =          s22 s23
+   *                  s33
+   *
+   * with indexing
+   *
+   *     sw[0][0] sw[1][0]
+   *              sw[2][0]
+   *                       sw[0][1] sw[1][1]
+   *                                sw[2][1]
+   *
+   * The inverse has four su3 blocks instead and is indexed
+   *     sw_inv[0][0] sw_inv[1][0]
+   *     sw_inv[3][0] sw_inv[2][0]
+   *                               sw_inv[0][1] sw_inv[1][1]
+   *                               sw_inv[3][1] sw_inv[2][1]
+   *
+   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
+   */
+  // When inverse is false, -/+ amu * (1 - 2 * fl) is added to the imaginary diagonal of block1 / block2.
+  // rescale to get clover term (or its inverse) in the physical normalisation
+  // rather than the kappa normalisation
+  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
+  su3 ***tm_clover = inverse ? sw_inv : sw;
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;  // NOTE(review): Ns is not referenced in this function
+  const int Nc = 3;
+  const int Nz = 2;
+
+  const double amu = g_mu / (2.0 * g_kappa);
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
+            int64_t xx = (y % ngy) * SOALEN + x_soa;
+            int64_t q_cb_x_coord = x_soa + v * SOALEN;
+            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
+
+            // the inverse of the clover term is in even-odd ordering
+            // while the clover term itself is lexicographically ordered
+            int64_t tm_idx =
+                inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z];
+
+            for (int fl : {0, 1}) {
+              if (inverse && fl == 1) {
+                // the inverse clover term for the second flavour is stored at an offset
+                tm_idx += VOLUME / 2;
+              }
+              for (int q_hs : {0, 1}) {
+                auto &hs_block =
+                    ((q_hs == 0) ? qphix_clover[fl][block].block1 : qphix_clover[fl][block].block2);
+                for (int q_sc1 = 0; q_sc1 < 6; q_sc1++) {
+                  for (int q_sc2 = 0; q_sc2 < 6; q_sc2++) {
+                    const int q_s1 = q_sc1 / 3;
+                    const int q_s2 = q_sc2 / 3;
+                    const int q_c1 = q_sc1 % 3;
+                    const int q_c2 = q_sc2 % 3;
+
+                    // invert in spin as required by V*T*V
+                    const int t_hs = 1 - q_hs;
+                    // the indices inside the half-spinor are also inverted
+                    // (which transposes them, of course)
+                    const int t_s1 = 1 - q_s1;
+                    const int t_s2 = 1 - q_s2;
+                    // carry out the mapping from T' to T, keeping in mind that for the inverse
+                    // there are four blocks also on the tmLQCD side, otherwise there are just three
+                    const int t_b_idx = t_s1 + t_s2 + ((inverse && t_s1 == 1 && t_s2 == 0) ? 2 : 0);
+                    for (int reim : {0, 1}) {
+                      hs_block[q_sc1][q_sc2][reim][xx] = QPhiX::rep<FT, double>(
+                          scale *
+                              // off-diagonal (odd-numbered) blocks change sign
+                              (t_b_idx & 1 ? (-1.0) : 1.0) *
+                              // if not doing the inverse and in the bottom-left block, need to
+                              // complex conjugate
+                              ((!inverse && (t_s1 == 1 && t_s2 == 0) && reim == 1) ? -1.0 : 1.0) *
+                              *(reinterpret_cast<double const *const>(
+                                    &(tm_clover[tm_idx][t_b_idx][t_hs].c00)) +
+                                // if not doing the inverse and in the bottom-left block, transpose
+                                // in colour
+                                // because we're actually reading out of the top-right block
+                                Nz * ((!inverse && (t_s1 == 1 && t_s2 == 0)) ? Nc * q_c2 + q_c1
+                                                                             : Nc * q_c1 + q_c2) +
+                                reim) +
+                          // in the QPhiX gamma basis, the twisted quark mass enters with the
+                          // opposite
+                          // sign for consistency
+                          ((!inverse && q_sc1 == q_sc2 && q_hs == 0 && reim == 1)
+                               ? -amu * (1 - 2 * fl)
+                               : 0) +
+                          ((!inverse && q_sc1 == q_sc2 && q_hs == 1 && reim == 1)
+                               ? amu * (1 - 2 * fl)
+                               : 0));
+                    }
+                  }  // q_sc2
+                }  // q_sc1
+              }  // q_hs
+            }  // fl
+
+          }  // x_soa
+        }  // for(v)
+      }  // for(y)
+    }  // for(z)
+  }  // for(t)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf(
+        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (FullCloverBlock): %f secs\n",
+        diffTime);
+  }
+}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_gauge_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb0,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb1) {
+  const double startTime = gettime();
+  // Fills qphix_gauge_cb0 / qphix_gauge_cb1 from g_gauge_field: a backward and a forward link per dimension.
+  // Number of elements in spin, color & complex
+  // Here c1 is QPhiX's outer color, and c2 the inner one
+  const int Ns = 4;  // NOTE(review): Ns is not referenced in this function
+  const int Nc1 = compress12 ? 2 : 3;
+  const int Nc2 = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto ngy = geom.nGY();
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // orderings of the direction index "\mu" in tmlQCD
+  // and QPhiX, respectively
+  // in qphix, the Dirac operator is applied in the order
+  //   -+x -> -+y -> -+z -> -+t
+  // while tmlqcd does
+  //   -+t -> -+x -> -+y -> -+z
+  // same as the lattice ordering
+  // The mapping between the application dimensions is thus:
+  //  tmlqcd_dim(t(0) -> x(1) -> y(2) -> z(3)) = qphix_dim( t(3) -> x(0) -> y(1) -> z(2) )
+  const int change_dim[4] = {1, 2, 3, 0};
+
+  // Get the base pointer for the (global) tmlQCD gauge field
+  xchange_gauge(g_gauge_field);
+  const double *in = reinterpret_cast<double *>(&g_gauge_field[0][0].c00);  // NOTE(review): 'in' is not referenced below
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++)
+    for (int64_t z = 0; z < LZ; z++)
+      for (int64_t y = 0; y < LY; y++)
+        for (int64_t v = 0; v < nVecs; v++) {
+          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
+
+          for (int dim = 0; dim < 4; dim++)     // dimension == QPhiX \mu
+            for (int c1 = 0; c1 < Nc1; c1++)    // QPhiX convention color 1 (runs up to 2 or 3)
+              for (int c2 = 0; c2 < Nc2; c2++)  // QPhiX convention color 2 (always runs up to 3)
+                for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+                  int64_t xx = (y % ngy) * SOALEN + x_soa;
+                  int64_t q_cb_x_coord = x_soa + v * SOALEN;
+                  int64_t tm_x_coord_cb0 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 0);
+                  int64_t tm_x_coord_cb1 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 1);
+
+                  int64_t tm_idx_cb0;
+                  int64_t tm_idx_cb1;
+
+                  // backward / forward
+                  for (int dir = 0; dir < 2; dir++) {
+                    if (dir == 0) {
+                      tm_idx_cb0 = g_idn[g_ipt[t][tm_x_coord_cb0][y][z]][change_dim[dim]];
+                      tm_idx_cb1 = g_idn[g_ipt[t][tm_x_coord_cb1][y][z]][change_dim[dim]];
+                    } else {
+                      tm_idx_cb0 = g_ipt[t][tm_x_coord_cb0][y][z];
+                      tm_idx_cb1 = g_ipt[t][tm_x_coord_cb1][y][z];
+                    }
+                    for (int reim = 0; reim < Nz; reim++) {
+                      // Note:
+                      // -----
+                      // 1. \mu in QPhiX runs from 0..7 for all eight neighbouring
+                      // links.
+                      //    Here, the ordering of the direction (backward/forward)
+                      //    is the same
+                      //    for tmlQCD and QPhiX, but we have to change the
+                      //    ordering of the dimensions.
+                      int q_mu = 2 * dim + dir;
+                      // su3_get_elem receives the colour indices as (c2, c1), swapped relative to the QPhiX [c1][c2] index
+                      qphix_gauge_cb0[block][q_mu][c1][c2][reim][xx] =
+                          QPhiX::rep<FT, double>(su3_get_elem(
+                              &(g_gauge_field[tm_idx_cb0][change_dim[dim]]), c2, c1, reim));
+                      qphix_gauge_cb1[block][q_mu][c1][c2][reim][xx] =
+                          QPhiX::rep<FT, double>(su3_get_elem(
+                              &(g_gauge_field[tm_idx_cb1][change_dim[dim]]), c2, c1, reim));
+                    }
+                  }
+                }  // for(dim,c1,c2,x_soa)
+        }  // outer loop (t,z,y,v)
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_gauge_to_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Copies a tmLQCD even-odd spinor into QPhiX FourSpinorBlocks on checkerboard cb
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_eo_spinor_to_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor const *const tm_eo_spinor,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
+    const int cb) {
+  const double t_begin = gettime();
+
+  // extents of the spin, colour and real/imaginary indices
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // layout parameters queried from the QPhiX geometry
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+  const auto Nxh = geom.Nxh();
+
+  // Gamma-basis change between tmLQCD and QPhiX: QPhiX spin s is read from tmLQCD
+  // spin tm_spin[s] and multiplied by sign[s].
+  const int sign[4] = {1, -1, -1, 1};
+  const int tm_spin[4] = {3, 2, 1, 0};
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          // index of the FourSpinorBlock for this (t, z, y, v)
+          const int64_t q_block = t * Pxyz + z * Pxy + y * nVecs + v;
+          // parity of t+y+z XOR cb: 0 -> full x is 2*x_cb, 1 -> full x is 2*x_cb+1
+          const int64_t x_parity = ((t + y + z) & 1) ^ cb;
+
+          for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+            // full-lattice x coordinate and the matching even-odd site
+            const int64_t tm_x = (v * SOALEN + x_soa) * 2 + x_parity;
+            const spinor *const src = &(tm_eo_spinor[g_lexic2eosub[g_ipt[t][tm_x][y][z]]]);
+
+            for (int col = 0; col < Nc; col++) {
+              for (int q_spin = 0; q_spin < Ns; q_spin++) {
+                for (int reim = 0; reim < Nz; reim++) {
+                  qphix_spinor[q_block][col][q_spin][reim][x_soa] = QPhiX::rep<FT, double>(
+                      sign[q_spin] * spinor_get_elem(src, tm_spin[q_spin], col, reim));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const double elapsed = gettime() - t_begin;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_to_QPhiX: %f secs\n",
+                        elapsed);
+  }
+}
+
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_eo_spinor_from_QPhiX(
+    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor *tm_eo_spinor,
+    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
+    const int cb, double normFac = 1.0) {
+  // Copies QPhiX FourSpinorBlocks on checkerboard cb back into a tmLQCD even-odd spinor,
+  // multiplying every component by normFac.
+  const double t_begin = gettime();
+
+  // extents of the spin, colour and real/imaginary indices
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+  const auto Nxh = geom.Nxh();
+
+  // Gamma-basis change: QPhiX spin s goes to tmLQCD spin tm_spin[s] with factor sign[s].
+  const int sign[4] = {1, -1, -1, 1};
+  const int tm_spin[4] = {3, 2, 1, 0};
+
+#pragma omp parallel for collapse(4)
+  for (int64_t t = 0; t < T; t++) {
+    for (int64_t z = 0; z < LZ; z++) {
+      for (int64_t y = 0; y < LY; y++) {
+        for (int64_t v = 0; v < nVecs; v++) {
+          // index of the FourSpinorBlock for this (t, z, y, v)
+          const int64_t q_block = t * Pxyz + z * Pxy + y * nVecs + v;
+          // parity of t+y+z XOR cb: 0 -> full x is 2*x_cb, 1 -> full x is 2*x_cb+1
+          const int64_t x_parity = ((t + y + z) & 1) ^ cb;
+
+          for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
+            // full-lattice x coordinate and the matching even-odd site
+            const int64_t tm_x = (v * SOALEN + x_soa) * 2 + x_parity;
+            spinor *const dst = &(tm_eo_spinor[g_lexic2eosub[g_ipt[t][tm_x][y][z]]]);
+
+            for (int col = 0; col < Nc; col++) {
+              for (int q_spin = 0; q_spin < Ns; q_spin++) {
+                // sign and normalisation are combined first, then applied to the converted value
+                const double factor = sign[q_spin] * normFac;
+                const double re = factor * QPhiX::rep<double, FT>(qphix_spinor[q_block][col][q_spin][0][x_soa]);
+                const double im = factor * QPhiX::rep<double, FT>(qphix_spinor[q_block][col][q_spin][1][x_soa]);
+                spinor_set_elem(dst, tm_spin[q_spin], col, re, im);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const double elapsed = gettime() - t_begin;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_from_QPhiX: %f secs\n",
+                        elapsed);
+  }
+}
+
+// Scatter a full tmLQCD spinor (sites indexed through g_ipt) into the cb0 and cb1 QPhiX spinors
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_spinor_to_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+                             double const *tm_spinor, FT *qphix_spinor_cb0, FT *qphix_spinor_cb1) {
+  const double t_begin = gettime();
+
+  // extents of the spin, colour and real/imaginary indices
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // layout parameters taken from the QPhiX geometry
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // Gamma-basis change between tmLQCD and QPhiX: QPhiX spin s is read from
+  // tmLQCD spin tm_spin[s] and multiplied by sign[s].
+  const int sign[4] = {1, -1, -1, 1};
+  const int tm_spin[4] = {3, 2, 1, 0};
+
+// Visit every lattice site once; each iteration works out where the site lives
+// in the tmLQCD array and in the QPhiX array of the matching checkerboard.
+#pragma omp parallel for collapse(4)
+  for (uint64_t t = 0; t < T; t++) {
+    for (uint64_t x = 0; x < LX; x++) {
+      for (uint64_t y = 0; y < LY; y++) {
+        for (uint64_t z = 0; z < LZ; z++) {
+          // x is halved by the checkerboarding; split it into the SOA vector
+          // number and the position inside that vector
+          const uint64_t x_cb = x / 2;
+          const uint64_t vec = x_cb / SOALEN;
+          const uint64_t lane = x_cb % SOALEN;
+
+          // site indices on the QPhiX and on the tmLQCD side
+          const uint64_t q_site = t * Pxyz + z * Pxy + y * nVecs + vec;
+          const uint64_t tm_site = g_ipt[t][x][y][z];
+
+          // base pointers for this site: odd t+x+y+z is written to cb1,
+          // even t+x+y+z is written to cb0
+          const bool odd = (t + x + y + z) & 1;
+          FT *const dst =
+              (odd ? qphix_spinor_cb1 : qphix_spinor_cb0) + SOALEN * Nz * Nc * Ns * q_site;
+          const double *const src = tm_spinor + Ns * Nc * Nz * tm_site;
+
+          // copy all components; spin is the QPhiX spin index, reim selects RE or IM
+          for (int spin = 0; spin < Ns; spin++) {
+            for (int color = 0; color < Nc; color++) {
+              for (int reim = 0; reim < Nz; reim++) {
+                const uint64_t q_off =
+                    lane + reim * SOALEN + spin * SOALEN * Nz + color * SOALEN * Nz * Ns;
+                const uint64_t tm_off = reim + color * Nz + tm_spin[spin] * Nz * Nc;
+
+                dst[q_off] = QPhiX::rep<FT, double>(sign[spin] * src[tm_off]);
+              }
+            }
+          }
+        }  // z
+      }  // y
+    }  // x
+  }  // t
+
+  const double elapsed = gettime() - t_begin;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_to_QPhiX: %f secs\n",
+                        elapsed);
+  }
+}
+
+// Reorder a cb0 and cb1 QPhiX spinor to a full tmLQCD spinor, scaling every component by normFac
+template <typename FT, int VECLEN, int SOALEN, bool compress12>
+void reorder_spinor_from_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
+                               double *tm_spinor, FT const *qphix_spinor_cb0,
+                               FT const *qphix_spinor_cb1, double normFac = 1.0) {
+  const double startTime = gettime();
+
+  // Number of elements in spin, color & complex
+  const int Ns = 4;
+  const int Nc = 3;
+  const int Nz = 2;
+
+  // Geometric parameters for QPhiX data layout
+  const auto nVecs = geom.nVecs();
+  const auto Pxy = geom.getPxy();
+  const auto Pxyz = geom.getPxyz();
+
+  // This is needed to translate between the different
+  // gamma bases tmlQCD and QPhiX are using
+  const int change_sign[4] = {1, -1, -1, 1};
+  const int change_spin[4] = {3, 2, 1, 0};
+
+// This will loop over the entire lattice and calculate
+// the array and internal indices for both tmlQCD & QPhiX
+#pragma omp parallel for collapse(4)
+  for (uint64_t t = 0; t < T; t++)
+    for (uint64_t x = 0; x < LX; x++)
+      for (uint64_t y = 0; y < LY; y++)
+        for (uint64_t z = 0; z < LZ; z++) {
+          // These are the QPhiX SIMD vector in checkerboarded x direction
+          // (up to LX/2) and the internal position inside the SIMD vector
+          const uint64_t SIMD_vector = (x / 2) / SOALEN;
+          const uint64_t x_internal = (x / 2) % SOALEN;
+
+          // Calculate the array index in tmlQCD & QPhiX,
+          // given a global lattice index (t,x,y,z)
+          const uint64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
+          const uint64_t tm_idx = g_ipt[t][x][y][z];
+
+          // Calculate base point for every spinor field element (tmlQCD) or
+          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
+          // which will depend on the checkerboard (cb)
+          const FT *in;
+          if ((t + x + y + z) & 1)
+            in = qphix_spinor_cb1 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb1
+          else
+            in = qphix_spinor_cb0 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb0
+          double *out = tm_spinor + Ns * Nc * Nz * tm_idx;
+
+          // Copy the internal elements, performing a gamma basis transformation
+          for (int spin = 0; spin < Ns; spin++)  // tmlQCD spin index
+            for (int color = 0; color < Nc; color++)
+              for (int reim = 0; reim < Nz; reim++)  // RE or IM (named reim so the lattice z is not shadowed)
+              {
+                const uint64_t qId = x_internal + reim * SOALEN + change_spin[spin] * SOALEN * Nz +
+                                     color * SOALEN * Nz * Ns;
+                const uint64_t tId = reim + color * Nz + spin * Nz * Nc;
+                // convert FT -> double first and scale afterwards, so the product is not narrowed back to FT
+                out[tId] = normFac * change_sign[spin] * QPhiX::rep<double, FT>(in[qId]);
+              }
+
+        }  // volume
+
+  const double diffTime = gettime() - startTime;
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_from_QPhiX: %f secs\n",
+                        diffTime);
+  }
+}
+
+// Pack the clover field and its inverses for the ND twisted-clover operator
+// into QPhiX layout; the elapsed time is printed when g_debug_level > 1.
+// `clov` is packed on checkerboard `cb`, `invclov_odiag` and `full_invclov` on
+// `1 - cb`. The *_inner buffers (on `geom_inner`) are filled only if `pack_inner`.
+// NOTE(review): each sw_invert_* call presumably refreshes the global inverse
+// that the following reorder_clover_to_QPhiX(..., true) reads, so the order of
+// the stages must be kept -- confirm against the sw_invert_* implementations.
+template <typename FT, int V, int S, bool compress12, typename FT_inner, int V_inner, int S_inner,
+          bool compress12_inner>
+void pack_nd_clover(
+    QPhiX::Geometry<FT, V, S, compress12> &geom,
+    QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner> &geom_inner,
+    typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock *full_invclov[2],
+    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *invclov_odiag,
+    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *clov,
+    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock
+        *full_invclov_inner[2],
+    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock
+        *invclov_odiag_inner,
+    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock *clov_inner,
+    const int cb, bool pack_inner) {
+  double start = gettime();
+  reorder_clover_to_QPhiX(geom, clov, cb, false);
+  if (pack_inner) {
+    reorder_clover_to_QPhiX(geom_inner, clov_inner, cb, false);
+  }
+
+  sw_invert_epsbar(g_epsbar);
+  reorder_clover_to_QPhiX(geom, invclov_odiag, 1 - cb, true, true);
+  if (pack_inner) {
+    reorder_clover_to_QPhiX(geom_inner, invclov_odiag_inner, 1 - cb, true, true);
+  }
+
+  // no minus sign here, the difference in the sign of gamma5
+  // is taken care of internally
+  sw_invert_mubar(g_mubar);
+  reorder_clover_to_QPhiX(geom, full_invclov, 1 - cb, true);
+  if (pack_inner) {
+    reorder_clover_to_QPhiX(geom_inner, full_invclov_inner, 1 - cb, true);
+  }
+
+  sw_invert_nd(g_mubar * g_mubar - g_epsbar * g_epsbar);
+
+  if (g_debug_level > 1) {
+    QPhiX::masterPrintf("# QPHIX-interface: ND TMClover clover-field packing took %.4lf seconds\n",
+                        gettime() - start);
+  }
+}
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// Apply the full QPhiX fermion matrix to checkerboarded tm spinors
+// template <typename FT, int V, int S, bool compress>
+// void Mfull_helper(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
+//                  const op_type_t op_type) {
+//  // TODO: this should use handles for gauge and spinors because these are definitely temporary
+//  // objects
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
+//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
+//
+//  if (g_debug_level > 1) tmlqcd::printQphixDiagnostics(V, S, compress, V, S, compress);
+//
+//  double coeff_s = (FT)(1);
+//  double coeff_t = (FT)(1);
+//
+//  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
+//                                           MinCt);
+//
+//  // Wilson mass
+//  double mass = 1 / (2.0 * g_kappa) - 4;
+//
+//  tmlqcd::Dslash<FT, V, S, compress> *polymorphic_dslash;
+//
+//  QGauge *u_packed[2];
+//  QSpinor *qphix_in[2];
+//  QSpinor *qphix_out[2];
+//
+//  QClover *clover[2];
+//  QClover *inv_clover[2];
+//
+//  QFullClover *inv_fullclover[2][2];
+//
+//  QSpinor *tmp_spinor = (QSpinor *)geom.allocCBFourSpinor();
+//  for (int cb : {0, 1}) {
+//    u_packed[cb] = (QGauge *)geom.allocCBGauge();
+//    qphix_in[cb] = (QSpinor *)geom.allocCBFourSpinor();
+//    qphix_out[cb] = (QSpinor *)geom.allocCBFourSpinor();
+//    clover[cb] = nullptr;
+//    inv_clover[cb] = nullptr;
+//    for (int fl : {0, 1}) {
+//      inv_fullclover[cb][fl] = nullptr;
+//    }
+//  }
+//  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
+//
+//  if (op_type == WILSON) {
+//    polymorphic_dslash = new tmlqcd::WilsonDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, use_tbc, tbc_phases);
+//  } else if (op_type == TMWILSON) {
+//    polymorphic_dslash = new tmlqcd::WilsonTMDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), use_tbc, tbc_phases);
+//  } else if (op_type == CLOVER && fabs(g_mu) <= DBL_EPSILON) {
+//    for (int cb : {0, 1}) {
+//      clover[cb] = (QClover *)geom.allocCBClov();
+//      inv_clover[cb] = (QClover *)geom.allocCBClov();
+//
+//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
+//      sw_invert(cb, 0);
+//      reorder_clover_to_QPhiX(geom, inv_clover[cb], cb, true);
+//    }
+//
+//    polymorphic_dslash = new tmlqcd::WilsonClovDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, clover, inv_clover, use_tbc, tbc_phases);
+//
+//  } else if (op_type == CLOVER && fabs(g_mu) > DBL_EPSILON) {
+//    for (int cb : {0, 1}) {
+//      clover[cb] = (QClover *)geom.allocCBClov();
+//      for (int fl : {0, 1}) {
+//        inv_fullclover[cb][fl] = (QFullClover *)geom.allocCBFullClov();
+//      }
+//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
+//      sw_invert(cb, g_mu);
+//      reorder_clover_to_QPhiX(geom, inv_fullclover[cb], cb, true);
+//    }
+//
+//    polymorphic_dslash = new tmlqcd::WilsonClovTMDslash<FT, V, S, compress>(
+//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), clover,
+//        inv_fullclover, use_tbc, tbc_phases);
+//
+//  } else {
+//    QPhiX::masterPrintf("tmlqcd::Mfull_helper; No such operator type: %d\n", op_type);
+//    abort();
+//  }
+//
+////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
+////                              qphix_in[cb_even], cb_even);
+////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in),
+////                              qphix_in[cb_odd], cb_odd);
+//  reorder_eo_spinor_to_QPhiX(geom, Even_in,
+//                             qphix_in[cb_even], cb_even);
+//  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_in[cb_odd],
+//                             cb_odd);
+//  // Apply QPhiX Mfull
+//  polymorphic_dslash->plain_dslash(qphix_out[cb_odd], qphix_in[cb_even], u_packed[cb_odd],
+//                                   /* isign == non-conjugate */ 1, cb_odd);
+//  polymorphic_dslash->plain_dslash(qphix_out[cb_even], qphix_in[cb_odd], u_packed[cb_even],
+//                                   /* isign == non-conjugate */ 1, cb_even);
+//  for (int cb : {0, 1}) {
+//    polymorphic_dslash->A_chi(tmp_spinor, qphix_in[cb], 1, cb);
+//    QPhiX::aypx(-0.5, tmp_spinor, qphix_out[cb], geom, 1);
+//  }
+//
+//  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_out[cb_even],
+//                               cb_even, 2.0 * g_kappa);
+//  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_out[cb_odd], cb_odd,
+//                               2.0 * g_kappa);
+//
+//  geom.free(tmp_spinor);
+//  for (int cb : {0, 1}) {
+//    geom.free(u_packed[cb]);
+//    geom.free(qphix_in[cb]);
+//    geom.free(qphix_out[cb]);
+//    geom.free(clover[cb]);
+//    geom.free(inv_clover[cb]);
+//    for (int fl : {0, 1}) {
+//      geom.free(inv_fullclover[cb][fl]);
+//    }
+//  };
+//  delete (polymorphic_dslash);
+//}
+
+// Templated even-odd preconditioned solver using QPhiX Library
+template <typename FT, int V, int S, bool compress, typename FT_inner = FT, int V_inner = V,
+          int S_inner = S, bool compress_inner = compress>
+int invert_eo_qphix_helper(std::vector<std::vector<spinor *> > &tmlqcd_odd_out,
+                           std::vector<std::vector<spinor *> > &tmlqcd_odd_in,
+                           const double target_precision, const int max_iter, const int solver_flag,
+                           solver_params_t solver_params, const int num_flavour) {
+  // TODO: it would perhaps be beneficial to keep the fields resident
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
+  typedef typename QPhiX::FourSpinorHandle<FT, V, S, compress> QSpinorHandle;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
+  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
+
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::SU3MatrixBlock
+      QGauge_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FourSpinorBlock
+      QSpinor_inner;
+  typedef typename QPhiX::FourSpinorHandle<FT_inner, V_inner, S_inner, compress_inner>
+      QSpinorHandle_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::CloverBlock
+      QClover_inner;
+  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FullCloverBlock
+      QFullClover_inner;
+
+  /************************
+   *                      *
+   *    SETUP GEOMETRY    *
+   *                      *
+   ************************/
+
+  if (g_debug_level > 1) {
+    tmlqcd::printQphixDiagnostics(V, S, compress, V_inner, S_inner, compress_inner);
+  }
+
+  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
+                                           MinCt);
+
+  // we always create the inner geometry, the overhead should be small...
+  QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner> geom_inner(
+      subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ, MinCt);
+
+  // Set number of BLAS threads by hand.
+  // In case some implements the tune routines in QPhiX
+  // this may be updated...
+  QPhiX::masterPrintf("# Setting number of BLAS threads...\n");
+  const int n_blas_simt = N_simt;
+  QPhiX::masterPrintf("# ...done.\n");
+
+  // Anisotropy Coefficents
+  const double coeff_s = 1.0;
+  const double coeff_t = 1.0;
+
+  // The Wilson mass
+  const double mass = 1.0 / (2.0 * g_kappa) - 4.0;
+
+  // Set variables need for solve
+  bool verbose = g_debug_level > 2 ? true : false;
+  int niters = -1;
+  int niters2 = 0;
+  double rsd_final = -1.0;
+  uint64_t site_flops = 0;
+  uint64_t site_flops2 = 0;
+  uint64_t mv_apps = 0;
+  uint64_t mv_apps2 = 0;
+
+  double start_time;
+  double end_time;
+
+  // support for multi-shift solves via the length of the output vector,
+  // which counts the shifts on the outer index and the flavour on the inner index
+  const int num_shifts = tmlqcd_odd_out.size();
+  std::vector<double> shifts;
+  shifts.resize(num_shifts);
+  std::vector<double> RsdTargetArr;
+  RsdTargetArr.resize(num_shifts);
+  std::vector<double> RsdFinalArr;
+  RsdFinalArr.resize(num_shifts);
+
+  double rescale = 0.5 / g_kappa;
+  // the inverse of M M^dag, as required for the HMC, comes with a factor of alpha^2
+  if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+    rescale *= rescale;
+  }
+
+  std::vector<QSpinorHandle> q_spinor_handles;
+
+  QGauge *u_packed[2] = {nullptr, nullptr};
+  QGauge_inner *u_packed_inner[2] = {nullptr, nullptr};
+  for (int cb : {0, 1}) {
+    u_packed[cb] = (QGauge *)geom.allocCBGauge();
+  }
+  // Reorder (global) input gauge field from tmLQCD to QPhiX
+  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
+
+  // for mixed solvers, we also need the gauge field in the inner precision
+  if (solver_is_mixed(solver_flag)) {
+    for (int cb : {0, 1}) {
+      u_packed_inner[cb] = (QGauge_inner *)geom_inner.allocCBGauge();
+    }
+    reorder_gauge_to_QPhiX(geom_inner, u_packed_inner[cb_even], u_packed_inner[cb_odd]);
+  }
+
+  if (num_flavour == 1) {
+    constexpr int nf = 1;
+    std::vector<QSpinor *> qphix_in;
+    qphix_in.resize(1);
+    std::vector<QSpinor *> qphix_out;
+    qphix_out.resize(num_shifts);
+    QSpinor *qphix_buffer;
+
+    QClover *qphix_clover = nullptr;
+    QClover *qphix_inv_clover = nullptr;
+
+    QClover_inner *qphix_clover_inner = nullptr;
+    QClover_inner *qphix_inv_clover_inner = nullptr;
+
+    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
+
+    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
+
+    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+    qphix_in[0] = q_spinor_handles.back().get();
+
+    for (int shift = 0; shift < num_shifts; shift++) {
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_out[shift] = q_spinor_handles.back().get();
+    }
+
+    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+    qphix_buffer = q_spinor_handles.back().get();
+
+    QPhiX::EvenOddLinearOperator<FT, V, S, compress> *FermionMatrixQPhiX = nullptr;
+    QPhiX::EvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
+        *InnerFermionMatrixQPhiX = nullptr;
+    if ((fabs(g_mu) > DBL_EPSILON) && g_c_sw > DBL_EPSILON) {  // TWISTED-MASS-CLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      for (int fl : {0, 1}) {
+        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
+      }
+      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
+      reorder_clover_to_QPhiX(geom, qphix_inv_fullclover, cb_even, true);
+
+      QPhiX::masterPrintf("# Creating QPhiX Twisted Clover Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddTMCloverOperator<FT, V, S, compress>(
+          u_packed, qphix_clover, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t,
+          use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
+      if (solver_is_mixed(solver_flag)) {
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        for (int fl : {0, 1}) {
+          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
+        }
+        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
+        reorder_clover_to_QPhiX(geom_inner, qphix_inv_fullclover_inner, cb_even, true);
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddTMCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                u_packed_inner, qphix_clover_inner, qphix_inv_fullclover_inner, &geom_inner,
+                t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+    } else if (fabs(g_mu) > DBL_EPSILON) {  // TWISTED-MASS
+      const double TwistedMass = -g_mu / (2.0 * g_kappa);
+      QPhiX::masterPrintf("# Creating QPhiX Twisted Mass Wilson Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddTMWilsonOperator<FT, V, S, compress>(
+          mass, TwistedMass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      QPhiX::masterPrintf("# ...done.\n");
+      if (solver_is_mixed(solver_flag)) {
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddTMWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                mass, TwistedMass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t,
+                use_tbc, tbc_phases);
+      }
+    } else if (g_c_sw > DBL_EPSILON) {  // WILSON CLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      qphix_inv_clover = (QClover *)geom.allocCBClov();
+
+      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
+      reorder_clover_to_QPhiX(geom, qphix_inv_clover, cb_even, true);
+
+      QPhiX::masterPrintf("# Creating QPhiX Wilson Clover Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddCloverOperator<FT, V, S, compress>(
+          u_packed, qphix_clover, qphix_inv_clover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
+          tbc_phases, -0.5 * g_mu3 / g_kappa);
+      if (solver_is_mixed(solver_flag)) {
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        qphix_inv_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
+        reorder_clover_to_QPhiX(geom_inner, qphix_inv_clover_inner, cb_even, true);
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                u_packed_inner, qphix_clover_inner, qphix_inv_clover_inner, &geom_inner, t_boundary,
+                coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * g_mu3 / g_kappa);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+
+    } else {  // WILSON
+      QPhiX::masterPrintf("# Creating QPhiX Wilson Fermion Matrix...\n");
+      FermionMatrixQPhiX = new QPhiX::EvenOddWilsonOperator<FT, V, S, compress>(
+          mass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      if (solver_is_mixed(solver_flag)) {
+        InnerFermionMatrixQPhiX =
+            new QPhiX::EvenOddWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                mass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc,
+                tbc_phases);
+      }
+      QPhiX::masterPrintf("# ...done.\n");
+    }
+
+    // Create a Linear Solver Object
+    QPhiX::AbstractSolver<FT, V, S, compress> *SolverQPhiX = nullptr;
+    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner> *InnerSolverQPhiX = nullptr;
+    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *MultiSolverQPhiX = nullptr;
+    if (solver_flag == DUMMYHERMTEST) {
+      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
+      SolverQPhiX =
+          new QPhiX::InvDummyHermTest<FT, V, S, compress,
+                                      typename QPhiX::EvenOddLinearOperator<FT, V, S, compress> >(
+              *FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == CG) {
+      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
+      SolverQPhiX = new QPhiX::InvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == BICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating BiCGStab solver...\n");
+      SolverQPhiX = new QPhiX::InvBiCGStab<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == MIXEDCG) {
+      // TODO: probably need to adjust inner solver iterations here...
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
+      InnerSolverQPhiX = new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner>(
+          *InnerFermionMatrixQPhiX, max_iter);
+      const bool MMdag = true;
+      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
+                                                      S_inner, compress_inner, MMdag>(
+          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == MIXEDBICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision BICGCGSTAB solver...\n");
+      InnerSolverQPhiX = new QPhiX::InvBiCGStab<FT_inner, V_inner, S_inner, compress_inner>(
+          *InnerFermionMatrixQPhiX, max_iter);
+      const bool MMdag = false;
+      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
+                                                      S_inner, compress_inner, MMdag>(
+          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == CGMMS) {
+      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver ...\n");
+      MultiSolverQPhiX =
+          new QPhiX::MInvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter, num_shifts);
+    } else {
+      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
+      QPhiX::masterPrintf(" Aborting...\n");
+      abort();
+    }
+    QPhiX::masterPrintf("# ...done.\n");
+
+    //     reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
+    //     *const>(tmlqcd_odd_in[0][0]),
+    //                                qphix_in[0], cb_odd);
+    reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][0], qphix_in[0], cb_odd);
+    QPhiX::masterPrintf("# Calling the solver...\n");
+
+    // Set the right precision for the QPhiX solver
+    // we get target_precision externally and and is given such, that it's either
+    // already relative or absolute
+    // Most QPhiX solvers allow setting absolute or relative residual
+    // by passing an appropriate flag, but this is not true for the multi-shift solver.
+    // As a result, we follow that solver and call ALL solvers with
+    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
+    double rhs_norm2 = 1.0;
+    QPhiX::norm2Spinor(rhs_norm2, qphix_in[0], geom, n_blas_simt);
+    const double RsdTarget = sqrt(target_precision / rhs_norm2);
+
+    // Calling the solver
+    start_time = gettime();
+    if (solver_flag == DUMMYHERMTEST) {
+      random_spinor_field_eo(tmlqcd_odd_out[0][0], 0, RN_GAUSS);
+      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][0], qphix_buffer, cb_odd);
+      for (int isign : {-1, 1}) {
+        (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
+                       isign, verbose, cb_odd, QPhiX::RELATIVE);
+      }
+      QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+    } else if (solver_flag == CG || solver_flag == MIXEDCG || solver_flag == RGMIXEDCG) {
+      // USING CG:
+      // We are solving
+      //   M M^dagger qphix_buffer = qphix_in_prepared
+      // here, that is, isign = -1 for the QPhiX CG solver.
+      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
+                     -1, verbose, cb_odd, QPhiX::RELATIVE);
+      // After that. if required by the solution type, multiply with M^dagger:
+      //   qphix_out[1] = M^dagger ( M^dagger^-1 M^-1 ) qphix_in_prepared
+      if (solver_params.solution_type == TM_SOLUTION_M) {
+        (*FermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
+        mv_apps++;
+      } else {
+        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == CGMMS) {
+      // TODO: handle the residuals properly
+      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMS: shifts: \n");
+      for (int shift = 0; shift < num_shifts; shift++) {
+        RsdTargetArr[shift] = RsdTarget;
+        RsdFinalArr[shift] = -1.0;
+        shifts[shift] =
+            solver_params.shifts[shift] * solver_params.shifts[shift] / (4 * g_kappa * g_kappa);
+        if (g_debug_level > 2)
+          QPhiX::masterPrintf("# QPHIX CGMMS: shift[%d] = %.6e\n", shift, shifts[shift]);
+      }
+      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
+      (*MultiSolverQPhiX)(qphix_out.data(), qphix_in[0], num_shifts, shifts.data(),
+                          RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops, mv_apps, -1,
+                          verbose);
+      rsd_final = RsdFinalArr[0];
+    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
+      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
+                     1, verbose, cb_odd, QPhiX::RELATIVE);
+      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
+      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+        (*SolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
+                       mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
+      } else {
+        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    }
+    end_time = gettime();
+
+    for (int shift = 0; shift < num_shifts; shift++) {
+      reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][0], qphix_out[shift], cb_odd,
+                                   rescale);
+    }
+
+    QPhiX::masterPrintf("# QPHIX: ...done.\n");
+    QPhiX::masterPrintf("# QPHIX: Cleaning up\n");
+    delete (FermionMatrixQPhiX);
+    delete (InnerFermionMatrixQPhiX);
+    delete (SolverQPhiX);
+    delete (InnerSolverQPhiX);
+    delete (MultiSolverQPhiX);
+    // on KNL, it seems that munmap is problematic, so we check for nullptr
+    if (qphix_clover) geom.free(qphix_clover);
+    if (qphix_inv_clover) geom.free(qphix_inv_clover);
+    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
+    if (qphix_inv_clover_inner) geom_inner.free(qphix_inv_clover_inner);
+    for (int fl : {0, 1}) {
+      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
+      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
+    }
+    QPhiX::masterPrintf("# QPHIX: ...done.\n\n");
+
+  } else if (num_flavour == 2) {
+    // for explicit template arguments
+    constexpr int nf = 2;
+
+    QSpinor *qphix_in[2];
+    std::vector<QSpinor **> qphix_out;
+    qphix_out.resize(num_shifts);
+    for (int shift = 0; shift < num_shifts; shift++) {
+      qphix_out[shift] = new QSpinor *[2];
+      for (int fl : {0, 1}) {
+        q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+        qphix_out[shift][fl] = q_spinor_handles.back().get();
+      }
+    }
+
+    QSpinor *qphix_buffer[2];
+    for (int fl : {0, 1}) {
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_in[fl] = q_spinor_handles.back().get();
+      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
+      qphix_buffer[fl] = q_spinor_handles.back().get();
+    }
+
+    QClover *qphix_clover = nullptr;
+    QClover_inner *qphix_clover_inner = nullptr;
+
+    QClover *qphix_invclov_odiag = nullptr;
+    QClover_inner *qphix_invclov_odiag_inner = nullptr;
+
+    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
+    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
+
+    QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> *TwoFlavFermionMatrixQPhiX = nullptr;
+    QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
+        *InnerTwoFlavFermionMatrixQPhiX = nullptr;
+
+    if (g_c_sw > DBL_EPSILON) {  // DBCLOVER
+      qphix_clover = (QClover *)geom.allocCBClov();
+      qphix_invclov_odiag = (QClover *)geom.allocCBClov();
+      if (solver_is_mixed(solver_flag)) {
+        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
+        qphix_invclov_odiag_inner = (QClover_inner *)geom_inner.allocCBClov();
+      }
+
+      for (int fl : {0, 1}) {
+        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
+        if (solver_is_mixed(solver_flag)) {
+          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
+        }
+      }
+
+      pack_nd_clover(geom, geom_inner, qphix_inv_fullclover, qphix_invclov_odiag, qphix_clover,
+                     qphix_inv_fullclover_inner, qphix_invclov_odiag_inner, qphix_clover_inner,
+                     cb_odd, solver_is_mixed(solver_flag));
+
+      QPhiX::masterPrintf(
+          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Clover Fermion Matrix...\n");
+      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMCloverReuseOperator<FT, V, S, compress>(
+          -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, qphix_clover,
+          qphix_invclov_odiag, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
+          tbc_phases);
+      if (solver_is_mixed(solver_flag)) {
+        InnerTwoFlavFermionMatrixQPhiX =
+            new QPhiX::EvenOddNDTMCloverReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
+                qphix_clover_inner, qphix_invclov_odiag_inner, qphix_inv_fullclover_inner,
+                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    } else {  // DBTMWILSON
+      QPhiX::masterPrintf(
+          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Mass Fermion Matrix...\n");
+      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMWilsonReuseOperator<FT, V, S, compress>(
+          mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, &geom, t_boundary,
+          coeff_s, coeff_t, use_tbc, tbc_phases);
+      if (solver_is_mixed(solver_flag)) {
+        InnerTwoFlavFermionMatrixQPhiX =
+            new QPhiX::EvenOddNDTMWilsonReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
+                mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
+                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
+      }
+    }
+
+    //
+    QPhiX::AbstractSolver<FT, V, S, compress, nf> *TwoFlavSolverQPhiX = nullptr;
+    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner, nf> *InnerTwoFlavSolverQPhiX =
+        nullptr;
+    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *TwoFlavMultiSolverQPhiX = nullptr;
+    if (solver_flag == DUMMYHERMTEST) {
+      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
+      TwoFlavSolverQPhiX = new QPhiX::InvDummyHermTest<
+          FT, V, S, compress, typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+          *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == CG) {
+      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
+      TwoFlavSolverQPhiX =
+          new QPhiX::InvCG<FT, V, S, compress,
+                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == BICGSTAB) {
+      QPhiX::masterPrintf("# QPHIX: Creating BiCGstab solver...\n");
+      TwoFlavSolverQPhiX =
+          new QPhiX::InvBiCGStab<FT, V, S, compress,
+                                 typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter);
+    } else if (solver_flag == MIXEDCG) {
+      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
+      InnerTwoFlavSolverQPhiX =
+          new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner,
+                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner,
+                                                                        compress_inner> >(
+              *InnerTwoFlavFermionMatrixQPhiX, max_iter);
+      const bool MMdag = true;
+      TwoFlavSolverQPhiX = new QPhiX::InvRichardsonMultiPrec<
+          FT, V, S, compress, FT_inner, V_inner, S_inner, compress_inner, MMdag,
+          typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+          *TwoFlavFermionMatrixQPhiX, *InnerTwoFlavSolverQPhiX, solver_params.mcg_delta, max_iter);
+    } else if (solver_flag == CGMMSND) {
+      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver...\n");
+      TwoFlavMultiSolverQPhiX =
+          new QPhiX::MInvCG<FT, V, S, compress,
+                            typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
+              *TwoFlavFermionMatrixQPhiX, max_iter, num_shifts);
+    } else {
+      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
+      QPhiX::masterPrintf(" Aborting...\n");
+      abort();
+    }
+    QPhiX::masterPrintf("# QPHIX: ...done.\n");
+
+    for (int fl : {0, 1}) {
+      //       reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
+      //       *const>(tmlqcd_odd_in[0][fl]),
+      //                                  qphix_in[fl], cb_odd);
+      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][fl], qphix_in[fl], cb_odd);
+    }
+
+    QPhiX::masterPrintf("# QPHIX: Calling the solver...\n");
+
+    // Set the right precision for the QPhiX solver
+    // we get target_precision externally, and it is given such that it's either
+    // already relative or absolute
+    // Most QPhiX solvers allow setting absolute or relative residual
+    // by passing an appropriate flag, but this is not true for the multi-shift solver.
+    // As a result, we follow that solver and call ALL solvers with
+    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
+    double rhs_norm2 = 1.0;
+    QPhiX::norm2Spinor<FT, V, S, compress, nf>(rhs_norm2, qphix_in, geom, n_blas_simt);
+    const double RsdTarget = sqrt(target_precision / rhs_norm2);
+
+    // Calling the solver
+    start_time = gettime();
+    if (solver_flag == DUMMYHERMTEST) {
+      for (int fl : {0, 1}) {
+        random_spinor_field_eo(tmlqcd_odd_out[0][fl], 0, RN_GAUSS);
+        reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][fl], qphix_buffer[fl], cb_odd);
+      }
+      for (int isign : {-1, 1}) {
+        (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
+                              mv_apps, isign, verbose, cb_odd, QPhiX::RELATIVE);
+      }
+      QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+    } else if (solver_flag == CG || solver_flag == MIXEDCG) {
+      // USING CG:
+      // We are solving
+      //   M M^dagger qphix_buffer = qphix_in_prepared
+      // here, that is, isign = -1 for the QPhiX CG solver.
+      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
+                            mv_apps, -1, verbose, cb_odd, QPhiX::RELATIVE);
+      // After that, if required by the solution type, multiply with M^dagger:
+      //   qphix_out[0] = M^dagger M^dagger^-1 M^-1 qphix_in_prepared
+      if (solver_params.solution_type == TM_SOLUTION_M) {
+        (*TwoFlavFermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
+        mv_apps++;
+      } else {
+        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
+      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
+                            mv_apps, 1, verbose, cb_odd, QPhiX::RELATIVE);
+      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
+      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+        (*TwoFlavSolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
+                              mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
+      } else {
+        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
+      }
+    } else if (solver_flag == CGMMSND) {
+      // TODO: handle the residuals properly
+      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMSND: shifts: \n");
+      // tmLQCD weights the operator with 1/maxev in the RHMC relative to the shifts
+      // we will do this externally on the inverse (in monomial_solve) and thus need to weight
+      // the shifts by maxev^2
+      const double maxev_sq = (1.0 / phmc_invmaxev) * (1.0 / phmc_invmaxev);
+      for (int shift = 0; shift < num_shifts; shift++) {
+        RsdTargetArr[shift] = RsdTarget;
+        RsdFinalArr[shift] = -1.0;
+        shifts[shift] = maxev_sq * solver_params.shifts[shift] * solver_params.shifts[shift] /
+                        (4 * g_kappa * g_kappa);
+        if (g_debug_level > 2) QPhiX::masterPrintf("# [%d] = %lf\n", shift, shifts[shift]);
+      }
+      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
+      (*TwoFlavMultiSolverQPhiX)(qphix_out.data(), qphix_in, num_shifts, shifts.data(),
+                                 RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops,
+                                 mv_apps, -1, verbose);
+      rsd_final = RsdFinalArr[0];
+    }
+    end_time = gettime();
+
+    for (int shift = 0; shift < num_shifts; shift++) {
+      for (int fl : {0, 1}) {
+        reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][fl], qphix_out[shift][fl], cb_odd,
+                                     rescale);
+      }
+    }
+
+    delete TwoFlavFermionMatrixQPhiX;
+    delete InnerTwoFlavFermionMatrixQPhiX;
+    delete InnerTwoFlavSolverQPhiX;
+    delete TwoFlavMultiSolverQPhiX;
+    delete TwoFlavSolverQPhiX;
+    for (int shift = 0; shift < num_shifts; shift++) {
+      delete[] qphix_out[shift];
+    }
+
+    if (qphix_clover) geom.free(qphix_clover);
+    if (qphix_invclov_odiag) geom.free(qphix_invclov_odiag);
+    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
+    if (qphix_invclov_odiag_inner) geom_inner.free(qphix_invclov_odiag_inner);
+    for (int fl : {0, 1}) {
+      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
+      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
+    }
+
+  } else {  // if(num_flavour)
+    // complain, this number of flavours is not valid
+  }  // if(num_flavour)
+
+  for (int cb : {0, 1}) {
+    if (u_packed[cb]) geom.free(u_packed[cb]);
+    if (u_packed_inner[cb]) geom_inner.free(u_packed_inner[cb]);
+  }
+
+  // FIXME: This should be called properly somewhere else
+  _endQphix();
+
+  QPhiX::masterPrintf("# ...done.\n\n");
+
+  uint64_t num_cb_sites = lattSize[0] / 2 * lattSize[1] * lattSize[2] * lattSize[3];
+  // FIXME: this needs to be adjusted depending on the operator used
+  uint64_t op_flops_per_site = 1320;
+  uint64_t total_flops =
+      (site_flops + site_flops2 + (2 * num_flavour * op_flops_per_site) * (mv_apps + mv_apps2)) *
+      num_cb_sites;
+  QPhiX::masterPrintf("# QPHIX: Solver Time = %g sec\n", (end_time - start_time));
+  QPhiX::masterPrintf("# QPHIX: Performance in GFLOPS = %g\n\n",
+                      1.0e-9 * total_flops / (end_time - start_time));
+
+  if (solver_is_mixed(solver_flag)) {
+    // the mixed solver reports the outer iterations, we would like to get
+    // some better total
+    niters = mv_apps / 2;
+    if (solver_flag == MIXEDBICGSTAB && solver_params.solution_type == TM_SOLUTION_M_MDAG) {
+      niters2 = mv_apps2 / 2;
+    }
+  }
+  // solver did not converge in maximum number of iterations
+  // FIXME: non-convergence does not work correctly yet
+  if ((niters + niters2) > max_iter) {
+    niters = -1;
+    niters2 = 0;
+  }
+  return (niters + niters2);
+}
+
+// Due to github issue #404, the helper functions to apply the full QPhiX operator
+// are currently disabled because they conflict with the new interfaces in QPhiX
+// itself. If required, these should be rewritten to use these interfaces
+// rather than the base classes in qphix_base_classes.hpp
+
+// Template wrapper for the Dslash operator call-able from C code
+// void Mfull_qphix(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
+//                 const op_type_t op_type) {
+//  tmlqcd::checkQphixInputParameters(qphix_input);
+//  // FIXME: two-row gauge compression and double precision hard-coded
+//  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
+//
+//  if (qphix_precision == QPHIX_DOUBLE_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_DP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_DP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN DOUBLE PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                          op_type);
+//    } else {
+//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                           op_type);
+//    }
+//  } else if (qphix_precision == QPHIX_FLOAT_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_SP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_SP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN SINGLE PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                         op_type);
+//    } else {
+//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                          op_type);
+//    }
+//  }
+// #if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+//  else if (qphix_precision == QPHIX_HALF_PREC) {
+//    if (QPHIX_SOALEN > VECLEN_HP) {
+//      QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
+//                          VECLEN_HP);
+//      abort();
+//    }
+//    QPhiX::masterPrintf("TESTING IN HALF PRECISION \n");
+//    if (compress12) {
+//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
+//                                                               op_type);
+//    } else {
+//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in,
+//      Odd_in,
+//                                                                op_type);
+//    }
+//  }
+// #endif
+//}
+
+// we have a unified interface for n-flavour inversions, but we need to provide wrappers
+// which can be called by the tmLQCD solver drivers for one and two-flavour inversions
+int invert_eo_qphix_oneflavour(spinor *Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
+                               const double precision, const int solver_flag, const int rel_prec,
+                               const solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression) {
+  const int num_flavour = 1;
+  const int num_shifts = 1;
+  std::vector<std::vector<spinor *> > Odd_out;
+  std::vector<std::vector<spinor *> > Odd_in;
+
+  Odd_out.resize(num_shifts);
+  Odd_out[0].resize(num_flavour);
+  Odd_in.resize(1);
+  Odd_in[0].resize(num_flavour);
+
+  Odd_in[0][0] = Odd_in_1f;
+  Odd_out[0][0] = Odd_out_1f;
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+int invert_eo_qphix_oneflavour_mshift(spinor **Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
+                                      const double precision, const int solver_flag,
+                                      const int rel_prec, const solver_params_t solver_params,
+                                      const SloppyPrecision sloppy,
+                                      const CompressionType compression) {
+  // even though the default is set to 1, guard against zeroes
+  const int num_shifts = solver_params.no_shifts == 0 ? 1 : solver_params.no_shifts;
+  const int num_flavour = 1;
+  std::vector<std::vector<spinor *> > Odd_out;
+  std::vector<std::vector<spinor *> > Odd_in;
+
+  Odd_out.resize(num_shifts);
+  Odd_in.resize(1);
+  Odd_in[0].resize(num_flavour);
+
+  Odd_in[0][0] = Odd_in_1f;
+  for (int shift = 0; shift < num_shifts; shift++) {
+    Odd_out[shift].resize(num_flavour);
+    Odd_out[shift][0] = Odd_out_1f[shift];
+  }
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+// Template wrapper for QPhiX solvers callable from C code, return number of iterations
+int invert_eo_qphix_twoflavour(spinor *Odd_out_s, spinor *Odd_out_c, spinor *Odd_in_s,
+                               spinor *Odd_in_c, const int max_iter, const double precision,
+                               const int solver_flag, const int rel_prec,
+                               const solver_params_t solver_params, const SloppyPrecision sloppy,
+                               const CompressionType compression) {
+  const int num_flavour = 2;
+  const int num_shifts = 1;
+  std::vector<std::vector<spinor *> > Odd_out;
+  std::vector<std::vector<spinor *> > Odd_in;
+
+  Odd_out.resize(num_shifts);
+  Odd_out[0].resize(num_flavour);
+  Odd_in.resize(1);
+  Odd_in[0].resize(num_flavour);
+
+  Odd_in[0][0] = Odd_in_s;
+  Odd_in[0][1] = Odd_in_c;
+
+  Odd_out[0][0] = Odd_out_s;
+  Odd_out[0][1] = Odd_out_c;
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+int invert_eo_qphix_twoflavour_mshift(spinor **Odd_out_s, spinor **Odd_out_c, spinor *Odd_in_s,
+                                      spinor *Odd_in_c, const int max_iter, const double precision,
+                                      const int solver_flag, const int rel_prec,
+                                      const solver_params_t solver_params,
+                                      const SloppyPrecision sloppy,
+                                      const CompressionType compression) {
+  // even though the default is set to 1, guard against zeroes
+  const int num_shifts = solver_params.no_shifts == 0 ? 1 : solver_params.no_shifts;
+  const int num_flavour = 2;
+  std::vector<std::vector<spinor *> > Odd_out;
+  std::vector<std::vector<spinor *> > Odd_in;
+
+  Odd_out.resize(num_shifts);
+  Odd_in.resize(1);
+  Odd_in[0].resize(num_flavour);
+
+  Odd_in[0][0] = Odd_in_s;
+  Odd_in[0][1] = Odd_in_c;
+
+  for (int shift = 0; shift < num_shifts; shift++) {
+    Odd_out[shift].resize(num_flavour);
+    Odd_out[shift][0] = Odd_out_s[shift];
+    Odd_out[shift][1] = Odd_out_c[shift];
+  }
+
+  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
+                                         rel_prec, solver_params, sloppy, compression, num_flavour);
+}
+
+// Precision/compression dispatcher for the templated QPhiX solvers (C++ linkage: takes
+// std::vector). Returns the number of iterations; the double vector input anticipates multi-rhs.
+int invert_eo_qphix_nflavour_mshift(std::vector<std::vector<spinor *> > &Odd_out,
+                                    std::vector<std::vector<spinor *> > &Odd_in,
+                                    const double precision, const int max_iter,
+                                    const int solver_flag, const int rel_prec,
+                                    solver_params_t solver_params, const SloppyPrecision sloppy,
+                                    const CompressionType compression, const int num_flavour) {
+  tmlqcd::checkQphixInputParameters(qphix_input);
+  double target_precision = precision;
+  double src_norm = 0.0;
+  for (int f = 0; f < num_flavour; ++f) {
+    src_norm += square_norm(Odd_in[0][f], VOLUME / 2, 1);
+  }
+  // we use "precision_lambda" to determine if a system can be solved in half or float
+  // precision (when a fixed-precision solver is used)
+  double precision_lambda = target_precision / src_norm;
+  if (rel_prec == 1) {
+    QPhiX::masterPrintf("# QPHIX: Using relative precision\n");
+    target_precision = precision * src_norm;
+    precision_lambda = precision;
+  }
+  QPhiX::masterPrintf("# QPHIX: precision_lambda: %g, target_precision: %g\n\n", precision_lambda,
+                      target_precision);
+
+  // mixed solvers require inner and outer precisions, which we specify explicitly here
+  if (solver_is_mixed(solver_flag)) {
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+    if (sloppy == SLOPPY_HALF) {
+      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_HP) {
+        QPhiX::masterPrintf(
+            "SOALEN=%d is greater than the half prec VECLEN=%d or the double prec VECLEN=%d\n",
+            QPHIX_SOALEN, VECLEN_HP, VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-HALF PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_HALF_PREC);
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, QPhiX::half, VECLEN_HP,
+                                      QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, QPhiX::half,
+                                      VECLEN_HP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else
+#else
+    if (sloppy == SLOPPY_HALF) {
+      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
+      abort();
+    } else
+#endif
+        if (sloppy == SLOPPY_SINGLE) {
+      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_SP) {
+        QPhiX::masterPrintf(
+            "SOALEN=%d is greater than the single prec VECLEN=%d or the double prec VECLEN=%d\n",
+            QPHIX_SOALEN, VECLEN_SP, VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-SINGLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_FLOAT_PREC);
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, float, VECLEN_SP,
+                                      QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, float, VECLEN_SP,
+                                      QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else {  // if(sloppy)
+      if (QPHIX_SOALEN > VECLEN_DP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE-DOUBLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_DOUBLE_PREC);
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    }  // if( sloppy )
+  } else {  // if( solver_is_mixed )
+#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
+    if (sloppy == SLOPPY_HALF || precision_lambda >= rsdTarget<QPhiX::half>::value) {
+      if (QPHIX_SOALEN > VECLEN_HP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_HP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING HALF PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_HALF_PREC);
+
+      if (compress12) {
+        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else
+#else
+    if (sloppy == SLOPPY_HALF) {
+      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
+      abort();
+    } else
+#endif
+        if (sloppy == SLOPPY_SINGLE || precision_lambda >= rsdTarget<float>::value) {
+      if (QPHIX_SOALEN > VECLEN_SP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_SP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING SINGLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_FLOAT_PREC);
+
+      if (compress12) {
+        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    } else {
+      if (QPHIX_SOALEN > VECLEN_DP) {
+        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
+                            VECLEN_DP);
+        abort();
+      }
+      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
+      QPhiX::masterPrintf("# USING DOUBLE PRECISION\n");
+      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC);
+
+      if (compress12) {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      } else {
+        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
+            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
+      }
+    }  // if( sloppy || target_precision )
+  }  // if ( solver_flag == *MIXEDCG )
+  return -1;
+}
+
+void tmlqcd::checkQphixInputParameters(const tm_QPhiXParams_t &params) {
+  if (params.MinCt == 0) {
+    QPhiX::masterPrintf("QPHIX Error: MinCt cannot be 0! Minimal value: 1. Aborting.\n");
+    abort();
+  }
+  if (params.By == 0 || params.Bz == 0) {
+    QPhiX::masterPrintf("QPHIX Error: By and Bz may not be 0! Minimal value: 1. Aborting.\n");
+    abort();
+  }
+  if (params.NCores * params.Sy * params.Sz != omp_num_threads) {
+    QPhiX::masterPrintf("QPHIX Error: NCores * Sy * Sz != ompnumthreads ! Aborting.\n");
+    abort();
+  }
+}
+
+void tmlqcd::printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner,
+                                   int SOALEN_inner, bool compress_inner) {
+  QPhiX::masterPrintf("# QphiX: VECLEN=%d SOALEN=%d VECLEN_inner=%d, SOALEN_inner=%d\n", VECLEN,
+                      SOALEN, VECLEN_inner, SOALEN_inner);
+
+  QPhiX::masterPrintf("# QphiX: Declared QMP Topology (xyzt):");
+  for (int mu = 0; mu < 4; mu++) QPhiX::masterPrintf(" %d", qmp_geom[mu]);
+  QPhiX::masterPrintf("\n");
+
+  QPhiX::masterPrintf("# QphiX: Mapping of dimensions QMP -> tmLQCD (xyzt):");
+  for (int mu = 0; mu < 4; mu++) QPhiX::masterPrintf(" %d->%d", mu, qmp_tm_map[mu]);
+  QPhiX::masterPrintf("\n");
+
+  QPhiX::masterPrintf("# QphiX: Global Lattice Size (xyzt) = ");
+  for (int mu = 0; mu < 4; mu++) {
+    QPhiX::masterPrintf(" %d", lattSize[mu]);
+  }
+  QPhiX::masterPrintf("\n");
+  QPhiX::masterPrintf("# QphiX: Local Lattice Size (xyzt) = ");
+  for (int mu = 0; mu < 4; mu++) {
+    QPhiX::masterPrintf(" %d", subLattSize[mu]);
+  }
+  QPhiX::masterPrintf("\n");
+  QPhiX::masterPrintf("# QphiX: Block Sizes: By= %d Bz=%d\n", By, Bz);
+  QPhiX::masterPrintf("# QphiX: Cores = %d\n", NCores);
+  QPhiX::masterPrintf("# QphiX: SMT Grid: Sy=%d Sz=%d\n", Sy, Sz);
+  QPhiX::masterPrintf("# QphiX: Pad Factors: PadXY=%d PadXYZ=%d\n", PadXY, PadXYZ);
+  QPhiX::masterPrintf("# QphiX: Threads_per_core = %d\n", N_simt);
+  QPhiX::masterPrintf("# QphiX: MinCt = %d\n", MinCt);
+  if (compress) {
+    QPhiX::masterPrintf("# QphiX: Using two-row gauge compression (compress12)\n");
+  }
+  if (compress_inner) {
+    QPhiX::masterPrintf("# QphiX: Inner solver using two-row gauge compression (compress12)\n");
+  }
+}
+
+void testSpinorPackers(spinor *Even_out, spinor *Odd_out, const spinor *const Even_in,
+                       const spinor *const Odd_in) {
+  tmlqcd::checkQphixInputParameters(qphix_input);
+  // FIXME: two-row gauge compression and double precision hard-coded
+  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
+
+  QPhiX::Geometry<double, VECLEN_DP, QPHIX_SOALEN, true> geom(subLattSize, By, Bz, NCores, Sy, Sz,
+                                                              PadXY, PadXYZ, MinCt);
+
+  auto qphix_cb_even = QPhiX::makeFourSpinorHandle(geom);
+  auto qphix_cb_odd = QPhiX::makeFourSpinorHandle(geom);
+
+  spinor **tmp;
+  init_solver_field(&tmp, VOLUME / 2, 2);
+
+  // Round trip: pack both checkerboards into the QPhiX layout and unpack them again
+  // (scale factor 1.0). The geometry's vector length must match its floating-point type,
+  // hence VECLEN_DP for the double-precision geometry above, as in every other
+  // double-precision instantiation in this file.
+  reorder_eo_spinor_to_QPhiX(geom, Even_in, qphix_cb_even.get(), cb_even);
+  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_cb_odd.get(), cb_odd);
+
+  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_cb_even.get(), cb_even, 1.0);
+  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_cb_odd.get(), cb_odd, 1.0);
+
+  diff(tmp[0], Even_out, Even_in, VOLUME / 2);
+  diff(tmp[1], Odd_out, Odd_in, VOLUME / 2);
+  double l2norm = square_norm(tmp[0], VOLUME / 2, 1) + square_norm(tmp[1], VOLUME / 2, 1);
+  QPhiX::masterPrintf("QPHIX eo spinor packer back and forth difference L2 norm: %lf\n", l2norm);
+  finalize_solver(tmp, 2);
+}
diff --git a/src/lib/qphix/qphix_interface.hpp b/src/lib/qphix/qphix_interface.hpp
new file mode 100644
index 000000000..b487eda66
--- /dev/null
+++ b/src/lib/qphix/qphix_interface.hpp
@@ -0,0 +1,51 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2017 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#pragma once
+
+#include "global.h"
+#include "qphix_types.h"
+
+#ifdef __cplusplus /* If this is a C++ compiler, use C linkage */
+extern "C" {
+#endif
+
+#include "misc_types.h"
+#include "operator_types.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "su3.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#include <vector>
+
+int invert_eo_qphix_nflavour_mshift(std::vector< std::vector< spinor* > > &Odd_out, 
+                                    std::vector< std::vector< spinor* > > &Odd_in, 
+                                    const double precision,
+                                    const int max_iter,
+                                    const int solver_flag, 
+                                    const int rel_prec,
+                                    solver_params_t solver_params,
+                                    const SloppyPrecision sloppy, const CompressionType compression,
+                                    const int num_flavour);
\ No newline at end of file
diff --git a/src/lib/qphix/qphix_interface_utils.hpp b/src/lib/qphix/qphix_interface_utils.hpp
new file mode 100644
index 000000000..56d8afe56
--- /dev/null
+++ b/src/lib/qphix/qphix_interface_utils.hpp
@@ -0,0 +1,33 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2015 Mario Schroeck
+ *               2016 Peter Labus
+ *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#pragma once
+
+#include "qphix_types.h"
+
+namespace tmlqcd {
+
+void checkQphixInputParameters(const tm_QPhiXParams_t &params);
+void printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner, int SOALEN_inner, bool compress_inner);
+
+}  // namespace tmlqcd
diff --git a/qphix_interface.h b/src/lib/qphix_interface.h
similarity index 100%
rename from qphix_interface.h
rename to src/lib/qphix_interface.h
diff --git a/qphix_types.h b/src/lib/qphix_types.h
similarity index 100%
rename from qphix_types.h
rename to src/lib/qphix_types.h
diff --git a/qphix_veclen.h b/src/lib/qphix_veclen.h
similarity index 100%
rename from qphix_veclen.h
rename to src/lib/qphix_veclen.h
diff --git a/quda_dummy_types.h b/src/lib/quda_dummy_types.h
similarity index 100%
rename from quda_dummy_types.h
rename to src/lib/quda_dummy_types.h
diff --git a/src/lib/quda_gauge_paths.inc b/src/lib/quda_gauge_paths.inc
new file mode 100644
index 000000000..d2c898e6c
--- /dev/null
+++ b/src/lib/quda_gauge_paths.inc
@@ -0,0 +1,158 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2021 Bartosz Kostrzewa, Ferenc Pittler, Simone Bacchio
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ ***********************************************************************/
+
+const int plaq_rect_length[24] = {  /* matches the number of explicit entries in each row of plaq_rect_path[*] */
+    3, 3, 3, 3, 3, 3,  /* rows 0-5: 3 entries */
+    5, 5, 5, 5, 5, 5,  /* rows 6-23: 5 entries */
+    5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5,
+  };  /* NOTE(review): non-static, so external linkage when compiled as C -- include from one translation unit only */
+
+const int plaq_rect_path[4][24][5] = {  /* [outer index][path][entry]; rows with 3 initializers leave entries 3-4 implicitly zero */
+    { {1, 7, 6 },  /* outer index 0: every path contains 7; rows 0-5 have 3 entries */
+      {6, 7, 1 },
+      {2, 7, 5 },
+      {5, 7, 2 },
+      {3, 7, 4 },
+      {4, 7, 3 }, 
+      {1, 1, 7, 6, 6 },  /* rows 6-23 have 5 entries */
+      {6, 6, 7, 1, 1 },
+      {2, 2, 7, 5, 5 },
+      {5, 5, 7, 2, 2 },
+      {3, 3, 7, 4, 4 },
+      {4, 4, 7, 3, 3 },
+      {0, 1, 7, 7, 6 },
+      {6, 7, 7, 1, 0 },
+      {0, 2, 7, 7, 5 },
+      {5, 7, 7, 2, 0 },
+      {0, 3, 7, 7, 4 },
+      {4, 7, 7, 3, 0 },
+      {0, 4, 7, 7, 3 },
+      {3, 7, 7, 4, 0 },
+      {0, 5, 7, 7, 2 },
+      {2, 7, 7, 5, 0 },
+      {0, 6, 7, 7, 1 },
+      {1, 7, 7, 6, 0 } },
+    { { 2, 6, 5 },  /* outer index 1: every path contains 6 */
+      { 5, 6, 2 },
+      { 3, 6, 4 },
+      { 4, 6, 3 },
+      { 0, 6, 7 },
+      { 7, 6, 0 },
+      { 1, 2, 6, 6, 5 },
+      { 2, 6, 6, 5, 1 },
+      { 5, 6, 6, 2, 1 },
+      { 1, 5, 6, 6, 2 },
+      { 1, 3, 6, 6, 4 },
+      { 3, 6, 6, 4, 1 },
+      { 4, 6, 6, 3, 1 },
+      { 1, 4, 6, 6, 3 },
+      { 1, 0, 6, 6, 7 },
+      { 0, 6, 6, 7, 1 },
+      { 7, 6, 6, 0, 1 },
+      { 1, 7, 6, 6, 0 },
+      { 5, 5, 6, 2, 2 },
+      { 2, 2, 6, 5, 5 },
+      { 4, 4, 6, 3, 3 },
+      { 3, 3, 6, 4, 4 },
+      { 7, 7, 6, 0, 0 },
+      { 0, 0, 6, 7, 7 } },
+    { {3, 5, 4},  /* outer index 2: every path contains 5 */
+      {4, 5, 3},
+      {0, 5, 7},
+      {7, 5, 0},
+      {1, 5, 6},
+      {6, 5, 1},
+      {2, 3, 5, 5, 4},
+      {3, 5, 5, 4, 2}, 
+      {4, 5, 5, 3, 2}, 
+      {2, 4, 5, 5, 3}, 
+      {2, 0, 5, 5, 7}, 
+      {0, 5, 5, 7, 2}, 
+      {7, 5, 5, 0, 2}, 
+      {2, 7, 5, 5, 0},
+      {2, 1, 5, 5, 6}, 
+      {1, 5, 5, 6, 2}, 
+      {6, 5, 5, 1, 2}, 
+      {2, 6, 5, 5, 1}, 
+      {4, 4, 5, 3, 3}, 
+      {3, 3, 5, 4, 4}, 
+      {7, 7, 5, 0, 0},
+      {0, 0, 5, 7, 7}, 
+      {6, 6, 5, 1, 1}, 
+      {1, 1, 5, 6, 6} }, 
+    { { 0, 4, 7 },  /* outer index 3: every path contains 4 */
+      { 7, 4, 0 },
+      { 1, 4, 6 },
+      { 6, 4, 1 },
+      { 2, 4, 5 },
+      { 5, 4, 2 },
+      { 3, 0, 4, 4, 7 },
+      { 0, 4, 4, 7, 3 },
+      { 7, 4, 4, 0, 3 },
+      { 3, 7, 4, 4, 0 },
+      { 3, 1, 4, 4, 6 },
+      { 1, 4, 4, 6, 3 },
+      { 6, 4, 4, 1, 3 },
+      { 3, 6, 4, 4, 1 },
+      { 3, 2, 4, 4, 5 },
+      { 2, 4, 4, 5, 3 },
+      { 5, 4, 4, 2, 3 },
+      { 3, 5, 4, 4, 2 },
+      { 7, 7, 4, 0, 0 },
+      { 0, 0, 4, 7, 7 },
+      { 6, 6, 4, 1, 1 },
+      { 1, 1, 4, 6, 6 },
+      { 5, 5, 4, 2, 2 },
+      { 2, 2, 4, 5, 5 } } 
+  };  /* NOTE(review): entries look like hop directions (0-3 forward, 7-d backward) as used by QUDA gauge-force paths -- confirm against the consumer */
+
+const int plaq_length[] = {  /* six elements (size deduced from the initializer), one per row of plaq_path[*] */
+    3, 3, 3, 3, 3, 3 };
+
+const int plaq_path[4][6][3] = {  /* [outer index][path][entry]; same values as rows 0-5 of plaq_rect_path for each outer index */
+    { { 1, 7, 6 },  /* outer index 0: every path has 7 as its middle entry */
+      { 6, 7, 1 },
+      { 2, 7, 5 },
+      { 5, 7, 2 },
+      { 3, 7, 4 },
+      { 4, 7, 3 } },
+    { { 2, 6, 5 },  /* outer index 1: middle entry 6 */
+      { 5, 6, 2 },
+      { 3, 6, 4 },
+      { 4, 6, 3 },
+      { 0, 6, 7 },
+      { 7, 6, 0 } },
+    { { 3, 5, 4},  /* outer index 2: middle entry 5 */
+      { 4, 5, 3},
+      { 0, 5, 7},
+      { 7, 5, 0},
+      { 1, 5, 6},
+      { 6, 5, 1} },
+    { { 0, 4, 7 },  /* outer index 3: middle entry 4 */
+      { 7, 4, 0 },
+      { 1, 4, 6 },
+      { 6, 4, 1 },
+      { 2, 4, 5 },
+      { 5, 4, 2 } } 
+  };
+
diff --git a/quda_interface.c b/src/lib/quda_interface.c
similarity index 100%
rename from quda_interface.c
rename to src/lib/quda_interface.c
diff --git a/quda_interface.h b/src/lib/quda_interface.h
similarity index 100%
rename from quda_interface.h
rename to src/lib/quda_interface.h
diff --git a/quda_types.h b/src/lib/quda_types.h
similarity index 100%
rename from quda_types.h
rename to src/lib/quda_types.h
diff --git a/ranlxd.c b/src/lib/ranlxd.c
similarity index 100%
rename from ranlxd.c
rename to src/lib/ranlxd.c
diff --git a/ranlxd.h b/src/lib/ranlxd.h
similarity index 100%
rename from ranlxd.h
rename to src/lib/ranlxd.h
diff --git a/ranlxs.c b/src/lib/ranlxs.c
similarity index 100%
rename from ranlxs.c
rename to src/lib/ranlxs.c
diff --git a/ranlxs.h b/src/lib/ranlxs.h
similarity index 100%
rename from ranlxs.h
rename to src/lib/ranlxs.h
diff --git a/rational/Makefile.in b/src/lib/rational/Makefile.in
similarity index 100%
rename from rational/Makefile.in
rename to src/lib/rational/Makefile.in
diff --git a/rational/elliptic.c b/src/lib/rational/elliptic.c
similarity index 100%
rename from rational/elliptic.c
rename to src/lib/rational/elliptic.c
diff --git a/rational/elliptic.h b/src/lib/rational/elliptic.h
similarity index 100%
rename from rational/elliptic.h
rename to src/lib/rational/elliptic.h
diff --git a/rational/rational.c b/src/lib/rational/rational.c
similarity index 100%
rename from rational/rational.c
rename to src/lib/rational/rational.c
diff --git a/rational/rational.h b/src/lib/rational/rational.h
similarity index 100%
rename from rational/rational.h
rename to src/lib/rational/rational.h
diff --git a/rational/zolotarev.c b/src/lib/rational/zolotarev.c
similarity index 100%
rename from rational/zolotarev.c
rename to src/lib/rational/zolotarev.c
diff --git a/rational/zolotarev.h b/src/lib/rational/zolotarev.h
similarity index 100%
rename from rational/zolotarev.h
rename to src/lib/rational/zolotarev.h
diff --git a/read_input.h b/src/lib/read_input.h
similarity index 100%
rename from read_input.h
rename to src/lib/read_input.h
diff --git a/read_input.l b/src/lib/read_input.l
similarity index 100%
rename from read_input.l
rename to src/lib/read_input.l
diff --git a/reweighting_factor.c b/src/lib/reweighting_factor.c
similarity index 100%
rename from reweighting_factor.c
rename to src/lib/reweighting_factor.c
diff --git a/reweighting_factor.h b/src/lib/reweighting_factor.h
similarity index 100%
rename from reweighting_factor.h
rename to src/lib/reweighting_factor.h
diff --git a/reweighting_factor_nd.c b/src/lib/reweighting_factor_nd.c
similarity index 100%
rename from reweighting_factor_nd.c
rename to src/lib/reweighting_factor_nd.c
diff --git a/reweighting_factor_nd.h b/src/lib/reweighting_factor_nd.h
similarity index 100%
rename from reweighting_factor_nd.h
rename to src/lib/reweighting_factor_nd.h
diff --git a/rnd_gauge_trafo.c b/src/lib/rnd_gauge_trafo.c
similarity index 100%
rename from rnd_gauge_trafo.c
rename to src/lib/rnd_gauge_trafo.c
diff --git a/rnd_gauge_trafo.h b/src/lib/rnd_gauge_trafo.h
similarity index 100%
rename from rnd_gauge_trafo.h
rename to src/lib/rnd_gauge_trafo.h
diff --git a/sighandler.c b/src/lib/sighandler.c
similarity index 100%
rename from sighandler.c
rename to src/lib/sighandler.c
diff --git a/sighandler.h b/src/lib/sighandler.h
similarity index 100%
rename from sighandler.h
rename to src/lib/sighandler.h
diff --git a/smearing/Makefile.in b/src/lib/smearing/Makefile.in
similarity index 100%
rename from smearing/Makefile.in
rename to src/lib/smearing/Makefile.in
diff --git a/smearing/ape.h b/src/lib/smearing/ape.h
similarity index 100%
rename from smearing/ape.h
rename to src/lib/smearing/ape.h
diff --git a/smearing/ape.ih b/src/lib/smearing/ape.ih
similarity index 100%
rename from smearing/ape.ih
rename to src/lib/smearing/ape.ih
diff --git a/smearing/ape_ape_smear.c b/src/lib/smearing/ape_ape_smear.c
similarity index 100%
rename from smearing/ape_ape_smear.c
rename to src/lib/smearing/ape_ape_smear.c
diff --git a/smearing/hex.h b/src/lib/smearing/hex.h
similarity index 100%
rename from smearing/hex.h
rename to src/lib/smearing/hex.h
diff --git a/smearing/hex.ih b/src/lib/smearing/hex.ih
similarity index 100%
rename from smearing/hex.ih
rename to src/lib/smearing/hex.ih
diff --git a/smearing/hex_hex_smear.c b/src/lib/smearing/hex_hex_smear.c
similarity index 100%
rename from smearing/hex_hex_smear.c
rename to src/lib/smearing/hex_hex_smear.c
diff --git a/smearing/hex_stout_exclude_none.c b/src/lib/smearing/hex_stout_exclude_none.c
similarity index 100%
rename from smearing/hex_stout_exclude_none.c
rename to src/lib/smearing/hex_stout_exclude_none.c
diff --git a/smearing/hex_stout_exclude_one.c b/src/lib/smearing/hex_stout_exclude_one.c
similarity index 100%
rename from smearing/hex_stout_exclude_one.c
rename to src/lib/smearing/hex_stout_exclude_one.c
diff --git a/smearing/hex_stout_exclude_two.c b/src/lib/smearing/hex_stout_exclude_two.c
similarity index 100%
rename from smearing/hex_stout_exclude_two.c
rename to src/lib/smearing/hex_stout_exclude_two.c
diff --git a/smearing/hyp.h b/src/lib/smearing/hyp.h
similarity index 100%
rename from smearing/hyp.h
rename to src/lib/smearing/hyp.h
diff --git a/smearing/hyp.ih b/src/lib/smearing/hyp.ih
similarity index 100%
rename from smearing/hyp.ih
rename to src/lib/smearing/hyp.ih
diff --git a/smearing/hyp_APE_project_exclude_none.c b/src/lib/smearing/hyp_APE_project_exclude_none.c
similarity index 100%
rename from smearing/hyp_APE_project_exclude_none.c
rename to src/lib/smearing/hyp_APE_project_exclude_none.c
diff --git a/smearing/hyp_APE_project_exclude_one.c b/src/lib/smearing/hyp_APE_project_exclude_one.c
similarity index 100%
rename from smearing/hyp_APE_project_exclude_one.c
rename to src/lib/smearing/hyp_APE_project_exclude_one.c
diff --git a/smearing/hyp_APE_project_exclude_two.c b/src/lib/smearing/hyp_APE_project_exclude_two.c
similarity index 100%
rename from smearing/hyp_APE_project_exclude_two.c
rename to src/lib/smearing/hyp_APE_project_exclude_two.c
diff --git a/smearing/hyp_hyp_smear.c b/src/lib/smearing/hyp_hyp_smear.c
similarity index 100%
rename from smearing/hyp_hyp_smear.c
rename to src/lib/smearing/hyp_hyp_smear.c
diff --git a/smearing/hyp_hyp_staples_exclude_none.c b/src/lib/smearing/hyp_hyp_staples_exclude_none.c
similarity index 100%
rename from smearing/hyp_hyp_staples_exclude_none.c
rename to src/lib/smearing/hyp_hyp_staples_exclude_none.c
diff --git a/smearing/hyp_hyp_staples_exclude_one.c b/src/lib/smearing/hyp_hyp_staples_exclude_one.c
similarity index 100%
rename from smearing/hyp_hyp_staples_exclude_one.c
rename to src/lib/smearing/hyp_hyp_staples_exclude_one.c
diff --git a/smearing/hyp_hyp_staples_exclude_two.c b/src/lib/smearing/hyp_hyp_staples_exclude_two.c
similarity index 100%
rename from smearing/hyp_hyp_staples_exclude_two.c
rename to src/lib/smearing/hyp_hyp_staples_exclude_two.c
diff --git a/smearing/stout.h b/src/lib/smearing/stout.h
similarity index 100%
rename from smearing/stout.h
rename to src/lib/smearing/stout.h
diff --git a/smearing/stout.ih b/src/lib/smearing/stout.ih
similarity index 100%
rename from smearing/stout.ih
rename to src/lib/smearing/stout.ih
diff --git a/smearing/stout_stout_smear.c b/src/lib/smearing/stout_stout_smear.c
similarity index 100%
rename from smearing/stout_stout_smear.c
rename to src/lib/smearing/stout_stout_smear.c
diff --git a/smearing/uils_print_config_to_screen.c b/src/lib/smearing/uils_print_config_to_screen.c
similarity index 100%
rename from smearing/uils_print_config_to_screen.c
rename to src/lib/smearing/uils_print_config_to_screen.c
diff --git a/smearing/utils.h b/src/lib/smearing/utils.h
similarity index 100%
rename from smearing/utils.h
rename to src/lib/smearing/utils.h
diff --git a/smearing/utils.ih b/src/lib/smearing/utils.ih
similarity index 100%
rename from smearing/utils.ih
rename to src/lib/smearing/utils.ih
diff --git a/smearing/utils_generic_staples.c b/src/lib/smearing/utils_generic_staples.c
similarity index 100%
rename from smearing/utils_generic_staples.c
rename to src/lib/smearing/utils_generic_staples.c
diff --git a/smearing/utils_print_config_to_screen.c b/src/lib/smearing/utils_print_config_to_screen.c
similarity index 100%
rename from smearing/utils_print_config_to_screen.c
rename to src/lib/smearing/utils_print_config_to_screen.c
diff --git a/smearing/utils_print_su3.c b/src/lib/smearing/utils_print_su3.c
similarity index 100%
rename from smearing/utils_print_su3.c
rename to src/lib/smearing/utils_print_su3.c
diff --git a/smearing/utils_project_antiherm.c b/src/lib/smearing/utils_project_antiherm.c
similarity index 100%
rename from smearing/utils_project_antiherm.c
rename to src/lib/smearing/utils_project_antiherm.c
diff --git a/smearing/utils_project_herm.c b/src/lib/smearing/utils_project_herm.c
similarity index 100%
rename from smearing/utils_project_herm.c
rename to src/lib/smearing/utils_project_herm.c
diff --git a/smearing/utils_reunitarize.c b/src/lib/smearing/utils_reunitarize.c
similarity index 100%
rename from smearing/utils_reunitarize.c
rename to src/lib/smearing/utils_reunitarize.c
diff --git a/smearing/utils_reunitarize_MILC.c b/src/lib/smearing/utils_reunitarize_MILC.c
similarity index 88%
rename from smearing/utils_reunitarize_MILC.c
rename to src/lib/smearing/utils_reunitarize_MILC.c
index 757a797df..b5efa2936 100644
--- a/smearing/utils_reunitarize_MILC.c
+++ b/src/lib/smearing/utils_reunitarize_MILC.c
@@ -1,4 +1,5 @@
 #include "utils.ih"
+#include <complex.h>
 
 /* No reunitarization code seems to be available, so I've adapted (stolen) this routine from the
  * MILC code (who stole it elsewhere, I think ;]) -- AD. */
@@ -35,12 +36,11 @@ void reunitarize(su3 *omega) {
   bj2 = omega->c02;
 
   omega->c20 = bj1 * omega->c12;
-  omega->c20 -= bj2 *omega
-                    ->c11
+  omega->c20 -= bj2 *omega->c11;
 
-                        omega->c21 = bj2 * omega->c10;
+  omega->c21 = bj2 * omega->c10;
   omega->c21 -= bj0 * omega->c12;
 
   omega->c22 = bj0 * omega->c11;
-  omega->c22 -= bj1r * omega->c10;
+  omega->c22 -= bj1 * omega->c10;
 }
diff --git a/solver/M_plus_block_psi_body.c b/src/lib/solver/M_plus_block_psi_body.c
similarity index 100%
rename from solver/M_plus_block_psi_body.c
rename to src/lib/solver/M_plus_block_psi_body.c
diff --git a/solver/Makefile.in b/src/lib/solver/Makefile.in
similarity index 100%
rename from solver/Makefile.in
rename to src/lib/solver/Makefile.in
diff --git a/solver/Msap.c b/src/lib/solver/Msap.c
similarity index 100%
rename from solver/Msap.c
rename to src/lib/solver/Msap.c
diff --git a/solver/Msap.h b/src/lib/solver/Msap.h
similarity index 100%
rename from solver/Msap.h
rename to src/lib/solver/Msap.h
diff --git a/solver/bicg_complex.c b/src/lib/solver/bicg_complex.c
similarity index 100%
rename from solver/bicg_complex.c
rename to src/lib/solver/bicg_complex.c
diff --git a/solver/bicg_complex.h b/src/lib/solver/bicg_complex.h
similarity index 100%
rename from solver/bicg_complex.h
rename to src/lib/solver/bicg_complex.h
diff --git a/solver/bicgstab2.c b/src/lib/solver/bicgstab2.c
similarity index 100%
rename from solver/bicgstab2.c
rename to src/lib/solver/bicgstab2.c
diff --git a/solver/bicgstab2.h b/src/lib/solver/bicgstab2.h
similarity index 100%
rename from solver/bicgstab2.h
rename to src/lib/solver/bicgstab2.h
diff --git a/solver/bicgstab_complex.c b/src/lib/solver/bicgstab_complex.c
similarity index 100%
rename from solver/bicgstab_complex.c
rename to src/lib/solver/bicgstab_complex.c
diff --git a/solver/bicgstab_complex.h b/src/lib/solver/bicgstab_complex.h
similarity index 100%
rename from solver/bicgstab_complex.h
rename to src/lib/solver/bicgstab_complex.h
diff --git a/solver/bicgstab_complex_bi.c b/src/lib/solver/bicgstab_complex_bi.c
similarity index 100%
rename from solver/bicgstab_complex_bi.c
rename to src/lib/solver/bicgstab_complex_bi.c
diff --git a/solver/bicgstab_complex_bi.h b/src/lib/solver/bicgstab_complex_bi.h
similarity index 100%
rename from solver/bicgstab_complex_bi.h
rename to src/lib/solver/bicgstab_complex_bi.h
diff --git a/solver/bicgstabell.c b/src/lib/solver/bicgstabell.c
similarity index 100%
rename from solver/bicgstabell.c
rename to src/lib/solver/bicgstabell.c
diff --git a/solver/bicgstabell.h b/src/lib/solver/bicgstabell.h
similarity index 100%
rename from solver/bicgstabell.h
rename to src/lib/solver/bicgstabell.h
diff --git a/solver/cg_her.c b/src/lib/solver/cg_her.c
similarity index 100%
rename from solver/cg_her.c
rename to src/lib/solver/cg_her.c
diff --git a/solver/cg_her.h b/src/lib/solver/cg_her.h
similarity index 100%
rename from solver/cg_her.h
rename to src/lib/solver/cg_her.h
diff --git a/solver/cg_her_bi.c b/src/lib/solver/cg_her_bi.c
similarity index 100%
rename from solver/cg_her_bi.c
rename to src/lib/solver/cg_her_bi.c
diff --git a/solver/cg_her_bi.h b/src/lib/solver/cg_her_bi.h
similarity index 100%
rename from solver/cg_her_bi.h
rename to src/lib/solver/cg_her_bi.h
diff --git a/solver/cg_her_nd.c b/src/lib/solver/cg_her_nd.c
similarity index 100%
rename from solver/cg_her_nd.c
rename to src/lib/solver/cg_her_nd.c
diff --git a/solver/cg_her_nd.h b/src/lib/solver/cg_her_nd.h
similarity index 100%
rename from solver/cg_her_nd.h
rename to src/lib/solver/cg_her_nd.h
diff --git a/solver/cg_mms_tm.c b/src/lib/solver/cg_mms_tm.c
similarity index 100%
rename from solver/cg_mms_tm.c
rename to src/lib/solver/cg_mms_tm.c
diff --git a/solver/cg_mms_tm.h b/src/lib/solver/cg_mms_tm.h
similarity index 100%
rename from solver/cg_mms_tm.h
rename to src/lib/solver/cg_mms_tm.h
diff --git a/solver/cg_mms_tm_nd.c b/src/lib/solver/cg_mms_tm_nd.c
similarity index 100%
rename from solver/cg_mms_tm_nd.c
rename to src/lib/solver/cg_mms_tm_nd.c
diff --git a/solver/cg_mms_tm_nd.h b/src/lib/solver/cg_mms_tm_nd.h
similarity index 100%
rename from solver/cg_mms_tm_nd.h
rename to src/lib/solver/cg_mms_tm_nd.h
diff --git a/solver/cgne4complex.c b/src/lib/solver/cgne4complex.c
similarity index 100%
rename from solver/cgne4complex.c
rename to src/lib/solver/cgne4complex.c
diff --git a/solver/cgne4complex.h b/src/lib/solver/cgne4complex.h
similarity index 100%
rename from solver/cgne4complex.h
rename to src/lib/solver/cgne4complex.h
diff --git a/solver/cgs_real.c b/src/lib/solver/cgs_real.c
similarity index 100%
rename from solver/cgs_real.c
rename to src/lib/solver/cgs_real.c
diff --git a/solver/cgs_real.h b/src/lib/solver/cgs_real.h
similarity index 100%
rename from solver/cgs_real.h
rename to src/lib/solver/cgs_real.h
diff --git a/solver/chrono_guess.c b/src/lib/solver/chrono_guess.c
similarity index 100%
rename from solver/chrono_guess.c
rename to src/lib/solver/chrono_guess.c
diff --git a/solver/chrono_guess.h b/src/lib/solver/chrono_guess.h
similarity index 100%
rename from solver/chrono_guess.h
rename to src/lib/solver/chrono_guess.h
diff --git a/solver/cr.c b/src/lib/solver/cr.c
similarity index 100%
rename from solver/cr.c
rename to src/lib/solver/cr.c
diff --git a/solver/cr.h b/src/lib/solver/cr.h
similarity index 100%
rename from solver/cr.h
rename to src/lib/solver/cr.h
diff --git a/solver/dfl_projector.c b/src/lib/solver/dfl_projector.c
similarity index 100%
rename from solver/dfl_projector.c
rename to src/lib/solver/dfl_projector.c
diff --git a/solver/dfl_projector.h b/src/lib/solver/dfl_projector.h
similarity index 100%
rename from solver/dfl_projector.h
rename to src/lib/solver/dfl_projector.h
diff --git a/solver/diagonalise_general_matrix.c b/src/lib/solver/diagonalise_general_matrix.c
similarity index 100%
rename from solver/diagonalise_general_matrix.c
rename to src/lib/solver/diagonalise_general_matrix.c
diff --git a/solver/diagonalise_general_matrix.h b/src/lib/solver/diagonalise_general_matrix.h
similarity index 100%
rename from solver/diagonalise_general_matrix.h
rename to src/lib/solver/diagonalise_general_matrix.h
diff --git a/solver/dirac_operator_eigenvectors.c b/src/lib/solver/dirac_operator_eigenvectors.c
similarity index 100%
rename from solver/dirac_operator_eigenvectors.c
rename to src/lib/solver/dirac_operator_eigenvectors.c
diff --git a/solver/dirac_operator_eigenvectors.h b/src/lib/solver/dirac_operator_eigenvectors.h
similarity index 100%
rename from solver/dirac_operator_eigenvectors.h
rename to src/lib/solver/dirac_operator_eigenvectors.h
diff --git a/solver/eigcg.c b/src/lib/solver/eigcg.c
similarity index 100%
rename from solver/eigcg.c
rename to src/lib/solver/eigcg.c
diff --git a/solver/eigcg.h b/src/lib/solver/eigcg.h
similarity index 100%
rename from solver/eigcg.h
rename to src/lib/solver/eigcg.h
diff --git a/solver/eigenvalues.c b/src/lib/solver/eigenvalues.c
similarity index 100%
rename from solver/eigenvalues.c
rename to src/lib/solver/eigenvalues.c
diff --git a/solver/eigenvalues.h b/src/lib/solver/eigenvalues.h
similarity index 100%
rename from solver/eigenvalues.h
rename to src/lib/solver/eigenvalues.h
diff --git a/solver/eigenvalues_bi.c b/src/lib/solver/eigenvalues_bi.c
similarity index 100%
rename from solver/eigenvalues_bi.c
rename to src/lib/solver/eigenvalues_bi.c
diff --git a/solver/eigenvalues_bi.h b/src/lib/solver/eigenvalues_bi.h
similarity index 100%
rename from solver/eigenvalues_bi.h
rename to src/lib/solver/eigenvalues_bi.h
diff --git a/solver/fgmres.c b/src/lib/solver/fgmres.c
similarity index 100%
rename from solver/fgmres.c
rename to src/lib/solver/fgmres.c
diff --git a/solver/fgmres.h b/src/lib/solver/fgmres.h
similarity index 100%
rename from solver/fgmres.h
rename to src/lib/solver/fgmres.h
diff --git a/solver/fgmres4complex.c b/src/lib/solver/fgmres4complex.c
similarity index 100%
rename from solver/fgmres4complex.c
rename to src/lib/solver/fgmres4complex.c
diff --git a/solver/fgmres4complex.h b/src/lib/solver/fgmres4complex.h
similarity index 100%
rename from solver/fgmres4complex.h
rename to src/lib/solver/fgmres4complex.h
diff --git a/solver/fgmres4complex_body.c b/src/lib/solver/fgmres4complex_body.c
similarity index 100%
rename from solver/fgmres4complex_body.c
rename to src/lib/solver/fgmres4complex_body.c
diff --git a/solver/gcr.c b/src/lib/solver/gcr.c
similarity index 100%
rename from solver/gcr.c
rename to src/lib/solver/gcr.c
diff --git a/solver/gcr.h b/src/lib/solver/gcr.h
similarity index 100%
rename from solver/gcr.h
rename to src/lib/solver/gcr.h
diff --git a/solver/gcr4complex.c b/src/lib/solver/gcr4complex.c
similarity index 100%
rename from solver/gcr4complex.c
rename to src/lib/solver/gcr4complex.c
diff --git a/solver/gcr4complex.h b/src/lib/solver/gcr4complex.h
similarity index 100%
rename from solver/gcr4complex.h
rename to src/lib/solver/gcr4complex.h
diff --git a/solver/gcr4complex_body.c b/src/lib/solver/gcr4complex_body.c
similarity index 100%
rename from solver/gcr4complex_body.c
rename to src/lib/solver/gcr4complex_body.c
diff --git a/solver/gcr4complex_body.h b/src/lib/solver/gcr4complex_body.h
similarity index 100%
rename from solver/gcr4complex_body.h
rename to src/lib/solver/gcr4complex_body.h
diff --git a/solver/generate_dfl_subspace.c b/src/lib/solver/generate_dfl_subspace.c
similarity index 100%
rename from solver/generate_dfl_subspace.c
rename to src/lib/solver/generate_dfl_subspace.c
diff --git a/solver/generate_dfl_subspace.h b/src/lib/solver/generate_dfl_subspace.h
similarity index 100%
rename from solver/generate_dfl_subspace.h
rename to src/lib/solver/generate_dfl_subspace.h
diff --git a/solver/gmres.c b/src/lib/solver/gmres.c
similarity index 100%
rename from solver/gmres.c
rename to src/lib/solver/gmres.c
diff --git a/solver/gmres.h b/src/lib/solver/gmres.h
similarity index 100%
rename from solver/gmres.h
rename to src/lib/solver/gmres.h
diff --git a/solver/gmres_dr.c b/src/lib/solver/gmres_dr.c
similarity index 100%
rename from solver/gmres_dr.c
rename to src/lib/solver/gmres_dr.c
diff --git a/solver/gmres_dr.h b/src/lib/solver/gmres_dr.h
similarity index 100%
rename from solver/gmres_dr.h
rename to src/lib/solver/gmres_dr.h
diff --git a/solver/gmres_precon.c b/src/lib/solver/gmres_precon.c
similarity index 100%
rename from solver/gmres_precon.c
rename to src/lib/solver/gmres_precon.c
diff --git a/solver/gmres_precon.h b/src/lib/solver/gmres_precon.h
similarity index 100%
rename from solver/gmres_precon.h
rename to src/lib/solver/gmres_precon.h
diff --git a/solver/gram-schmidt.c b/src/lib/solver/gram-schmidt.c
similarity index 100%
rename from solver/gram-schmidt.c
rename to src/lib/solver/gram-schmidt.c
diff --git a/solver/gram-schmidt.h b/src/lib/solver/gram-schmidt.h
similarity index 100%
rename from solver/gram-schmidt.h
rename to src/lib/solver/gram-schmidt.h
diff --git a/solver/incr_eigcg.c b/src/lib/solver/incr_eigcg.c
similarity index 100%
rename from solver/incr_eigcg.c
rename to src/lib/solver/incr_eigcg.c
diff --git a/solver/incr_eigcg.h b/src/lib/solver/incr_eigcg.h
similarity index 100%
rename from solver/incr_eigcg.h
rename to src/lib/solver/incr_eigcg.h
diff --git a/solver/index_jd.c b/src/lib/solver/index_jd.c
similarity index 100%
rename from solver/index_jd.c
rename to src/lib/solver/index_jd.c
diff --git a/solver/index_jd.h b/src/lib/solver/index_jd.h
similarity index 100%
rename from solver/index_jd.h
rename to src/lib/solver/index_jd.h
diff --git a/solver/init_guess.c b/src/lib/solver/init_guess.c
similarity index 100%
rename from solver/init_guess.c
rename to src/lib/solver/init_guess.c
diff --git a/solver/init_guess.h b/src/lib/solver/init_guess.h
similarity index 100%
rename from solver/init_guess.h
rename to src/lib/solver/init_guess.h
diff --git a/solver/jdher.c b/src/lib/solver/jdher.c
similarity index 100%
rename from solver/jdher.c
rename to src/lib/solver/jdher.c
diff --git a/solver/jdher.h b/src/lib/solver/jdher.h
similarity index 100%
rename from solver/jdher.h
rename to src/lib/solver/jdher.h
diff --git a/solver/jdher_bi.c b/src/lib/solver/jdher_bi.c
similarity index 100%
rename from solver/jdher_bi.c
rename to src/lib/solver/jdher_bi.c
diff --git a/solver/jdher_bi.h b/src/lib/solver/jdher_bi.h
similarity index 100%
rename from solver/jdher_bi.h
rename to src/lib/solver/jdher_bi.h
diff --git a/solver/little_mg_precon_body.c b/src/lib/solver/little_mg_precon_body.c
similarity index 100%
rename from solver/little_mg_precon_body.c
rename to src/lib/solver/little_mg_precon_body.c
diff --git a/solver/little_project_eo_body.c b/src/lib/solver/little_project_eo_body.c
similarity index 100%
rename from solver/little_project_eo_body.c
rename to src/lib/solver/little_project_eo_body.c
diff --git a/solver/lu_solve.c b/src/lib/solver/lu_solve.c
similarity index 100%
rename from solver/lu_solve.c
rename to src/lib/solver/lu_solve.c
diff --git a/solver/lu_solve.h b/src/lib/solver/lu_solve.h
similarity index 100%
rename from solver/lu_solve.h
rename to src/lib/solver/lu_solve.h
diff --git a/solver/matrix_mult_typedef.h b/src/lib/solver/matrix_mult_typedef.h
similarity index 100%
rename from solver/matrix_mult_typedef.h
rename to src/lib/solver/matrix_mult_typedef.h
diff --git a/solver/matrix_mult_typedef_bi.h b/src/lib/solver/matrix_mult_typedef_bi.h
similarity index 100%
rename from solver/matrix_mult_typedef_bi.h
rename to src/lib/solver/matrix_mult_typedef_bi.h
diff --git a/solver/matrix_mult_typedef_nd.h b/src/lib/solver/matrix_mult_typedef_nd.h
similarity index 100%
rename from solver/matrix_mult_typedef_nd.h
rename to src/lib/solver/matrix_mult_typedef_nd.h
diff --git a/solver/mcr.c b/src/lib/solver/mcr.c
similarity index 100%
rename from solver/mcr.c
rename to src/lib/solver/mcr.c
diff --git a/solver/mcr.h b/src/lib/solver/mcr.h
similarity index 100%
rename from solver/mcr.h
rename to src/lib/solver/mcr.h
diff --git a/solver/mcr4complex.c b/src/lib/solver/mcr4complex.c
similarity index 100%
rename from solver/mcr4complex.c
rename to src/lib/solver/mcr4complex.c
diff --git a/solver/mcr4complex.h b/src/lib/solver/mcr4complex.h
similarity index 100%
rename from solver/mcr4complex.h
rename to src/lib/solver/mcr4complex.h
diff --git a/solver/mixed_cg_her.c b/src/lib/solver/mixed_cg_her.c
similarity index 100%
rename from solver/mixed_cg_her.c
rename to src/lib/solver/mixed_cg_her.c
diff --git a/solver/mixed_cg_her.h b/src/lib/solver/mixed_cg_her.h
similarity index 100%
rename from solver/mixed_cg_her.h
rename to src/lib/solver/mixed_cg_her.h
diff --git a/solver/mixed_cg_mms_tm_nd.c b/src/lib/solver/mixed_cg_mms_tm_nd.c
similarity index 100%
rename from solver/mixed_cg_mms_tm_nd.c
rename to src/lib/solver/mixed_cg_mms_tm_nd.c
diff --git a/solver/mixed_cg_mms_tm_nd.h b/src/lib/solver/mixed_cg_mms_tm_nd.h
similarity index 100%
rename from solver/mixed_cg_mms_tm_nd.h
rename to src/lib/solver/mixed_cg_mms_tm_nd.h
diff --git a/solver/monomial_solve.c b/src/lib/solver/monomial_solve.c
similarity index 100%
rename from solver/monomial_solve.c
rename to src/lib/solver/monomial_solve.c
diff --git a/solver/monomial_solve.h b/src/lib/solver/monomial_solve.h
similarity index 100%
rename from solver/monomial_solve.h
rename to src/lib/solver/monomial_solve.h
diff --git a/solver/mr.c b/src/lib/solver/mr.c
similarity index 100%
rename from solver/mr.c
rename to src/lib/solver/mr.c
diff --git a/solver/mr.h b/src/lib/solver/mr.h
similarity index 100%
rename from solver/mr.h
rename to src/lib/solver/mr.h
diff --git a/solver/mr4complex.c b/src/lib/solver/mr4complex.c
similarity index 100%
rename from solver/mr4complex.c
rename to src/lib/solver/mr4complex.c
diff --git a/solver/mr4complex.h b/src/lib/solver/mr4complex.h
similarity index 100%
rename from solver/mr4complex.h
rename to src/lib/solver/mr4complex.h
diff --git a/solver/mrblk_body.c b/src/lib/solver/mrblk_body.c
similarity index 100%
rename from solver/mrblk_body.c
rename to src/lib/solver/mrblk_body.c
diff --git a/solver/ortho.c b/src/lib/solver/ortho.c
similarity index 100%
rename from solver/ortho.c
rename to src/lib/solver/ortho.c
diff --git a/solver/ortho.h b/src/lib/solver/ortho.h
similarity index 100%
rename from solver/ortho.h
rename to src/lib/solver/ortho.h
diff --git a/solver/pcg_her.c b/src/lib/solver/pcg_her.c
similarity index 100%
rename from solver/pcg_her.c
rename to src/lib/solver/pcg_her.c
diff --git a/solver/pcg_her.h b/src/lib/solver/pcg_her.h
similarity index 100%
rename from solver/pcg_her.h
rename to src/lib/solver/pcg_her.h
diff --git a/solver/poly_precon.c b/src/lib/solver/poly_precon.c
similarity index 100%
rename from solver/poly_precon.c
rename to src/lib/solver/poly_precon.c
diff --git a/solver/poly_precon.h b/src/lib/solver/poly_precon.h
similarity index 100%
rename from solver/poly_precon.h
rename to src/lib/solver/poly_precon.h
diff --git a/solver/quicksort.c b/src/lib/solver/quicksort.c
similarity index 100%
rename from solver/quicksort.c
rename to src/lib/solver/quicksort.c
diff --git a/solver/quicksort.h b/src/lib/solver/quicksort.h
similarity index 100%
rename from solver/quicksort.h
rename to src/lib/solver/quicksort.h
diff --git a/solver/restart_X.c b/src/lib/solver/restart_X.c
similarity index 100%
rename from solver/restart_X.c
rename to src/lib/solver/restart_X.c
diff --git a/solver/restart_X.h b/src/lib/solver/restart_X.h
similarity index 100%
rename from solver/restart_X.h
rename to src/lib/solver/restart_X.h
diff --git a/solver/rg_mixed_cg_her.c b/src/lib/solver/rg_mixed_cg_her.c
similarity index 100%
rename from solver/rg_mixed_cg_her.c
rename to src/lib/solver/rg_mixed_cg_her.c
diff --git a/solver/rg_mixed_cg_her.h b/src/lib/solver/rg_mixed_cg_her.h
similarity index 100%
rename from solver/rg_mixed_cg_her.h
rename to src/lib/solver/rg_mixed_cg_her.h
diff --git a/solver/rg_mixed_cg_her_nd.c b/src/lib/solver/rg_mixed_cg_her_nd.c
similarity index 100%
rename from solver/rg_mixed_cg_her_nd.c
rename to src/lib/solver/rg_mixed_cg_her_nd.c
diff --git a/solver/rg_mixed_cg_her_nd.h b/src/lib/solver/rg_mixed_cg_her_nd.h
similarity index 100%
rename from solver/rg_mixed_cg_her_nd.h
rename to src/lib/solver/rg_mixed_cg_her_nd.h
diff --git a/solver/rg_mixed_cg_typedef.h b/src/lib/solver/rg_mixed_cg_typedef.h
similarity index 100%
rename from solver/rg_mixed_cg_typedef.h
rename to src/lib/solver/rg_mixed_cg_typedef.h
diff --git a/solver/solver.h b/src/lib/solver/solver.h
similarity index 100%
rename from solver/solver.h
rename to src/lib/solver/solver.h
diff --git a/solver/solver_field.c b/src/lib/solver/solver_field.c
similarity index 100%
rename from solver/solver_field.c
rename to src/lib/solver/solver_field.c
diff --git a/solver/solver_field.h b/src/lib/solver/solver_field.h
similarity index 100%
rename from solver/solver_field.h
rename to src/lib/solver/solver_field.h
diff --git a/solver/solver_params.h b/src/lib/solver/solver_params.h
similarity index 100%
rename from solver/solver_params.h
rename to src/lib/solver/solver_params.h
diff --git a/solver/solver_types.c b/src/lib/solver/solver_types.c
similarity index 100%
rename from solver/solver_types.c
rename to src/lib/solver/solver_types.c
diff --git a/solver/solver_types.h b/src/lib/solver/solver_types.h
similarity index 100%
rename from solver/solver_types.h
rename to src/lib/solver/solver_types.h
diff --git a/solver/sub_low_ev.c b/src/lib/solver/sub_low_ev.c
similarity index 100%
rename from solver/sub_low_ev.c
rename to src/lib/solver/sub_low_ev.c
diff --git a/solver/sub_low_ev.h b/src/lib/solver/sub_low_ev.h
similarity index 100%
rename from solver/sub_low_ev.h
rename to src/lib/solver/sub_low_ev.h
diff --git a/solver/sumr.c b/src/lib/solver/sumr.c
similarity index 100%
rename from solver/sumr.c
rename to src/lib/solver/sumr.c
diff --git a/solver/sumr.h b/src/lib/solver/sumr.h
similarity index 100%
rename from solver/sumr.h
rename to src/lib/solver/sumr.h
diff --git a/source_generation.c b/src/lib/source_generation.c
similarity index 100%
rename from source_generation.c
rename to src/lib/source_generation.c
diff --git a/source_generation.h b/src/lib/source_generation.h
similarity index 100%
rename from source_generation.h
rename to src/lib/source_generation.h
diff --git a/spinor_fft.c b/src/lib/spinor_fft.c
similarity index 100%
rename from spinor_fft.c
rename to src/lib/spinor_fft.c
diff --git a/spinor_fft.h b/src/lib/spinor_fft.h
similarity index 100%
rename from spinor_fft.h
rename to src/lib/spinor_fft.h
diff --git a/start.c b/src/lib/start.c
similarity index 100%
rename from start.c
rename to src/lib/start.c
diff --git a/start.h b/src/lib/start.h
similarity index 100%
rename from start.h
rename to src/lib/start.h
diff --git a/struct_accessors.h b/src/lib/struct_accessors.h
similarity index 100%
rename from struct_accessors.h
rename to src/lib/struct_accessors.h
diff --git a/su3.h b/src/lib/su3.h
similarity index 100%
rename from su3.h
rename to src/lib/su3.h
diff --git a/su3adj.h b/src/lib/su3adj.h
similarity index 100%
rename from su3adj.h
rename to src/lib/su3adj.h
diff --git a/su3spinor.h b/src/lib/su3spinor.h
similarity index 100%
rename from su3spinor.h
rename to src/lib/su3spinor.h
diff --git a/tensors.h b/src/lib/tensors.h
similarity index 100%
rename from tensors.h
rename to src/lib/tensors.h
diff --git a/test/Makefile b/src/lib/test/Makefile
similarity index 100%
rename from test/Makefile
rename to src/lib/test/Makefile
diff --git a/test/check_geometry.c b/src/lib/test/check_geometry.c
similarity index 100%
rename from test/check_geometry.c
rename to src/lib/test/check_geometry.c
diff --git a/test/check_geometry.h b/src/lib/test/check_geometry.h
similarity index 100%
rename from test/check_geometry.h
rename to src/lib/test/check_geometry.h
diff --git a/test/check_nan.c b/src/lib/test/check_nan.c
similarity index 100%
rename from test/check_nan.c
rename to src/lib/test/check_nan.c
diff --git a/test/check_nan.h b/src/lib/test/check_nan.h
similarity index 100%
rename from test/check_nan.h
rename to src/lib/test/check_nan.h
diff --git a/test/check_overlap.c b/src/lib/test/check_overlap.c
similarity index 100%
rename from test/check_overlap.c
rename to src/lib/test/check_overlap.c
diff --git a/test/check_xchange.c b/src/lib/test/check_xchange.c
similarity index 100%
rename from test/check_xchange.c
rename to src/lib/test/check_xchange.c
diff --git a/test/hopping_test.README b/src/lib/test/hopping_test.README
similarity index 100%
rename from test/hopping_test.README
rename to src/lib/test/hopping_test.README
diff --git a/test/hopping_test.input.compare b/src/lib/test/hopping_test.input.compare
similarity index 100%
rename from test/hopping_test.input.compare
rename to src/lib/test/hopping_test.input.compare
diff --git a/test/hopping_test.input.new b/src/lib/test/hopping_test.input.new
similarity index 100%
rename from test/hopping_test.input.new
rename to src/lib/test/hopping_test.input.new
diff --git a/test/hopping_test.input.start b/src/lib/test/hopping_test.input.start
similarity index 100%
rename from test/hopping_test.input.start
rename to src/lib/test/hopping_test.input.start
diff --git a/test/hopping_test_generate_script b/src/lib/test/hopping_test_generate_script
similarity index 100%
rename from test/hopping_test_generate_script
rename to src/lib/test/hopping_test_generate_script
diff --git a/test/hopping_test_qscript b/src/lib/test/hopping_test_qscript
similarity index 100%
rename from test/hopping_test_qscript
rename to src/lib/test/hopping_test_qscript
diff --git a/test/measure_rectangles.debug.c b/src/lib/test/measure_rectangles.debug.c
similarity index 100%
rename from test/measure_rectangles.debug.c
rename to src/lib/test/measure_rectangles.debug.c
diff --git a/test/overlaptests.c b/src/lib/test/overlaptests.c
similarity index 100%
rename from test/overlaptests.c
rename to src/lib/test/overlaptests.c
diff --git a/test/overlaptests.h b/src/lib/test/overlaptests.h
similarity index 100%
rename from test/overlaptests.h
rename to src/lib/test/overlaptests.h
diff --git a/test/qdran64.h b/src/lib/test/qdran64.h
similarity index 100%
rename from test/qdran64.h
rename to src/lib/test/qdran64.h
diff --git a/tm_debug_printf.c b/src/lib/tm_debug_printf.c
similarity index 100%
rename from tm_debug_printf.c
rename to src/lib/tm_debug_printf.c
diff --git a/tm_debug_printf.h b/src/lib/tm_debug_printf.h
similarity index 100%
rename from tm_debug_printf.h
rename to src/lib/tm_debug_printf.h
diff --git a/update_backward_gauge.c b/src/lib/update_backward_gauge.c
similarity index 100%
rename from update_backward_gauge.c
rename to src/lib/update_backward_gauge.c
diff --git a/update_backward_gauge.h b/src/lib/update_backward_gauge.h
similarity index 100%
rename from update_backward_gauge.h
rename to src/lib/update_backward_gauge.h
diff --git a/update_gauge.c b/src/lib/update_gauge.c
similarity index 100%
rename from update_gauge.c
rename to src/lib/update_gauge.c
diff --git a/update_gauge.h b/src/lib/update_gauge.h
similarity index 100%
rename from update_gauge.h
rename to src/lib/update_gauge.h
diff --git a/update_momenta.c b/src/lib/update_momenta.c
similarity index 100%
rename from update_momenta.c
rename to src/lib/update_momenta.c
diff --git a/update_momenta.h b/src/lib/update_momenta.h
similarity index 100%
rename from update_momenta.h
rename to src/lib/update_momenta.h
diff --git a/update_momenta_fg.c b/src/lib/update_momenta_fg.c
similarity index 100%
rename from update_momenta_fg.c
rename to src/lib/update_momenta_fg.c
diff --git a/update_momenta_fg.h b/src/lib/update_momenta_fg.h
similarity index 100%
rename from update_momenta_fg.h
rename to src/lib/update_momenta_fg.h
diff --git a/update_tm.c b/src/lib/update_tm.c
similarity index 100%
rename from update_tm.c
rename to src/lib/update_tm.c
diff --git a/update_tm.h b/src/lib/update_tm.h
similarity index 100%
rename from update_tm.h
rename to src/lib/update_tm.h
diff --git a/util/io.c b/src/lib/util/io.c
similarity index 100%
rename from util/io.c
rename to src/lib/util/io.c
diff --git a/util/io.h b/src/lib/util/io.h
similarity index 100%
rename from util/io.h
rename to src/lib/util/io.h
diff --git a/util/laguer/Makefile b/src/lib/util/laguer/Makefile
similarity index 100%
rename from util/laguer/Makefile
rename to src/lib/util/laguer/Makefile
diff --git a/util/laguer/chebyRoot.C b/src/lib/util/laguer/chebyRoot.C
similarity index 100%
rename from util/laguer/chebyRoot.C
rename to src/lib/util/laguer/chebyRoot.C
diff --git a/util/laguer/chebyRoot.H b/src/lib/util/laguer/chebyRoot.H
similarity index 100%
rename from util/laguer/chebyRoot.H
rename to src/lib/util/laguer/chebyRoot.H
diff --git a/util/laguer/laguer.c b/src/lib/util/laguer/laguer.c
similarity index 100%
rename from util/laguer/laguer.c
rename to src/lib/util/laguer/laguer.c
diff --git a/util/laguer/quadroptRoot.C b/src/lib/util/laguer/quadroptRoot.C
similarity index 100%
rename from util/laguer/quadroptRoot.C
rename to src/lib/util/laguer/quadroptRoot.C
diff --git a/util/oox/Makefile b/src/lib/util/oox/Makefile
similarity index 100%
rename from util/oox/Makefile
rename to src/lib/util/oox/Makefile
diff --git a/util/oox/oox.c b/src/lib/util/oox/oox.c
similarity index 100%
rename from util/oox/oox.c
rename to src/lib/util/oox/oox.c
diff --git a/util/oox/oox_gawrapper.cxx b/src/lib/util/oox/oox_gawrapper.cxx
similarity index 100%
rename from util/oox/oox_gawrapper.cxx
rename to src/lib/util/oox/oox_gawrapper.cxx
diff --git a/util/oox/oox_gawrapper.h b/src/lib/util/oox/oox_gawrapper.h
similarity index 100%
rename from util/oox/oox_gawrapper.h
rename to src/lib/util/oox/oox_gawrapper.h
diff --git a/util/swapendian.c b/src/lib/util/swapendian.c
similarity index 100%
rename from util/swapendian.c
rename to src/lib/util/swapendian.c
diff --git a/util/tmlqcd-indent b/src/lib/util/tmlqcd-indent
similarity index 100%
rename from util/tmlqcd-indent
rename to src/lib/util/tmlqcd-indent
diff --git a/wrapper/Makefile.in b/src/lib/wrapper/Makefile.in
similarity index 100%
rename from wrapper/Makefile.in
rename to src/lib/wrapper/Makefile.in
diff --git a/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
similarity index 100%
rename from wrapper/lib_wrapper.c
rename to src/lib/wrapper/lib_wrapper.c
diff --git a/xchange/Makefile.in b/src/lib/xchange/Makefile.in
similarity index 100%
rename from xchange/Makefile.in
rename to src/lib/xchange/Makefile.in
diff --git a/xchange/little_field_gather.c b/src/lib/xchange/little_field_gather.c
similarity index 100%
rename from xchange/little_field_gather.c
rename to src/lib/xchange/little_field_gather.c
diff --git a/xchange/little_field_gather.h b/src/lib/xchange/little_field_gather.h
similarity index 100%
rename from xchange/little_field_gather.h
rename to src/lib/xchange/little_field_gather.h
diff --git a/xchange/little_field_gather_body.c b/src/lib/xchange/little_field_gather_body.c
similarity index 100%
rename from xchange/little_field_gather_body.c
rename to src/lib/xchange/little_field_gather_body.c
diff --git a/xchange/xchange.h b/src/lib/xchange/xchange.h
similarity index 100%
rename from xchange/xchange.h
rename to src/lib/xchange/xchange.h
diff --git a/xchange/xchange_2fields.c b/src/lib/xchange/xchange_2fields.c
similarity index 100%
rename from xchange/xchange_2fields.c
rename to src/lib/xchange/xchange_2fields.c
diff --git a/xchange/xchange_2fields.h b/src/lib/xchange/xchange_2fields.h
similarity index 100%
rename from xchange/xchange_2fields.h
rename to src/lib/xchange/xchange_2fields.h
diff --git a/xchange/xchange_deri.c b/src/lib/xchange/xchange_deri.c
similarity index 100%
rename from xchange/xchange_deri.c
rename to src/lib/xchange/xchange_deri.c
diff --git a/xchange/xchange_deri.h b/src/lib/xchange/xchange_deri.h
similarity index 100%
rename from xchange/xchange_deri.h
rename to src/lib/xchange/xchange_deri.h
diff --git a/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
similarity index 100%
rename from xchange/xchange_field.c
rename to src/lib/xchange/xchange_field.c
diff --git a/xchange/xchange_field.h b/src/lib/xchange/xchange_field.h
similarity index 100%
rename from xchange/xchange_field.h
rename to src/lib/xchange/xchange_field.h
diff --git a/xchange/xchange_gauge.c b/src/lib/xchange/xchange_gauge.c
similarity index 100%
rename from xchange/xchange_gauge.c
rename to src/lib/xchange/xchange_gauge.c
diff --git a/xchange/xchange_gauge.h b/src/lib/xchange/xchange_gauge.h
similarity index 100%
rename from xchange/xchange_gauge.h
rename to src/lib/xchange/xchange_gauge.h
diff --git a/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
similarity index 100%
rename from xchange/xchange_halffield.c
rename to src/lib/xchange/xchange_halffield.c
diff --git a/xchange/xchange_halffield.h b/src/lib/xchange/xchange_halffield.h
similarity index 100%
rename from xchange/xchange_halffield.h
rename to src/lib/xchange/xchange_halffield.h
diff --git a/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
similarity index 100%
rename from xchange/xchange_lexicfield.c
rename to src/lib/xchange/xchange_lexicfield.c
diff --git a/xchange/xchange_lexicfield.h b/src/lib/xchange/xchange_lexicfield.h
similarity index 100%
rename from xchange/xchange_lexicfield.h
rename to src/lib/xchange/xchange_lexicfield.h

From e1d95e310b297bda1f50b50a5264ab5c22a0b813 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 9 Feb 2026 09:11:11 +0100
Subject: [PATCH 02/19] Use TM_BLA for #ifdef flags

- Moved git hash string to a c file
- Removed the CRAY keyword
- Moved test files into a separate directory
- Namespace all #ifdef variables as TM_XXX
- Moved profile directory to the root
- updated the url info
---
 CMakeLists.txt                                |  160 +-
 Makefile.global                               |   64 -
 Makefile.in                                   |  167 --
 Makefile.tests                                |   64 -
 cmake/FindDDAlphaAMG.cmake                    |   29 +
 cmake/{git_hash.h.in => git_hash.c.in}        |    2 +-
 cmake/tmlqcd_config_internal.h.in             |   58 +-
 cmake_includes.txt                            |  425 ----
 config.guess                                  | 1701 -------------
 config.sub                                    | 1855 --------------
 configure.in                                  |  737 ------
 .../lib/profiling => profiling}/hmc/Readme.md |    0
 .../hmc/example_profile.pdf                   |  Bin
 .../profiling => profiling}/hmc/profile.Rmd   |    0
 {src/lib/profiling => profiling}/hmc/timing.R |    0
 .../hmc_mk2/.gitignore                        |    0
 .../profiling => profiling}/hmc_mk2/README.md |    0
 .../hmc_mk2/logs/example_log.out              |    4 +-
 .../hmc_mk2/make_profile.R                    |    0
 .../hmc_mk2/profile.Rmd                       |    0
 qphix_base_classes.hpp                        |  771 ------
 qphix_interface.cpp                           | 2192 -----------------
 qphix_interface.hpp                           |   51 -
 qphix_interface_utils.hpp                     |   33 -
 src/bin/LapH_ev.c                             |   20 +-
 src/bin/benchmark.c                           |   46 +-
 src/bin/deriv_mg_tune.c                       |   12 +-
 src/bin/hmc_tm.c                              |   16 +-
 src/bin/invert.c                              |   14 +-
 src/bin/offline_measurement.c                 |   10 +-
 src/bin/{ => tests}/check_locallity.c         |   12 +-
 src/bin/{ => tests}/hopping_test.c            |   40 +-
 src/bin/{ => tests}/qphix_test_Dslash.c       |   12 +-
 src/bin/{ => tests}/scalar_prod_r_test.c      |    0
 src/bin/{ => tests}/test_eigenvalues.c        |   12 +-
 src/bin/{ => tests}/test_lemon.c              |    4 +-
 src/lib/CMakeLists.txt                        |   20 +-
 src/lib/DDalphaAMG_interface.c                |   60 +-
 src/lib/DDalphaAMG_interface.h                |    2 +-
 .../utils_generic_exchange.blocking.inc       |   12 +-
 src/lib/buffers/utils_generic_exchange.c      |   12 +-
 .../utils_generic_exchange.nonblocking.inc    |   16 +-
 src/lib/deriv_Sb.c                            |   22 +-
 src/lib/deriv_Sb_D_psi.c                      |    4 +-
 .../lib/fixed_volume.h.in                     |    0
 src/lib/geometry_eo.c                         |  148 +-
 src/lib/get_rectangle_staples.c               |    4 +-
 src/lib/get_staples.c                         |   12 +-
 src/lib/gettime.c                             |    4 +-
 src/lib/git_hash.h                            |    6 +
 src/lib/global.h                              |    6 +-
 src/lib/init/init_dirac_halfspinor.c          |   48 +-
 src/lib/init/init_gauge_field.c               |    8 +-
 src/lib/init/init_geometry_indices.c          |    4 +-
 src/lib/init/init_parallel.h                  |    4 +-
 src/lib/init/init_spinor_field.c              |   16 +-
 src/lib/invert_clover_eo.c                    |    6 +-
 src/lib/invert_doublet_eo.c                   |    6 +-
 src/lib/invert_eo.c                           |    4 +-
 src/lib/io/gauge_read.c                       |    4 +-
 src/lib/io/gauge_read_binary.c                |    6 +-
 src/lib/io/gauge_write_binary.c               |    6 +-
 src/lib/io/selector.h                         |    8 +-
 src/lib/io/spinor_read_binary.c               |   12 +-
 src/lib/io/spinor_write_binary.c              |   12 +-
 src/lib/io/spinor_write_propagator_type.c     |    8 +-
 src/lib/io/spinor_write_source_format.c       |    8 +-
 src/lib/io/utils_construct_reader.c           |   14 +-
 src/lib/io/utils_construct_writer.c           |    8 +-
 src/lib/io/utils_destruct_reader.c            |    6 +-
 src/lib/io/utils_destruct_writer.c            |    6 +-
 src/lib/io/utils_kill_with_error.c            |    4 +-
 src/lib/io/utils_write_first_message.c        |   24 +-
 src/lib/io/utils_write_header.c               |    8 +-
 src/lib/io/utils_write_message.c              |    8 +-
 src/lib/linalg/blas.h                         |    4 +-
 src/lib/linalg/lapack.h                       |    2 +-
 src/lib/little_D.c                            |    8 +-
 src/lib/meas/polyakov_loop.c                  |   10 +-
 src/lib/measure_gauge_action.c                |    2 -
 src/lib/mpi_init.c                            |   74 +-
 src/lib/mpi_init.h                            |    4 +-
 src/lib/operator.c                            |    2 +-
 src/lib/operator/D_psi_body.c                 |    2 +-
 src/lib/operator/Hopping_Matrix.c             |   14 +-
 src/lib/operator/Hopping_Matrix_32.c          |   10 +-
 src/lib/operator/Hopping_Matrix_nocom.c       |    4 +-
 src/lib/operator/halfspinor_body.c            |    4 +-
 src/lib/operator/hopping_bg_dbl.c             |   20 +-
 src/lib/operator/hopping_body_dbl.c           |   20 +-
 src/lib/operator/hopping_sgl.c                |   18 +-
 src/lib/operator/tm_sub_Hopping_Matrix.c      |    8 +-
 src/lib/operator/tm_times_Hopping_Matrix.c    |   10 +-
 src/lib/overrelaxation.c                      |    2 +-
 src/lib/parallel_io.h                         |    4 +-
 src/lib/read_input.l                          |   44 +-
 src/lib/solver/cg_her.c                       |    2 +-
 src/lib/solver/cg_her_nd.c                    |    2 +-
 src/lib/solver/cr.c                           |    2 +-
 src/lib/solver/diagonalise_general_matrix.c   |    2 +-
 src/lib/solver/dirac_operator_eigenvectors.c  |   20 +-
 src/lib/solver/dirac_operator_eigenvectors.h  |    6 +-
 src/lib/solver/eigenvalues.c                  |    2 +-
 src/lib/solver/fgmres.c                       |    2 +-
 src/lib/solver/fgmres4complex_body.c          |    2 +-
 src/lib/solver/gmres_dr.c                     |    2 +-
 src/lib/solver/gram-schmidt.c                 |    6 +-
 src/lib/solver/mcr.c                          |    2 +-
 src/lib/solver/monomial_solve.c               |   10 +-
 src/lib/solver/solver_field.c                 |    8 +-
 src/lib/spinor_fft.c                          |   10 +-
 src/lib/test/Makefile                         |   88 -
 src/lib/test/check_geometry.c                 |   30 +-
 src/lib/test/check_overlap.c                  |   18 +-
 src/lib/test/check_xchange.c                  |   68 +-
 src/lib/test/measure_rectangles.debug.c       |    4 +-
 src/lib/update_backward_gauge.c               |    2 +-
 src/lib/update_gauge.c                        |    8 +-
 src/lib/update_momenta_fg.c                   |    8 +-
 src/lib/update_tm.c                           |    8 +-
 src/lib/util/io.c                             |    2 +-
 src/lib/util/laguer/Makefile                  |    9 -
 src/lib/util/oox/Makefile                     |   46 -
 src/lib/wrapper/lib_wrapper.c                 |    6 +-
 src/lib/xchange/xchange_2fields.c             |   16 +-
 src/lib/xchange/xchange_2fields.h             |    2 +-
 src/lib/xchange/xchange_deri.c                |   24 +-
 src/lib/xchange/xchange_field.c               |   70 +-
 src/lib/xchange/xchange_gauge.c               |   36 +-
 src/lib/xchange/xchange_halffield.c           |   68 +-
 src/lib/xchange/xchange_lexicfield.c          |   80 +-
 131 files changed, 903 insertions(+), 9111 deletions(-)
 delete mode 100644 Makefile.global
 delete mode 100644 Makefile.in
 delete mode 100644 Makefile.tests
 create mode 100644 cmake/FindDDAlphaAMG.cmake
 rename cmake/{git_hash.h.in => git_hash.c.in} (62%)
 delete mode 100644 cmake_includes.txt
 delete mode 100644 config.guess
 delete mode 100644 config.sub
 delete mode 100644 configure.in
 rename {src/lib/profiling => profiling}/hmc/Readme.md (100%)
 rename {src/lib/profiling => profiling}/hmc/example_profile.pdf (100%)
 rename {src/lib/profiling => profiling}/hmc/profile.Rmd (100%)
 rename {src/lib/profiling => profiling}/hmc/timing.R (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/.gitignore (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/README.md (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/logs/example_log.out (99%)
 rename {src/lib/profiling => profiling}/hmc_mk2/make_profile.R (100%)
 rename {src/lib/profiling => profiling}/hmc_mk2/profile.Rmd (100%)
 delete mode 100644 qphix_base_classes.hpp
 delete mode 100644 qphix_interface.cpp
 delete mode 100644 qphix_interface.hpp
 delete mode 100644 qphix_interface_utils.hpp
 rename src/bin/{ => tests}/check_locallity.c (98%)
 rename src/bin/{ => tests}/hopping_test.c (94%)
 rename src/bin/{ => tests}/qphix_test_Dslash.c (99%)
 rename src/bin/{ => tests}/scalar_prod_r_test.c (100%)
 rename src/bin/{ => tests}/test_eigenvalues.c (98%)
 rename src/bin/{ => tests}/test_lemon.c (99%)
 rename fixed_volume.h.in => src/lib/fixed_volume.h.in (100%)
 create mode 100644 src/lib/git_hash.h
 delete mode 100644 src/lib/test/Makefile
 delete mode 100644 src/lib/util/laguer/Makefile
 delete mode 100644 src/lib/util/oox/Makefile

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9dc9f71f2..39adba1c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.24)
 project(
   tmlqcd
   DESCRIPTION "tmlQCD"
-  HOMEPAGE_URL "http://www.itkp.uni-bonn.de/~urbach/software.html"
+  HOMEPAGE_URL "https://github.com/etmc/tmLQCD"
   VERSION "6.0.0"
   LANGUAGES C CXX)
 
@@ -80,26 +80,22 @@ option(TM_USE_FFTW "Enable fftw support" OFF)
 option(TM_USE_MPI "Enable MPI support" OFF)
 option(TM_USE_CUDA "Enable QUDA support" OFF)
 option(TM_USE_HIP "Enable HIP support" OFF)
-option(TM_USE_DDALPHAAMG "Enable DDalphaAMG support" OFF)
-option(TM_USE_OPENMP "Enable openMP" ON)
-option(TM_FIXED_VOLUME "fix volume at compile time" OFF)
-set(
-  TM_ENABLE_ALIGNMENT
-  "auto"
-  CACHE STRING   "Automatically or expliclty align arrays to byte number. auto, none, 16, 32, 64")
-
-set_property(
-  CACHE TM_ENABLE_ALIGNMENT
-  PROPERTY STRINGS
-  "auto"
-  "none"
-  "16"
-  "32"
-  "64")
+option(TM_USE_DDalphaAMG "Enable DDalphaAMG support" OFF)
+option(TM_USE_OMP "Enable openMP" ON)
+option(TM_FIXEDVOLUME "fix volume at compile time" OFF)
+set(TM_ENABLE_ALIGNMENT
+    "auto"
+    CACHE
+      STRING
+      "Automatically or expliclty align arrays to byte number. auto, none, 16, 32, 64"
+)
+
+set_property(CACHE TM_ENABLE_ALIGNMENT PROPERTY STRINGS "auto" "none" "16" "32"
+                                                "64")
 
 option(TM_BGL_DRAM "use BGL dram window (BGL only!)" ON)
 option(TM_USE_OPTIMIZATION "enable optimisation" ON)
-option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
+option(TM_USE_GAUGECOPY "Enable use of a copy of the gauge field" ON)
 option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
 option(TM_USE_TSPLITPAR "Enable timeslice-splitted communications" ON)
 option(TM_USE_QPHIX "enable QPhiX" OFF)
@@ -110,8 +106,8 @@ option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 
 # MPI dependent options
 cmake_dependent_option(
-  TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]"
-  OFF "TM_USE_MPI" OFF)
+  TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
+  "TM_USE_MPI" OFF)
 cmake_dependent_option(
   TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gaug" ON
   "TM_USE_MPI" OFF)
@@ -126,15 +122,15 @@ cmake_dependent_option(TM_USE_CUDA_HIP "Enable CUDA support in HIP" OFF
                        "TM_USE_HIP" OFF)
 
 # clime and lemon depend on MPI
-cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF
-                       "TM_USE_MPI" ON)
+cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF "TM_USE_MPI"
+                       ON)
 
 # GPU dependent options
 cmake_dependent_option(TM_USE_QUDA_EXPERIMENTAL "Enable QUDA support" ON
                        "TM_USE_QUDA" OFF)
 cmake_dependent_option(
-  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA"
-  ON "TM_USE_QUDA" OFF)
+  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA" ON
+  "TM_USE_QUDA" OFF)
 
 cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
                        "TM_USE_CUDA" OFF)
@@ -143,7 +139,7 @@ cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
 find_package(BLAS REQUIRED)
 #
 find_package(LAPACK REQUIRED)
-set(HAVE_LAPACK ON)
+set(TM_LAPACK ON)
 find_package(FLEX REQUIRED)
 # do we need bison ?
 find_package(BISON REQUIRED)
@@ -154,77 +150,51 @@ set(PACKAGE_TARNAME "tmlqcd")
 set(PACKAGE_BUGREPORT "curbach@gmx.de")
 set(PACKAGE_STRING "${PROJECT_DESCRIPTION} ${PROJECT_VERSION}")
 
-unset(TM_USE_MPI)
-unset(TM_USE_OMP)
-unset(HAVE_LIBLEMON)
-unset(HAVE_LIBLIME)
-unset(FIXEDVOLUME)
-unset(_PERSISTENT)
-unset(_NON_BLOCKING)
-unset(HAVE_LIBQUDA)
-unset(TM_USE_QUDA)
-unset(TM_QUDA_EXPERIMENTAL)
-unset(TM_QUDA_FERMIONIC_FORCES)
-unset(DDalphaAMG)
-unset(TM_USE_QPHIX)
-unset(QPHIX_SOALEN)
-unset(_NEW_GEOMETRY)
-unset(_NON_BLOCKING)
-unset(_USE_SHMEM)
-unset(_USE_HALFSPINOR)
 set(ALIGN " ")
 set(ALIGN_BASE "0")
 set(ALIGN_BASE32 "0")
 set(ALIGN32 " ")
 
 message("${TM_ENABLE_ALIGNMENT}")
-if (${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
+if(${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
   set(ALIGN_BASE "0x00")
   set(ALIGN " ")
   set(ALIGN_BASE32 "0x00")
   set(ALIGN32 " ")
-elseif (TM_ENABLE_ALIGNMENT EQUAL 16)
+elseif(TM_ENABLE_ALIGNMENT EQUAL 16)
   set(ALIGN_BASE "0x0F")
   set(ALIGN "__attribute__ ((aligned (16)))")
   set(ALIGN_BASE32 "0x0F")
   set(ALIGN32 "__attribute__ ((aligned (16)))")
-elseif (TM_ENABLE_ALIGNMENT EQUAL 32)
+elseif(TM_ENABLE_ALIGNMENT EQUAL 32)
   set(ALIGN_BASE "0x2F")
   set(ALIGN "__attribute__ ((aligned (32)))")
   set(ALIGN_BASE32 "0x2F")
   set(ALIGN32 "__attribute__ ((aligned (32)))")
-elseif (TM_ENABLE_ALIGNMENT EQUAL 64)
+elseif(TM_ENABLE_ALIGNMENT EQUAL 64)
   set(ALIGN_BASE "0x3F")
   set(ALIGN "__attribute__ ((aligned (64)))")
   set(ALIGN_BASE32 "0x3F")
   set(ALIGN32 "__attribute__ ((aligned (64)))")
 else()
-  message(FATAL_ERROR "Unusable value for array alignment. Allowed values are: auto, none, 16, 32, 64")
-endif()
-
-if(TM_USE_HALFSPINOR)
-  set(_USE_HALFSPINOR ON)
-endif()
-
-if(TM_FIXED_VOLUME)
-  set(FIXEDVOLUME ON)
-endif()
-
-if(TM_PERSISTENT_MPI)
-  set(_PERSISTENT ON)
+  message(
+    FATAL_ERROR
+      "Unusable value for array alignment. Allowed values are: auto, none, 16, 32, 64"
+  )
 endif()
 
 if(TM_USE_MPI)
   find_package(MPI REQUIRED)
-  set(TM_USE_MPI ON)
   if(TM_NONBLOCKING_MPI)
-    set(_NON_BLOCKING ON)
+    set(TM_NONBLOCKING ON)
+  endif()
+  if(TM_PERSISTENT_MPI)
+    set(TM_PERSISTENT ON)
   endif()
 endif()
 
-if(TM_USE_OPENMP)
+if(TM_USE_OMP)
   find_package(OpenMP REQUIRED COMPONENTS C CXX)
-  set(TM_USE_OMP ON)
 endif()
 
 if(TM_USE_HDF5)
@@ -233,24 +203,23 @@ endif()
 
 if(TM_USE_LEMON)
   find_package(Clemon REQUIRED)
-  set(HAVE_LIBLEMON ON)
 endif()
 
 find_package(CLime REQUIRED)
-set(HAVE_LIBLIME ON)
+set(TM_USE_LIME ON)
 
 if(TM_USE_QUDA)
   find_package(QUDA REQUIRED config)
-  set(HAVE_LIBQUDA ON)
   if(TM_USE_QUDA_EXPERIMENTAL)
     set(TM_QUDA_EXPERIMENTAL ON)
   endif()
   if(TM_QUDA_FERMIONIC_FORCES)
     set(TM_QUDA_FERMIONIC_FORCES ON)
   endif()
-  if(TM_USE_CUDA OR TM_USE_HIP)
-    set(TM_USE_QUDA ON)
-  endif()
+endif()
+
+if(TM_USE_SHMEM)
+  message(STATUS "SHMEM needs to be included")
 endif()
 
 if(TM_USE_CUDA AND TM_USE_HIP)
@@ -268,7 +237,6 @@ if(TM_USE_CUDA OR QUDA_TARGET_CUDA)
   endif()
 endif()
 
-message("QUDA_TARGET: ${QUDA_TARGET_CUDA}")
 if(TM_USE_HIP OR QUDA_TARGET_HIP)
   enable_language(hip)
 
@@ -285,20 +253,15 @@ if(TM_USE_HIP OR QUDA_TARGET_HIP)
   endif()
 endif()
 
-if(TM_USE_SHMEM)
-  set(_USE_SHMEM ON)
-endif()
-
 if(TM_USE_QPIHX)
   find_package(QPhiX REQUIRED)
   if(NOT TARGET tmlqcd::qphix)
     add_library(tmlqcd::qphix INTERFACE IMPORTED)
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
-      "${QPHIX_LIBRARIES}")
+                                                   "${QPHIX_LIBRARIES}")
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-      "${QPHIX_INCLUDE_DIRS}")
+                                                   "${QPHIX_INCLUDE_DIRS}")
   endif()
-  set(TM_USE_QPHIX ON)
 endif()
 
 # check for fftw3 (rely on pkgconfig).
@@ -309,57 +272,60 @@ if(TM_USE_FFTW)
   endif()
 endif()
 
+if(TM_USE_DDalphaAMG)
+  find_package(DDAlphaAMG REQUIRED)
+endif()
+
 # gprofiler
 
-if (TM_USE_GPROF)
+if(TM_USE_GPROF)
   set(PROFILE_FLAGS "-pg;-g")
-  if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "powerpc|powerpc64")
+  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "powerpc|powerpc64")
     list(APPEND PROFILE_FLAGS "-qfullpath")
   endif()
   add_compile_options($<BOOL:$<COMPILE_LANGUAGE:C>:$PROFILE_FLAGS>)
 endif()
 
-if (TM_ENABLE_WARNINGS)
-  add_compile_options(
-    $<$<COMPILE_LANG_AND_ID:C,GNU>:-Wall>
-    $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
+if(TM_ENABLE_WARNINGS)
+  add_compile_options($<$<COMPILE_LANG_AND_ID:C,GNU>:-Wall>
+                      $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
 endif()
 
 # check for the presence of clock_gettime in libc or librt
-check_symbol_exists(clock_gettime "time.h" HAVE_CLOCK_GETTIME)
-check_library_exists(rt clock_gettime "" HAVE_CLOCK_GETTIME_IN_RT)
-check_function_exists(fseeko HAVE_FSEEKO)
+check_symbol_exists(clock_gettime "time.h" TM_CLOCK_GETTIME)
+check_library_exists(rt clock_gettime "" TM_CLOCK_GETTIME_IN_RT)
+check_function_exists(fseeko TM_FSEEKO)
 
 # set the parallelization
 
 if(TM_USE_MPI)
   if(TM_MPI_DIMENSION EQUAL "1")
     # T parallelisation
-    set(PARALLELT ON)
+    set(TM_PARALLELT ON)
   elseif(TM_MPI_DIMENSION EQUAL "2")
     # XT parallelisation
-    set(PARALLELXT ON)
+    set(TM_PARALLELXT ON)
   elseif(TM_MPI_DIMENSION EQUAL "3")
-    set(PARALLELXYT ON)
+    set(TM_PARALLELXYT ON)
     # XYZ parallelisation
   elseif(TM_MPI_DIMENSION EQUAL "4")
     # timeslice-splitted communications
-    set(PARALLELXYZT ON)
+    set(TM_PARALLELXYZT ON)
   elseif(TM_MPI_DIMENSION EQUAL "X")
-    set(PARALLELX ON)
+    set(TM_PARALLELX ON)
   elseif(TM_MPI_DIMENSION EQUAL "XY")
-    set(PARALLELXY ON)
+    set(TM_PARALLELXY ON)
   elseif(TM_MPI_DIMENSION EQUAL "XYZ")
-    set(PARALLELXYZ ON)
+    set(TM_PARALLELXYZ ON)
   else()
-    set(PARALLELXYZT ON)
+    set(TM_PARALLELXYZT ON)
   endif()
 endif()
 
 # keep the autotool config.h header.
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
                "${PROJECT_BINARY_DIR}/tmlqcd_config_internal.h" @ONLY)
-configure_file("${PROJECT_SOURCE_DIR}/fixed_volume.h.in"
+configure_file("${PROJECT_SOURCE_DIR}/src/lib/fixed_volume.h.in"
                "${PROJECT_BINARY_DIR}/fixed_volume.h" @ONLY)
 # check if git command exists
 find_program(GIT_EXE NAMES git)
@@ -385,6 +351,6 @@ else()
   )
 endif()
 
-configure_file(cmake/git_hash.h.in git_hash.h @ONLY)
+configure_file(cmake/git_hash.c.in git_hash.c @ONLY)
 add_subdirectory(src/lib)
 add_subdirectory(src/bin)
diff --git a/Makefile.global b/Makefile.global
deleted file mode 100644
index dc1eefcf1..000000000
--- a/Makefile.global
+++ /dev/null
@@ -1,64 +0,0 @@
-# This Makefile is included from the other Makefiles
-# It contains some overall targets...
-
-# refresh Makefile and other stuff
-
-
-
-PROGRAMS_WITH_GIT_HASH := hmc_tm invert offline_measurement test_Dslash deriv_mg_tune
-
-.SUFFIXES:
-
-Makefile: ${top_srcdir}/Makefile.global $(srcdir)/Makefile.in $(abs_top_builddir)/config.status 
-	cd $(abs_top_builddir) \
-	  && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
-
-$(abs_top_builddir)/config.status: $(top_srcdir)/configure
-	( cd ${abs_top_builddir} && $(SHELL) ./config.status --recheck ) 
-
-$(abs_top_builddir)/include/tmlqcd_config_internal.h: $(top_srcdir)/include/tmlqcd_config_internal.h.in $(abs_top_builddir)/config.status $(top_srcdir)/configure
-	( cd ${abs_top_builddir} && $(SHELL) ./config.status --header=include/tmlqcd_config_internal.h )
-
-# rebuild configure if configure.in changes but ignore errors
-# on many machines some of the macros fail to be recognized
-# but the resulting configure still works
-$(top_srcdir)/configure: $(top_srcdir)/configure.in 
-	-( cd $(top_srcdir) && $(AUTOCONF) )
-
-#dep rules
-
-# PROGRAMS_WITH_GIT_HASH require git_hash.h which is dynamically built by a phony make target
-# to prevent too frequent building of git_hash (slowing down the build)
-# we filter the list of all objects and treat these separately
-$(addsuffix .d, $(filter-out ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c Makefile
-	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-$(addsuffix .d, $(filter-out ${PROGRAMS_WITH_GIT_HASH},${CXXMODULES})): %.d: ${srcdir}/%.cpp Makefile
-	@ $(CXXDEP) ${CXXDEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-	
-# dirty hack to prevent make from entering an infinite loop because a phony target is given as a real
-# dependency (make will build invert.d and hmc_tm.d indefinitely)
-# when git_hash.h does not exist (as checked using wildcard) it is given as a dependency of invert.d and hmc_tm.d
-# once it exists, this is no longer the case
-# while this does break updating of git_hash.h while the dependencies are built, this is quite
-# irrelevant because it will be rebuilt during the compilation of either invert or hmc_tm
-ifneq (git_hash.h, $(findstring git_hash.h,$(wildcard $(top_srcdir)/git_has*.h)))
-$(addsuffix .d, $(filter ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c ${top_srcdir}/git_hash.h Makefile
-	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-else
-$(addsuffix .d, $(filter ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c Makefile
-	@ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@
-endif
-
-${top_builddir}/fixed_volume.h: ${top_srcdir}/fixed_volume.h.in ${top_builddir}/config.status
-	cd ${abs_top_builddir} && CONFIG_FILES=fixed_volume.h CONFIG_HEADERS= $(SHELL) ${top_builddir}/config.status
-
-all-recursive all-debug-recursive all-profile-recursive clean-recursive distclean-recursive compile-clean-recursive: Makefile
-	@set fnord ${MAKEFLAGS}; amf=$$2; \
-	dot_seen=no; \
-	target=`echo $@ | sed s/-recursive//`; \
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  echo "Making $$target in $$subdir"; \
-	  local_target="$$target"; \
-	  ( cd $$subdir && $(MAKE) $$local_target ) \
-	    || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
-	done; test -z "$$fail";
diff --git a/Makefile.in b/Makefile.in
deleted file mode 100644
index 51437ff05..000000000
--- a/Makefile.in
+++ /dev/null
@@ -1,167 +0,0 @@
-srcdir = @srcdir@
-top_srcdir = @top_srcdir@
-abs_top_srcdir = @abs_top_srcdir@
-top_builddir =  .
-abs_top_builddir = @abs_top_builddir@
-builddir = @builddir@
-prefix = @prefix@
-exec_prefix = @exec_prefix@
-bindir = @bindir@
-program_transform_name = @program_transform_name@
-subdir = .
-
-AR = @AR@
-RANLIB = @RANLIB@
-CC = @CC@
-CXX = @CXX@
-CCDEP = @CCDEP@
-CXXDEP = @CXXDEP@
-CFLAGS = @CFLAGS@
-CXXFLAGS = @CXXFLAGS@
-LDFLAGS = @LDFLAGS@
-DEPFLAGS = @DEPFLAGS@
-CXXDEPFLAGS = @CXXDEPFLAGS@
-CPPFLAGS = @CPPFLAGS@
-CCLD = @CCLD@
-LEX = @LEX@
-AUTOCONF = @AUTOCONF@
-LIBS = @LIBS@
-SHELL = @SHELL@
-OPTARGS = @OPTARGS@
-SOPTARGS = @SOPTARGS@
-DEFS = @DEFS@
-USESUBDIRS = @USESUBDIRS@
-NVCC = @NVCC@
-GPUMPICOMPILER = @GPUMPICOMPILER@
-
-INCLUDES = @INCLUDES@
-LINK = $(CCLD) -o $@ ${LDFLAGS}
-
-COMPILE = ${CC} ${DEFS} ${INCLUDES} -o $@ ${CFLAGS}
-CXXCOMPILE = ${CXX} ${DEFS} ${INCLUDES} -o $@ ${CXXFLAGS} ${LDFLAGS}
-
-SMODULES = 
-
-MODULES = read_input gamma measure_gauge_action start \
-	expo matrix_utils get_staples update_backward_gauge \
-	measure_rectangles get_rectangle_staples  \
-	test/check_geometry test/check_xchange \
-	test/overlaptests \
-	invert_eo invert_doublet_eo update_gauge \
-	getopt sighandler reweighting_factor \
-	source_generation boundary update_tm ranlxd  \
-	mpi_init deriv_Sb deriv_Sb_D_psi ranlxs \
-	geometry_eo invert_overlap aligned_malloc \
-	prepare_source chebyshev_polynomial_nd Ptilde_nd  \
-	reweighting_factor_nd rnd_gauge_trafo \
-        update_momenta update_momenta_fg integrator  phmc \
-	little_D block operator \
-	spinor_fft \
-	fatal_error invert_clover_eo gettime \
-	tm_debug_printf compare_derivative \
-        @QUDA_INTERFACE@ @DDalphaAMG_INTERFACE@
-
-CXXMODULES = @QPHIX_INTERFACE@
-
-NOOPTMOD = test/check_xchange test/check_geometry
-
-PROGRAMS = hmc_tm benchmark invert gen_sources  \
-	check_locallity test_lemon hopping_test \
-	offline_measurement deriv_mg_tune @QPHIX_PROGRAMS@
-
-ALLOBJ = ${MODULES} ${PROGRAMS} ${SMODULES}
-SUBDIRS = ${USESUBDIRS}
-
-# delete the default suffix rules
-.SUFFIXES:
-
-# need to build modules before subdirs!
-all: Makefile dep $(SUBDIRS) hmc_tm invert benchmark offline_measurement deriv_mg_tune @QPHIX_PROGRAMS@
-
-$(SUBDIRS):
-	$(MAKE) --directory=$@
-
-# run the GIT-VERSION-GEN script to generate version information in git_hash.h
-# making sure that we run in the correct directory
-${top_srcdir}/git_hash.h:
-	@ ( cd @srcdir@ && sh GIT-VERSION-GEN )
-
--include $(addsuffix .d,$(ALLOBJ))
--include $(addsuffix .d,$(CXXMODULES))
-
-include ${top_srcdir}/Makefile.global
-
-# follow https://www.owlfolio.org/possibly-useful/flex-input-scanner-rules-are-too-complicated/
-# and pass the -Ca option such that more than 32k "NFA" states are allowed
-# our ruleset is so complicated that this has become necessary!
-${top_srcdir}/read_input.c: ${top_srcdir}/read_input.l
-ifneq (,$(findstring lex,${LEX}))
-	${LEX} -Ca -Ptmlqcd -i -t ${top_srcdir}/read_input.l > ${top_srcdir}/read_input.c
-else
-	$(error Unable to find (f)lex, read_input.c not built. Please install (f)lex!)
-endif
-
-libhmc.a: ${addsuffix .o, ${MODULES} ${SMODULES}} Makefile
-	@rm -f libhmc.a
-	@${AR} cru libhmc.a ${addsuffix .o, ${MODULES} ${SMODULES}}
-	@$(RANLIB) libhmc.a
-	@cp libhmc.a ${top_builddir}/lib/libhmc.a
-
-$(addsuffix .o,$(filter-out ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${COMPILE} ${OPTARGS} -c $<
-
-#here we don't need optimisation
-$(addsuffix .o,$(filter ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${COMPILE} -c $<
-
-${addsuffix .o, ${SMODULES}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${COMPILE} ${SOPTARGS} -c $<
-
-# C++ modules
-$(addsuffix .o,${CXXMODULES}): %.o: ${srcdir}/%.cpp %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h
-	${CXXCOMPILE} -c $<
-	
-${addsuffix .o, ${PROGRAMS}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/include/tmlqcd_config_internal.h ${top_srcdir}/git_hash.h
-	${COMPILE} ${OPTARGS} -c $<
-
-${PROGRAMS}: %: %.o libhmc.a $(SUBDIRS) $(addsuffix .o,${CXXMODULES})
-	 ${LINK} $@.o $(addsuffix .o,${CXXMODULES}) $(GPUOBJECTS) $(GPUOBJECTS_C) $(LIBS) ${LDFLAGS}
-
-
-# The rules for unit tests are kept in a separate file for tidyness
-include ${top_srcdir}/Makefile.tests
-
-dep: $(addsuffix .d,$(ALLOBJ)) $(addsuffix .d,$(CXXMODULES))
-	@ echo "...dependency files built"
-
-install: Makefile
-	@mkdir -p $(bindir); \
-	for p in hmc_tm invert benchmark offline_measurement deriv_mg_tune; do \
-	  progname=`echo $$p | sed '$(program_transform_name)'`; \
-	  echo "Installing $$p as $$progname in $(bindir)..."; \
-	  cp $$p $(bindir)/$$progname; \
-	done; \
-	echo "done";
-
-uninstall: Makefile
-	for p in hmc_tm invert benchmark offline_measurement deriv_mg_tune; do \
-	  progname=`echo $$p | sed '$(program_transform_name)'`; \
-	  echo "Un-Installing $$progname in $(bindir)..."; \
-	  rm $(bindir)/$$progname; \
-	done; \
-	echo "done";
-
-compile-clean: compile-clean-recursive Makefile
-	rm -f *.o *.d test/*.o test/*.d tests/*.o tests/*.d
-
-clean: clean-recursive Makefile
-	rm -f benchmark hmc_tm invert offline_measurement test_Dslash deriv_mg_tune @QPHIX_PROGRAMS@ *.o *.d test/*.o test/*.d tests/*.o tests/*.d
-
-distclean: distclean-recursive Makefile
-	rm -f benchmark hmc_tm invert offline_measurement *.o *.d *~ Makefile config.log config.status fixed_volume.h
-	rm -f include/tmlqcd_config_internal.h
-
-.PHONY: all ${SUBDIRS} ${top_srcdir}/git_hash.h clean compile-clean distclean dep install \
-	all-recursive all-debug-recursive all-profile-recursive \
-	clean-recursive distclean-recursive \
-	compile-clean-recursive
diff --git a/Makefile.tests b/Makefile.tests
deleted file mode 100644
index a9a393ac6..000000000
--- a/Makefile.tests
+++ /dev/null
@@ -1,64 +0,0 @@
-TESTS = tests/test_sample tests/test_su3 tests/test_buffers tests/test_qpx tests/test_linalg tests/test_clover tests/test_rat
-
-TEMP = $(patsubst %.c,%,$(wildcard $(top_srcdir)/tests/*.c))
-TESTMODULES = $(patsubst $(top_srcdir)/%,%,$(TEMP))
-
-TESTFLAGS = -L$(top_builddir)/cu/ -lcu
-
-$(addsuffix .o,$(TESTMODULES)): %.o : $(top_srcdir)/%.c
-	${COMPILE} -c $(OPTARGS) ${DEFS} $<
-
-# The linking stage needs to be differentiated because different tests rely on
-# different modules from the codebase
-# Each test itself consists of a number of modules that need to be linked.
-
-# when used as a prerequisite, the wildcard with "tests/test_sample*.c" replaced by "$@*.c" is not evaluated
-# correctly, even though it works perfectly in an echo statement, it results in make
-# trying to compile all objects in top_srcdir
-# we therefore evaluate the wildcard into a variable
-
-TEST_SAMPLE_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_sample*.c))
-TEST_SAMPLE_FLAGS:=
-TEST_SAMPLE_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_sample: $(TEST_SAMPLE_OBJECTS) $(TEST_SAMPLE_LIBS)
-	${LINK} $(TEST_SAMPLE_OBJECTS) $(TESTFLAGS) $(TEST_SAMPLE_FLAGS)
-
-TEST_SU3_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_su3*.c)) expo.o
-TEST_SU3_FLAGS:=-lm
-TEST_SU3_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_su3: $(TEST_SU3_OBJECTS) $(TEST_SU3_LIBS)
-	${LINK} $(TEST_SU3_OBJECTS) $(TESTFLAGS) $(TEST_SU3_FLAGS)
-
-TEST_QPX_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_qpx*.c)) 
-TEST_QPX_FLAGS:=-lm
-TEST_QPX_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_qpx: $(TEST_QPX_OBJECTS) $(TEST_QPX_LIBS)
-	${LINK} $(TEST_QPX_OBJECTS) $(TESTFLAGS) $(TEST_QPX_FLAGS)
-
-TEST_LINALG_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_linalg*.c)) 
-TEST_LINALG_FLAGS:=-lm
-TEST_LINALG_LIBS:=$(top_builddir)/cu/libcu.a $(top_builddir)/linalg/liblinalg.a
-tests/test_linalg: $(TEST_LINALG_OBJECTS) $(TEST_LINALG_LIBS)
-	${LINK} $(TEST_LINALG_OBJECTS) $(TEST_LINALG_LIBS) $(TESTFLAGS) $(TEST_LINALG_FLAGS)
-
-TEST_BUFFERS_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_buffers*.c)) fatal_error.o
-TEST_BUFFERS_FLAGS:=-lbuffers -L$(top_builddir)/buffers/
-TEST_BUFFERS_LIBS:=$(top_builddir)/cu/libcu.a $(top_builddir)/buffers/libbuffers.a
-tests/test_buffers: $(TEST_BUFFERS_OBJECTS) $(TEST_BUFFERS_LIBS)
-	${LINK} $(TEST_BUFFERS_OBJECTS) $(TESTFLAGS) $(TEST_BUFFERS_FLAGS)
-
-TEST_CLOVER_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_clover*.c)) operator/clover_leaf.o
-TEST_CLOVER_FLAGS:=-lm -lhmc -llinalg
-TEST_CLOVER_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_clover: $(TEST_CLOVER_OBJECTS) $(TEST_CLOVER_LIBS)
-	${LINK} $(TEST_CLOVER_OBJECTS) $(TESTFLAGS) $(TEST_CLOVER_FLAGS)
-
-TEST_RAT_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_rat*.c)) 
-TEST_RAT_FLAGS:=-lm -lrational
-TEST_RAT_LIBS:=$(top_builddir)/cu/libcu.a
-tests/test_rat: $(TEST_RAT_OBJECTS) $(TEST_RAT_LIBS)
-	${LINK} $(TEST_RAT_OBJECTS) $(TESTFLAGS) $(TEST_RAT_FLAGS)
-
-
-tests: ${TESTS}
-
diff --git a/cmake/FindDDAlphaAMG.cmake b/cmake/FindDDAlphaAMG.cmake
new file mode 100644
index 000000000..f42c943cc
--- /dev/null
+++ b/cmake/FindDDAlphaAMG.cmake
@@ -0,0 +1,29 @@
+# Locate DDalphaAMG and expose it as the imported target tmlqcd::DDalphaAMG.
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TM_DDALPHAAMG_LIBRARIES
+  NAMES DDalphaAMG DDalphaAMG_devel
+  PATH_SUFFIXES "lib" "lib64")
+
+# NOTE(review): _package_name is not set in this module; confirm the caller defines it.
+find_path(
+  TM_DDALPHAAMG_INCLUDE_DIRS
+  NAMES DDalphaAMG.h
+  PATH_SUFFIXES "include" "include/${_package_name}" "${_package_name}")
+
+# Validate the cache variables actually filled in by the find_* calls above.
+find_package_handle_standard_args(
+  DDAlphaAMG DEFAULT_MSG TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
+
+if(DDAlphaAMG_FOUND AND NOT TARGET tmlqcd::DDalphaAMG)
+  add_library(tmlqcd::DDalphaAMG INTERFACE IMPORTED)
+  set_target_properties(
+    tmlqcd::DDalphaAMG
+    PROPERTIES INTERFACE_LINK_LIBRARIES "${TM_DDALPHAAMG_LIBRARIES}"
+               INTERFACE_INCLUDE_DIRECTORIES "${TM_DDALPHAAMG_INCLUDE_DIRS}")
+endif()
+
+# Legacy result flag: mirror the real outcome instead of forcing it to ON.
+set(TMLQCD_DDALPHAAMG_FOUND ${DDAlphaAMG_FOUND})
+mark_as_advanced(TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
diff --git a/cmake/git_hash.h.in b/cmake/git_hash.c.in
similarity index 62%
rename from cmake/git_hash.h.in
rename to cmake/git_hash.c.in
index 23f624742..912085abb 100644
--- a/cmake/git_hash.h.in
+++ b/cmake/git_hash.c.in
@@ -1,6 +1,6 @@
 #ifndef _GIT_HASH_H
 #define _GIT_HASH_H
 
-const char git_hash[] = "@TMLQCD_SHA@";
+const char git_hash[] = "@TM_SHA@";
 
 #endif /* _GIT_HASH_H */
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 5dd9c7096..2765a2b7c 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -3,20 +3,17 @@
  * into static const variables, following the convention used by the USQCD build
  * systems, for example. */
 
-/* We are on a CRAY */
-#cmakedefine CRAY
-
 /* lapack available */
-#cmakedefine HAVE_LAPACK 
+#cmakedefine TM_LAPACK 
 
 /* Define to 1 if you have the `lime' library (-llime). */
-#cmakedefine HAVE_LIBLIME 
+#cmakedefine TM_USE_LIME 
 
 /* Define to 1 if you have the `lemon' library (-llemon). */
-#cmakedefine HAVE_LIBLEMON 
+#cmakedefine TM_USE_LEMON 
 
 /* 1 if clock_gettime is available for use in benchmark */
-#cmakedefine HAVE_CLOCK_GETTIME 
+#cmakedefine TM_CLOCK_GETTIME 
 
 /* Compile with MPI support */
 #cmakedefine TM_USE_MPI
@@ -25,7 +22,7 @@
 #cmakedefine TM_USE_OMP
 
 /* Compile with FFTW support */
-#cmakedefine HAVE_FFTW 
+#cmakedefine TM_USE_FFTW 
 
 /* Fortran has not extra _ */
 #cmakedefine NOF77_
@@ -45,31 +42,31 @@
 #define PACKAGE_VERSION "@PROJECT_DESCRIPTION@ @PROJECT_VERSION@"
 
 /* X parallelisation */
-#cmakedefine PARALLELX 
+#cmakedefine TM_PARALLELX 
 
 /* XY parallelisation */
-#cmakedefine PARALLELXY 
+#cmakedefine TM_PARALLELXY 
 
 /* XYZ parallelisation */
-#cmakedefine PARALLELXYZ
+#cmakedefine TM_PARALLELXYZ
 
 /* One dimensional parallelisation */
-#cmakedefine PARALLELT
+#cmakedefine TM_PARALLELT
 
 /* Two dimensional parallelisation */
-#cmakedefine PARALLELXT
+#cmakedefine TM_PARALLELXT
 
 /* Three dimensional parallelisation */
-#cmakedefine PARALLELXYT
+#cmakedefine TM_PARALLELXYT
 
 /* Four dimensional parallelisation */
-#cmakedefine PARALLELXYZT
+#cmakedefine TM_PARALLELXYZT
 
 /* Fixed volume at compiletime */
-#cmakedefine FIXEDVOLUME
+#cmakedefine TM_FIXEDVOLUME
 
 /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
-#cmakedefine HAVE_FSEEKO
+#cmakedefine TM_FSEEKO
 
 /* Alignment for arrays -- necessary for SSE and automated vectorization */
 #define ALIGN_BASE @ALIGN_BASE@
@@ -88,40 +85,37 @@
 #cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#cmakedefine _FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
+#cmakedefine TM_FILE_OFFSET_BITS @TM_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
-#cmakedefine _GAUGE_COPY
+#cmakedefine TM_USE_GAUGECOPY
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
-#cmakedefine _LARGEFILE_SOURCE
+#cmakedefine TM_LARGEFILE_SOURCE
 
 /* Define for large files, on AIX-style hosts. */
-#cmakedefine _LARGE_FILES 
+#cmakedefine TM_LARGE_FILES 
 
 /* Use even/odd geometry in the gauge fields */
-#cmakedefine _NEW_GEOMETRY
+#cmakedefine TM_NEW_GEOMETRY
 
 /* x86 64 Bit architecture */
-#cmakedefine _x86_64
+#cmakedefine TM_x86_64
 
 /* Define to 1 if Dirac operator with halfspinor should be used */
-#cmakedefine _USE_HALFSPINOR 
+#cmakedefine TM_USE_HALFSPINOR 
 
 /* Define to 1 if shmem API should be used */
-#cmakedefine _USE_SHMEM
+#cmakedefine TM_USE_SHMEM
 
 /* Define to 1 if KOJAK instrumentalisation should be done*/
-#cmakedefine _KOJAK_INST
+#cmakedefine TM_KOJAK_INST
 
 /* Define to 1 if persistent MPI calls for halfspinor should be used */
-#cmakedefine _PERSISTENT
+#cmakedefine TM_PERSISTENT
 
 /* Define to 1 if non-blocking MPI calls for spinor and gauge should be used */
-#cmakedefine _NON_BLOCKING
-
-/* Define to 1 if you have the `quda' library (-lquda). */
-#cmakedefine HAVE_LIBQUDA
+#cmakedefine TM_NONBLOCKING
 
 /* Using QUDA GPU */
 #cmakedefine TM_USE_QUDA 
@@ -133,7 +127,7 @@
 #cmakedefine TM_QUDA_FERMIONIC_FORCES
 
 /* Using DDalphaAMG */
-#cmakedefine DDalphaAMG
+#cmakedefine TM_USE_DDalphaAMG
 
 /* Using QPHIX */
 #cmakedefine TM_USE_QPHIX 
diff --git a/cmake_includes.txt b/cmake_includes.txt
deleted file mode 100644
index b8e105cc0..000000000
--- a/cmake_includes.txt
+++ /dev/null
@@ -1,425 +0,0 @@
-LIST(APPEND IO_SRC_C io_srcio/utils_write_inverter_info.c
-io/gauge_read.c
-io/utils_write_xlf.c
-io/utils_construct_reader.c
-io/params_construct_xlfInfo.c
-io/utils_kill_with_error.c
-io/DML_crc32.c
-io/spinor_write_source_format.c
-io/deri_write_stdout.c
-io/spinor_write_propagator_format.c
-io/utils_engineering.c
-io/utils_parse_propagator_type.c
-io/io_cm.c
-io/utils_parse_ildgformat_xml.c
-io/utils_read_message.c
-io/utils_write_ildg_format.c
-io/utils_destruct_writer.c
-io/gauge_write.c
-io/utils_write_message.c
-io/params_construct_ildgFormat.c
-io/spinor_read.c
-io/utils_close_reader_record.c
-io/spinor_read_binary.c
-io/utils.c
-io/spinor_write_stdout.c
-io/spinor_write_info.c
-io/utils_write_checksum.c
-io/utils_write_header.c
-io/eospinor_read.c
-io/utils_write_first_message.c
-io/params_construct_InverterInfo.c
-io/utils_parse_checksum_xml.c
-io/utils_construct_writer.c
-io/sw_write_stdout.c
-io/spinor_write_propagator_type.c
-io/gauge_write_binary.c
-io/spinor_write.c
-io/utils_write_xlf_xml.c
-io/params_construct_propagatorFormat.c
-io/gauge_read_binary.c
-io/dml.c
-io/spinor_write_binary.c
-io/utils_destruct_reader.c
-io/utils_close_writer_record.c
-io/eospinor_write.c
-io/gauge_write_luscher_binary.c
-io/params_construct_sourceFormat.c)
-
-list(APPEND INIT_SRC_C init/init_dirac_halfspinor.c
-     init/init_geometry_indices.c
-     init/init_openmp.c
-     init/init_gauge_field.c
-     init/init_parallel.c
-     init/init_chi_spinor_field.c
-     init/init_gauge_fg.c
-     init/init_spinor_field.c
-     init/init_global_states.c
-     init/init_bispinor_field.c
-     init/init_gauge_tmp.c
-     init/init_critical_globals.c
-     init/init_omp_accumulators.c
-     init/init_jacobi_field.c
-     init/init_stout_smear_vars.c
-     init/init_moment_field.c)
-
-list(APPEND SOLVER_SRC_C
-solver/bicg_complex.c
-solver/dfl_projector.c
-solver/eigenvalues_Jacobi.c
-solver/gcr.c
-solver/gmres_precon.c
-solver/chrono_guess.c
-solver/gcr4complex.c
-solver/jdher.c
-solver/gcr4complex_body.c
-solver/gmres_dr.c
-solver/fgmres4complex_body.c
-solver/cg_her_bi.c
-solver/solver_field.c
-solver/quicksort.c
-solver/bicgstab2.c
-solver/cgs_real.c
-solver/M_plus_block_psi_body.c
-solver/little_mg_precon_body.c
-solver/cg_her_su3vect.c
-solver/little_project_eo_body.c
-solver/monomial_solve.c
-solver/cr.c
-solver/gram-schmidt.c
-solver/solver_types.c
-solver/mode_number.c
-solver/cg_her.c
-solver/jdher_bi.c
-solver/mrblk_body.c
-solver/eigcg.c
-solver/jdher_su3vect.c
-solver/poly_precon.c
-solver/Msap.c
-solver/fgmres.c
-solver/dirac_operator_eigenvectors.c
-solver/incr_eigcg.c
-solver/index_jd.c
-solver/sumr.c
-solver/cgne4complex.c
-solver/eigenvalues_bi.c
-solver/gmres.c
-solver/lu_solve.c
-solver/diagonalise_general_matrix.c
-solver/mcr.c
-solver/bicgstabell.c
-solver/rg_mixed_cg_her.c
-solver/mixed_cg_her.c
-solver/mixed_cg_mms_tm_nd.c
-solver/rg_mixed_cg_her_nd.c
-solver/spectral_proj.c
-solver/restart_X.c
-solver/generate_dfl_subspace.c
-solver/eigenvalues.c
-solver/mcr4complex.c
-solver/mr4complex.c
-solver/bicgstab_complex.c
-solver/cg_mms_tm_nd.c
-solver/mr.c
-solver/cg_her_nd.c
-solver/bicgstab_complex_bi.c
-solver/sub_low_ev.c
-solver/ortho.c
-solver/pcg_her.c
-solver/fgmres4complex.c
-solver/cg_mms_tm.c
-solver/init_guess.c)
-
-list(APPEND LINALG_SRC_C linalg/assign_mul_bra_add_mul_r.c
-     linalg/mul_r_gamma5.c
-     linalg/convert_eo_to_lexic.c
-     linalg/print_spinor.c
-     linalg/assign_add_mul_body.c
-     linalg/mul_diff_mul_r.c
-     linalg/square_norm_32.c
-     linalg/mul.c
-     linalg/mul_r.c
-     linalg/mul_gamma5.c
-     linalg/ratio.c
-     linalg/square_norm.c
-     linalg/mul_diff_mul.c
-     linalg/square_and_minmax.c
-     linalg/add.c
-     linalg/assign_add_mul_add_mul_r.c
-     linalg/comp_decomp.c
-     linalg/mul_add_mul.c
-     linalg/diff_32.c
-     linalg/assign_add_mul.c
-     linalg/addto_32.c
-     linalg/assign_mul_add_mul_add_mul_add_mul_r.c
-     linalg/assign_add_mul_r.c
-     linalg/diff.c
-     linalg/assign_mul_add_mul_r.c
-     linalg/scalar_prod_r.c
-     linalg/assign_to_32.c
-     linalg/assign_add_mul_add_mul.c
-     linalg/mul_diff_r.c
-     linalg/assign_mul_add_r_and_square.c
-     linalg/assign_mul_add_mul_r_32.c
-     linalg/assign_mul_add_mul.c
-     linalg/assign_mul_add_mul_add_mul_r.c
-     linalg/scalar_prod_r_32.c
-     linalg/assign_mul_add_r.c
-     linalg/assign_mul_add_r_32.c
-     linalg/scalar_prod_su3spinor.c
-     linalg/convert_even_to_lexic.c
-     linalg/mul_r_32.c
-     linalg/assign_add_mul_r_add_mul.c
-     linalg/convert_odd_to_lexic.c
-     linalg/diff_and_square_norm.c
-     linalg/scalar_prod_i.c
-     linalg/mul_add_mul_r.c
-     linalg/assign_diff_mul.c
-     linalg/assign_mul_bra_add_mul_ket_add_r.c
-     linalg/set_even_to_zero.c
-     linalg/assign_mul_add.c
-     linalg/square_and_prod_r.c
-     linalg/scalar_prod_body.c
-     linalg/assign_mul_bra_add_mul_ket_add.c
-     linalg/assign_add_mul_r_32.c
-     linalg/scalar_prod.c
-     linalg/mattimesvec.c
-     linalg/assign.c
-     linalg/print_spinor_similar_components.c)
-
-list(APPEND RATIONAL_SRC_C rational/zolotarev.c
-     rational/rational.c
-     rational/elliptic.c)
-
-list(APPEND OPERATOR_SRC_C operator/clover_invert.c
-     operator/hopping_body_dbl.c
-     operator/tm_operators_nd_32.c
-     operator/hopping_sse_dbl.c
-     operator/halfspinor_body.c
-     operator/Block_D_psi_body.c
-     operator/mul_one_pm_imu_sub_mul_body.c
-     operator/assign_mul_one_sw_pm_imu_site_lexic_body.c
-     operator/assign_mul_one_sw_pm_imu_inv_block_body.c
-     operator/clover_accumulate_deriv.c
-     operator/Hopping_Matrix.c
-     operator/hopping_bg_dbl.c
-     operator/tm_operators.c
-     operator/tm_times_Hopping_Matrix.c
-     operator/clovertm_operators_32.c
-     operator/hopping_sgl.c
-     operator/Dov_proj.c
-     operator/clover_deriv.c
-     operator/halfspinor_bg_dbl.c
-     operator/clover_det.c
-     operator/clover_leaf.c
-     operator/D_psi_body.c
-     operator/clovertm_operators.c
-     operator/hopping_sse_sgl.c
-     operator/halfspinor_sse_dbl.c
-     operator/Dov_psi.c
-     operator/tm_operators_nd.c
-     operator/tm_sub_Hopping_Matrix.c
-     operator/Hopping_Matrix_nocom.c
-     operator/clover_term.c
-     operator/halfspinor_bgq_dbl.c
-     operator/Hopping_Matrix_32_nocom.c
-     operator/D_psi.c
-     operator/tm_operators_32.c
-     operator/Hopping_Matrix_32.c
-     operator/halfspinor_body_32.c
-     operator/mul_one_pm_imu_inv_body.c)
-
-list(APPEND SMEARING_SRC_C smearing/hex_stout_exclude_two.c
-     smearing/hex_hex_smear.c
-     smearing/utils_print_su3.c
-     smearing/hyp_APE_project_exclude_none.c
-     smearing/hyp_hyp_staples_exclude_one.c
-     smearing/hyp_APE_project_exclude_one.c
-     smearing/hex_stout_exclude_one.c
-     smearing/hyp_hyp_staples_exclude_two.c
-     smearing/hex_stout_exclude_none.c
-     smearing/stout_stout_smear.c
-     smearing/hyp_hyp_smear.c
-     smearing/hyp_APE_project_exclude_two.c
-     smearing/utils_project_herm.c
-     smearing/utils_reunitarize.c
-     smearing/utils_generic_staples.c
-     smearing/hyp_hyp_staples_exclude_none.c
-     smearing/ape_ape_smear.c
-     smearing/uils_print_config_to_screen.c
-     smearing/utils_project_antiherm.c
-     smearing/utils_print_config_to_screen.c
-     smearing/utils_reunitarize_MILC.c)
-
-list(APPEND BUFFER_SRC_C
-     buffers/gauge_return_gauge_field.c
-     buffers/gauge_get_gauge_field.c
-     buffers/gauge_finalize_gauge_buffers.c
-     buffers/gauge_initialize_gauge_buffers.c
-     buffers/gauge.c
-     buffers/gauge_free_unused_gauge_buffers.c
-     buffers/gauge_get_gauge_field_array.c
-     buffers/utils_generic_exchange.c
-     buffers/gauge_allocate_gauge_buffers.c
-     buffers/gauge_return_gauge_field_array.c)
-
-list(APPEND MONOMIAL_SRC_C
-     monomial/detratio_monomial.c
-     monomial/sf_gauge_monomial.c
-     monomial/poly_monomial.c
-     monomial/cloverdetratio_monomial.c
-     monomial/ndrat_monomial.c
-     monomial/cloverdet_monomial.c
-     monomial/clover_trlog_monomial.c
-     monomial/cloverndpoly_monomial.c
-     monomial/monitor_forces.c
-     monomial/ndpoly_monomial.c
-     monomial/det_monomial.c
-     monomial/monomial.c
-     monomial/cloverdetratio_rwmonomial.c
-     monomial/gauge_monomial.c
-     monomial/clovernd_trlog_monomial.c
-     monomial/ratcor_monomial.c
-     monomial/nddetratio_monomial.c
-     monomial/rat_monomial.c
-     monomial/ndratcor_monomial.c
-     monomial/moment_energy.c)
-
-list(APPEND EXCHANGE_SRC_C xchange/xchange_lexicfield.c
-xchange/xchange_2fields.c
-xchange/xchange_gauge.c
-xchange/xchange_halffield.c
-xchange/xchange_jacobi.c
-xchange/little_field_gather_body.c
-xchange/little_field_gather.c
-xchange/xchange_deri.c
-xchange/xchange_field.c
-xchange/xchange_field_tslice.c)
-
-list(APPEND MEAS_SRC_C
-meas/pion_norm.c
-meas/correlators.c
-meas/polyakov_loop.c
-meas/measurements.c
-meas/oriented_plaquettes.c
-meas/gradient_flow.c
-meas/measure_clover_field_strength_observables.c)
-
-list(APPEND SF_SRC_C sf/sf_calc_action.c
-     sf/sf_get_rectangle_staples.c
-     sf/sf_get_staples.c
-     sf/sf_observables.c
-     sf/sf_utils.c
-     )
-
-list(APPEND MAIN_SRC_C
-measure_gauge_action.c
-start.c
-deriv_Sb.c
-reweighting_factor_nd.c
-ranlxs.c
-source_generation.c
-read_input.c
-invert_doublet_eo.c
-geometry_eo.c
-getopt.c
-offline_measurement.c
-tm_debug_printf.c
-chebyshev_polynomial_nd.c
-invert_eo.c
-little_D.c
-get_rectangle_staples.c
-gen_sources.c
-rnd_gauge_trafo.c
-test_lemon.c
-LapH_ev.c
-benchmark.c
-measure_rectangles.c
-check_locallity.c
-invert.c
-deriv_Sb_D_psi.c
-deriv_mg_tune.c
-mpi_init.c
-update_momenta_fg.c
-gamma.c
-matrix_utils.c
-reweighting_factor.c
-update_tm.c
-jacobi.c
-invert_overlap.c
-phmc.c
-get_staples.c
-clenshaw_coef.c
-block.c
-spinor_fft.c
-boundary.c
-little_D_body.c
-X_psi.c
-prepare_source.c
-DDalphaAMG_interface.c
-update_backward_gauge.c
-invert_clover_eo.c
-gettime.c
-hmc_tm.c
-update_momenta.c
-sighandler.c
-compare_derivative.c
-ranlxd.c
-DirectPut.c
-aligned_malloc.c
-fatal_error.c
-operator.c
-cu/cu.c
-chebyshev_polynomial.c
-qphix_test_Dslash.c
-expo.c
-overrelaxation.c
-Ptilde_nd.c
-update_gauge.c
-hopping_test.c
-integrator.c
-P_M_eta.c)
-
-if (TMLQCD_USE_QPHIX)
-list(APPEND MAIN_SRC_C qphix_interface.cpp)
-endif()
-
-if (TMLQCD_USE_QUDA)
-list(APPEND MAIN_SRC_C quda_interface.c)
-endif()
-
-list(APPEND ALL_SRC ${MAIN_SRC_C} ${SF_SRC_C} ${XCHANGE_SRC_C} ${MONOMIAL_SRC_C} ${BUFFER_SRC_C} ${SMEARING_SRC_C} ${OPERATOR_SRC_C} ${RATIONAL_SRC_C} ${LINALG_SRC_C} ${IO_SRC_C} ${INIT_SRC_C} ${SOLVER_SRC_C})
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
-# cmake 4.0 uses a different syntax for the option
-flex_target(tmlqcd_input_read input_read.l input_read.c
-            $<$<VERSION_LESS:${CMAKE_MAJOR_VERSION},4>:COMPILE_FLAGS "-Ca -Ptmlqcd">
-            $<$<VERSION_GREATER_EQUAL:${CMAKE_MAJOR_VERSION},4>:OPTIONS "-Ca;-Ptmlqcd">)
-
-# create a target library with namespacing because cmake does not know name space at all
-add_library(tmlqcd::hmc ALL_SRC ${FLEX_tmlqcd_input_read_OUTPUTS})
-set_target_properties(tmlqcd::hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
-
-# define a library and add the dependencies
-target_link_libraries(tmlqcd::hmc
-                      $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
-                      $<$<BOOL:${TMLQCD_USE_LIME}>:tmlqcd::lime>
-                      $<$<BOOL:${TMLQCD_USE_LEMON}>:tmlqcd::lemon>
-                      $<$<BOOL:${TMLQCD_USE_QPHIX}>:tmlqcd::qphix>
-                      $<$<BOOL:${TMLQCD_USE_FFTW}>:tmlqcd::fftw3>
-                      $<$<BOOL:${TMLQCD_USE_MPI}>:MPI::MPI_C MPI::MPI_CXX>
-                      $<$<BOOL:${TMLQCD_USE_QUDA}>:quda::quda>
-                      $<$<BOOL:${TMLQCD_USE_CUDA}>:CUDA::cufft CUDA::cufftw CUDA::cublas CUDA::cudart CUDA::cuda_driver>
-                      $<$<BOOL:${TMLQCD_USE_HIP}>:hip::hipfft roc::hipblas hip::host>
-                      ${LAPACK_LIBRARIES}
-                      ${BLAS_LIBRARIES}
-                      $<$<BOOL:${TMLQCD_USE_OPENMP}>:OpenMP::OpenMP_C OpenMP::OpenMP_CXX>
-                      m)
-
-target_compile_definitions(tmlqcd::hmc
-                           $<$<BOOL:${TMLQCD_USE_HIP}>:${TMLQCD_GPU_PLATFORM_DFLAGS}>
-                           )
-
-target_include_directories(tmlqcd::hmc PUBLIC $<INSTALL_INTERFACE:include>
-                           PRIVATE "init io linalg meas monomial operator profiling rational sf smearing solver util xchange wrapper")
diff --git a/config.guess b/config.guess
deleted file mode 100644
index f7727026b..000000000
--- a/config.guess
+++ /dev/null
@@ -1,1701 +0,0 @@
-#! /bin/sh
-# Attempt to guess a canonical system name.
-#   Copyright 1992-2021 Free Software Foundation, Inc.
-
-timestamp='2021-01-01'
-
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <https://www.gnu.org/licenses/>.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
-#
-# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
-#
-# You can get the latest version of this script from:
-# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
-#
-# Please send patches to <config-patches@gnu.org>.
-
-
-me=$(echo "$0" | sed -e 's,.*/,,')
-
-usage="\
-Usage: $0 [OPTION]
-
-Output the configuration name of the system \`$me' is run on.
-
-Options:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.guess ($timestamp)
-
-Originally written by Per Bothner.
-Copyright 1992-2021 Free Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help" >&2
-       exit 1 ;;
-    * )
-       break ;;
-  esac
-done
-
-if test $# != 0; then
-  echo "$me: too many arguments$help" >&2
-  exit 1
-fi
-
-# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
-# compiler to aid in system detection is discouraged as it requires
-# temporary files to be created and, as you can see below, it is a
-# headache to deal with in a portable fashion.
-
-# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
-# use `HOST_CC' if defined, but it is deprecated.
-
-# Portable tmp directory creation inspired by the Autoconf team.
-
-tmp=
-# shellcheck disable=SC2172
-trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15
-
-set_cc_for_build() {
-    # prevent multiple calls if $tmp is already set
-    test "$tmp" && return 0
-    : "${TMPDIR=/tmp}"
-    # shellcheck disable=SC2039
-    { tmp=$( (umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null) && test -n "$tmp" && test -d "$tmp" ; } ||
-	{ test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } ||
-	{ tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } ||
-	{ echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; }
-    dummy=$tmp/dummy
-    case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in
-	,,)    echo "int x;" > "$dummy.c"
-	       for driver in cc gcc c89 c99 ; do
-		   if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then
-		       CC_FOR_BUILD="$driver"
-		       break
-		   fi
-	       done
-	       if test x"$CC_FOR_BUILD" = x ; then
-		   CC_FOR_BUILD=no_compiler_found
-	       fi
-	       ;;
-	,,*)   CC_FOR_BUILD=$CC ;;
-	,*,*)  CC_FOR_BUILD=$HOST_CC ;;
-    esac
-}
-
-# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 1994-08-24)
-if test -f /.attbin/uname ; then
-	PATH=$PATH:/.attbin ; export PATH
-fi
-
-UNAME_MACHINE=$( (uname -m) 2>/dev/null) || UNAME_MACHINE=unknown
-UNAME_RELEASE=$( (uname -r) 2>/dev/null) || UNAME_RELEASE=unknown
-UNAME_SYSTEM=$( (uname -s) 2>/dev/null) || UNAME_SYSTEM=unknown
-UNAME_VERSION=$( (uname -v) 2>/dev/null) || UNAME_VERSION=unknown
-
-case "$UNAME_SYSTEM" in
-Linux|GNU|GNU/*)
-	LIBC=unknown
-
-	set_cc_for_build
-	cat <<-EOF > "$dummy.c"
-	#include <features.h>
-	#if defined(__UCLIBC__)
-	LIBC=uclibc
-	#elif defined(__dietlibc__)
-	LIBC=dietlibc
-	#elif defined(__GLIBC__)
-	LIBC=gnu
-	#else
-	#include <stdarg.h>
-	/* First heuristic to detect musl libc.  */
-	#ifdef __DEFINED_va_list
-	LIBC=musl
-	#endif
-	#endif
-	EOF
-	eval "$($CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g')"
-
-	# Second heuristic to detect musl libc.
-	if [ "$LIBC" = unknown ] &&
-	   command -v ldd >/dev/null &&
-	   ldd --version 2>&1 | grep -q ^musl; then
-		LIBC=musl
-	fi
-
-	# If the system lacks a compiler, then just pick glibc.
-	# We could probably try harder.
-	if [ "$LIBC" = unknown ]; then
-		LIBC=gnu
-	fi
-	;;
-esac
-
-# Note: order is significant - the case branches are not exclusive.
-
-case "$UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION" in
-    *:NetBSD:*:*)
-	# NetBSD (nbsd) targets should (where applicable) match one or
-	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
-	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
-	# switched to ELF, *-*-netbsd* would select the old
-	# object file format.  This provides both forward
-	# compatibility and a consistent mechanism for selecting the
-	# object file format.
-	#
-	# Note: NetBSD doesn't particularly care about the vendor
-	# portion of the name.  We always set it to "unknown".
-	sysctl="sysctl -n hw.machine_arch"
-	UNAME_MACHINE_ARCH=$( (uname -p 2>/dev/null || \
-	    "/sbin/$sysctl" 2>/dev/null || \
-	    "/usr/sbin/$sysctl" 2>/dev/null || \
-	    echo unknown))
-	case "$UNAME_MACHINE_ARCH" in
-	    aarch64eb) machine=aarch64_be-unknown ;;
-	    armeb) machine=armeb-unknown ;;
-	    arm*) machine=arm-unknown ;;
-	    sh3el) machine=shl-unknown ;;
-	    sh3eb) machine=sh-unknown ;;
-	    sh5el) machine=sh5le-unknown ;;
-	    earmv*)
-		arch=$(echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,')
-		endian=$(echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p')
-		machine="${arch}${endian}"-unknown
-		;;
-	    *) machine="$UNAME_MACHINE_ARCH"-unknown ;;
-	esac
-	# The Operating System including object format, if it has switched
-	# to ELF recently (or will in the future) and ABI.
-	case "$UNAME_MACHINE_ARCH" in
-	    earm*)
-		os=netbsdelf
-		;;
-	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
-		set_cc_for_build
-		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
-			| grep -q __ELF__
-		then
-		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
-		    # Return netbsd for either.  FIX?
-		    os=netbsd
-		else
-		    os=netbsdelf
-		fi
-		;;
-	    *)
-		os=netbsd
-		;;
-	esac
-	# Determine ABI tags.
-	case "$UNAME_MACHINE_ARCH" in
-	    earm*)
-		expr='s/^earmv[0-9]/-eabi/;s/eb$//'
-		abi=$(echo "$UNAME_MACHINE_ARCH" | sed -e "$expr")
-		;;
-	esac
-	# The OS release
-	# Debian GNU/NetBSD machines have a different userland, and
-	# thus, need a distinct triplet. However, they do not need
-	# kernel version information, so it can be replaced with a
-	# suitable tag, in the style of linux-gnu.
-	case "$UNAME_VERSION" in
-	    Debian*)
-		release='-gnu'
-		;;
-	    *)
-		release=$(echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2)
-		;;
-	esac
-	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
-	# contains redundant information, the shorter form:
-	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-	echo "$machine-${os}${release}${abi-}"
-	exit ;;
-    *:Bitrig:*:*)
-	UNAME_MACHINE_ARCH=$(arch | sed 's/Bitrig.//')
-	echo "$UNAME_MACHINE_ARCH"-unknown-bitrig"$UNAME_RELEASE"
-	exit ;;
-    *:OpenBSD:*:*)
-	UNAME_MACHINE_ARCH=$(arch | sed 's/OpenBSD.//')
-	echo "$UNAME_MACHINE_ARCH"-unknown-openbsd"$UNAME_RELEASE"
-	exit ;;
-    *:LibertyBSD:*:*)
-	UNAME_MACHINE_ARCH=$(arch | sed 's/^.*BSD\.//')
-	echo "$UNAME_MACHINE_ARCH"-unknown-libertybsd"$UNAME_RELEASE"
-	exit ;;
-    *:MidnightBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-midnightbsd"$UNAME_RELEASE"
-	exit ;;
-    *:ekkoBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-ekkobsd"$UNAME_RELEASE"
-	exit ;;
-    *:SolidBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-solidbsd"$UNAME_RELEASE"
-	exit ;;
-    *:OS108:*:*)
-	echo "$UNAME_MACHINE"-unknown-os108_"$UNAME_RELEASE"
-	exit ;;
-    macppc:MirBSD:*:*)
-	echo powerpc-unknown-mirbsd"$UNAME_RELEASE"
-	exit ;;
-    *:MirBSD:*:*)
-	echo "$UNAME_MACHINE"-unknown-mirbsd"$UNAME_RELEASE"
-	exit ;;
-    *:Sortix:*:*)
-	echo "$UNAME_MACHINE"-unknown-sortix
-	exit ;;
-    *:Twizzler:*:*)
-	echo "$UNAME_MACHINE"-unknown-twizzler
-	exit ;;
-    *:Redox:*:*)
-	echo "$UNAME_MACHINE"-unknown-redox
-	exit ;;
-    mips:OSF1:*.*)
-	echo mips-dec-osf1
-	exit ;;
-    alpha:OSF1:*:*)
-	case $UNAME_RELEASE in
-	*4.0)
-		UNAME_RELEASE=$(/usr/sbin/sizer -v | awk '{print $3}')
-		;;
-	*5.*)
-		UNAME_RELEASE=$(/usr/sbin/sizer -v | awk '{print $4}')
-		;;
-	esac
-	# According to Compaq, /usr/sbin/psrinfo has been available on
-	# OSF/1 and Tru64 systems produced since 1995.  I hope that
-	# covers most systems running today.  This code pipes the CPU
-	# types through head -n 1, so we only detect the type of CPU 0.
-	ALPHA_CPU_TYPE=$(/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1)
-	case "$ALPHA_CPU_TYPE" in
-	    "EV4 (21064)")
-		UNAME_MACHINE=alpha ;;
-	    "EV4.5 (21064)")
-		UNAME_MACHINE=alpha ;;
-	    "LCA4 (21066/21068)")
-		UNAME_MACHINE=alpha ;;
-	    "EV5 (21164)")
-		UNAME_MACHINE=alphaev5 ;;
-	    "EV5.6 (21164A)")
-		UNAME_MACHINE=alphaev56 ;;
-	    "EV5.6 (21164PC)")
-		UNAME_MACHINE=alphapca56 ;;
-	    "EV5.7 (21164PC)")
-		UNAME_MACHINE=alphapca57 ;;
-	    "EV6 (21264)")
-		UNAME_MACHINE=alphaev6 ;;
-	    "EV6.7 (21264A)")
-		UNAME_MACHINE=alphaev67 ;;
-	    "EV6.8CB (21264C)")
-		UNAME_MACHINE=alphaev68 ;;
-	    "EV6.8AL (21264B)")
-		UNAME_MACHINE=alphaev68 ;;
-	    "EV6.8CX (21264D)")
-		UNAME_MACHINE=alphaev68 ;;
-	    "EV6.9A (21264/EV69A)")
-		UNAME_MACHINE=alphaev69 ;;
-	    "EV7 (21364)")
-		UNAME_MACHINE=alphaev7 ;;
-	    "EV7.9 (21364A)")
-		UNAME_MACHINE=alphaev79 ;;
-	esac
-	# A Pn.n version is a patched version.
-	# A Vn.n version is a released version.
-	# A Tn.n version is a released field test version.
-	# A Xn.n version is an unreleased experimental baselevel.
-	# 1.2 uses "1.2" for uname -r.
-	echo "$UNAME_MACHINE"-dec-osf"$(echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz)"
-	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
-	exitcode=$?
-	trap '' 0
-	exit $exitcode ;;
-    Amiga*:UNIX_System_V:4.0:*)
-	echo m68k-unknown-sysv4
-	exit ;;
-    *:[Aa]miga[Oo][Ss]:*:*)
-	echo "$UNAME_MACHINE"-unknown-amigaos
-	exit ;;
-    *:[Mm]orph[Oo][Ss]:*:*)
-	echo "$UNAME_MACHINE"-unknown-morphos
-	exit ;;
-    *:OS/390:*:*)
-	echo i370-ibm-openedition
-	exit ;;
-    *:z/VM:*:*)
-	echo s390-ibm-zvmoe
-	exit ;;
-    *:OS400:*:*)
-	echo powerpc-ibm-os400
-	exit ;;
-    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
-	echo arm-acorn-riscix"$UNAME_RELEASE"
-	exit ;;
-    arm*:riscos:*:*|arm*:RISCOS:*:*)
-	echo arm-unknown-riscos
-	exit ;;
-    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
-	echo hppa1.1-hitachi-hiuxmpp
-	exit ;;
-    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
-	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
-	if test "$( (/bin/universe) 2>/dev/null)" = att ; then
-		echo pyramid-pyramid-sysv3
-	else
-		echo pyramid-pyramid-bsd
-	fi
-	exit ;;
-    NILE*:*:*:dcosx)
-	echo pyramid-pyramid-svr4
-	exit ;;
-    DRS?6000:unix:4.0:6*)
-	echo sparc-icl-nx6
-	exit ;;
-    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
-	case $(/usr/bin/uname -p) in
-	    sparc) echo sparc-icl-nx7; exit ;;
-	esac ;;
-    s390x:SunOS:*:*)
-	echo "$UNAME_MACHINE"-ibm-solaris2"$(echo "$UNAME_RELEASE" | sed -e 's/[^.]*//')"
-	exit ;;
-    sun4H:SunOS:5.*:*)
-	echo sparc-hal-solaris2"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
-	echo sparc-sun-solaris2"$(echo "$UNAME_RELEASE" | sed -e 's/[^.]*//')"
-	exit ;;
-    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
-	echo i386-pc-auroraux"$UNAME_RELEASE"
-	exit ;;
-    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
-	set_cc_for_build
-	SUN_ARCH=i386
-	# If there is a compiler, see if it is configured for 64-bit objects.
-	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
-	# This test works for both compilers.
-	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_64BIT_ARCH >/dev/null
-	    then
-		SUN_ARCH=x86_64
-	    fi
-	fi
-	echo "$SUN_ARCH"-pc-solaris2"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    sun4*:SunOS:6*:*)
-	# According to config.sub, this is the proper way to canonicalize
-	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
-	# it's likely to be more like Solaris than SunOS4.
-	echo sparc-sun-solaris3"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    sun4*:SunOS:*:*)
-	case "$(/usr/bin/arch -k)" in
-	    Series*|S4*)
-		UNAME_RELEASE=$(uname -v)
-		;;
-	esac
-	# Japanese Language versions have a version number like `4.1.3-JL'.
-	echo sparc-sun-sunos"$(echo "$UNAME_RELEASE"|sed -e 's/-/_/')"
-	exit ;;
-    sun3*:SunOS:*:*)
-	echo m68k-sun-sunos"$UNAME_RELEASE"
-	exit ;;
-    sun*:*:4.2BSD:*)
-	UNAME_RELEASE=$( (sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null)
-	test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3
-	case "$(/bin/arch)" in
-	    sun3)
-		echo m68k-sun-sunos"$UNAME_RELEASE"
-		;;
-	    sun4)
-		echo sparc-sun-sunos"$UNAME_RELEASE"
-		;;
-	esac
-	exit ;;
-    aushp:SunOS:*:*)
-	echo sparc-auspex-sunos"$UNAME_RELEASE"
-	exit ;;
-    # The situation for MiNT is a little confusing.  The machine name
-    # can be virtually everything (everything which is not
-    # "atarist" or "atariste" at least should have a processor
-    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
-    # to the lowercase version "mint" (or "freemint").  Finally
-    # the system name "TOS" denotes a system which is actually not
-    # MiNT.  But MiNT is downward compatible to TOS, so this should
-    # be no problem.
-    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint"$UNAME_RELEASE"
-	exit ;;
-    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
-	echo m68k-atari-mint"$UNAME_RELEASE"
-	exit ;;
-    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
-	echo m68k-atari-mint"$UNAME_RELEASE"
-	exit ;;
-    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
-	echo m68k-milan-mint"$UNAME_RELEASE"
-	exit ;;
-    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
-	echo m68k-hades-mint"$UNAME_RELEASE"
-	exit ;;
-    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
-	echo m68k-unknown-mint"$UNAME_RELEASE"
-	exit ;;
-    m68k:machten:*:*)
-	echo m68k-apple-machten"$UNAME_RELEASE"
-	exit ;;
-    powerpc:machten:*:*)
-	echo powerpc-apple-machten"$UNAME_RELEASE"
-	exit ;;
-    RISC*:Mach:*:*)
-	echo mips-dec-mach_bsd4.3
-	exit ;;
-    RISC*:ULTRIX:*:*)
-	echo mips-dec-ultrix"$UNAME_RELEASE"
-	exit ;;
-    VAX*:ULTRIX*:*:*)
-	echo vax-dec-ultrix"$UNAME_RELEASE"
-	exit ;;
-    2020:CLIX:*:* | 2430:CLIX:*:*)
-	echo clipper-intergraph-clix"$UNAME_RELEASE"
-	exit ;;
-    mips:*:*:UMIPS | mips:*:*:RISCos)
-	set_cc_for_build
-	sed 's/^	//' << EOF > "$dummy.c"
-#ifdef __cplusplus
-#include <stdio.h>  /* for printf() prototype */
-	int main (int argc, char *argv[]) {
-#else
-	int main (argc, argv) int argc; char *argv[]; {
-#endif
-	#if defined (host_mips) && defined (MIPSEB)
-	#if defined (SYSTYPE_SYSV)
-	  printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_SVR4)
-	  printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0);
-	#endif
-	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
-	  printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0);
-	#endif
-	#endif
-	  exit (-1);
-	}
-EOF
-	$CC_FOR_BUILD -o "$dummy" "$dummy.c" &&
-	  dummyarg=$(echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p') &&
-	  SYSTEM_NAME=$("$dummy" "$dummyarg") &&
-	    { echo "$SYSTEM_NAME"; exit; }
-	echo mips-mips-riscos"$UNAME_RELEASE"
-	exit ;;
-    Motorola:PowerMAX_OS:*:*)
-	echo powerpc-motorola-powermax
-	exit ;;
-    Motorola:*:4.3:PL8-*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
-	echo powerpc-harris-powermax
-	exit ;;
-    Night_Hawk:Power_UNIX:*:*)
-	echo powerpc-harris-powerunix
-	exit ;;
-    m88k:CX/UX:7*:*)
-	echo m88k-harris-cxux7
-	exit ;;
-    m88k:*:4*:R4*)
-	echo m88k-motorola-sysv4
-	exit ;;
-    m88k:*:3*:R3*)
-	echo m88k-motorola-sysv3
-	exit ;;
-    AViiON:dgux:*:*)
-	# DG/UX returns AViiON for all architectures
-	UNAME_PROCESSOR=$(/usr/bin/uname -p)
-	if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110
-	then
-	    if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \
-	       test "$TARGET_BINARY_INTERFACE"x = x
-	    then
-		echo m88k-dg-dgux"$UNAME_RELEASE"
-	    else
-		echo m88k-dg-dguxbcs"$UNAME_RELEASE"
-	    fi
-	else
-	    echo i586-dg-dgux"$UNAME_RELEASE"
-	fi
-	exit ;;
-    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
-	echo m88k-dolphin-sysv3
-	exit ;;
-    M88*:*:R3*:*)
-	# Delta 88k system running SVR3
-	echo m88k-motorola-sysv3
-	exit ;;
-    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
-	echo m88k-tektronix-sysv3
-	exit ;;
-    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
-	echo m68k-tektronix-bsd
-	exit ;;
-    *:IRIX*:*:*)
-	echo mips-sgi-irix"$(echo "$UNAME_RELEASE"|sed -e 's/-/_/g')"
-	exit ;;
-    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
-	exit ;;               # Note that: echo "'$(uname -s)'" gives 'AIX '
-    i*86:AIX:*:*)
-	echo i386-ibm-aix
-	exit ;;
-    ia64:AIX:*:*)
-	if test -x /usr/bin/oslevel ; then
-		IBM_REV=$(/usr/bin/oslevel)
-	else
-		IBM_REV="$UNAME_VERSION.$UNAME_RELEASE"
-	fi
-	echo "$UNAME_MACHINE"-ibm-aix"$IBM_REV"
-	exit ;;
-    *:AIX:2:3)
-	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-		set_cc_for_build
-		sed 's/^		//' << EOF > "$dummy.c"
-		#include <sys/systemcfg.h>
-
-		main()
-			{
-			if (!__power_pc())
-				exit(1);
-			puts("powerpc-ibm-aix3.2.5");
-			exit(0);
-			}
-EOF
-		if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=$("$dummy")
-		then
-			echo "$SYSTEM_NAME"
-		else
-			echo rs6000-ibm-aix3.2.5
-		fi
-	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
-		echo rs6000-ibm-aix3.2.4
-	else
-		echo rs6000-ibm-aix3.2
-	fi
-	exit ;;
-    *:AIX:*:[4567])
-	IBM_CPU_ID=$(/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }')
-	if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then
-		IBM_ARCH=rs6000
-	else
-		IBM_ARCH=powerpc
-	fi
-	if test -x /usr/bin/lslpp ; then
-		IBM_REV=$(/usr/bin/lslpp -Lqc bos.rte.libc |
-			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/)
-	else
-		IBM_REV="$UNAME_VERSION.$UNAME_RELEASE"
-	fi
-	echo "$IBM_ARCH"-ibm-aix"$IBM_REV"
-	exit ;;
-    *:AIX:*:*)
-	echo rs6000-ibm-aix
-	exit ;;
-    ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*)
-	echo romp-ibm-bsd4.4
-	exit ;;
-    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
-	echo romp-ibm-bsd"$UNAME_RELEASE"   # 4.3 with uname added to
-	exit ;;                             # report: romp-ibm BSD 4.3
-    *:BOSX:*:*)
-	echo rs6000-bull-bosx
-	exit ;;
-    DPX/2?00:B.O.S.:*:*)
-	echo m68k-bull-sysv3
-	exit ;;
-    9000/[34]??:4.3bsd:1.*:*)
-	echo m68k-hp-bsd
-	exit ;;
-    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
-	echo m68k-hp-bsd4.4
-	exit ;;
-    9000/[34678]??:HP-UX:*:*)
-	HPUX_REV=$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//')
-	case "$UNAME_MACHINE" in
-	    9000/31?)            HP_ARCH=m68000 ;;
-	    9000/[34]??)         HP_ARCH=m68k ;;
-	    9000/[678][0-9][0-9])
-		if test -x /usr/bin/getconf; then
-		    sc_cpu_version=$(/usr/bin/getconf SC_CPU_VERSION 2>/dev/null)
-		    sc_kernel_bits=$(/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null)
-		    case "$sc_cpu_version" in
-		      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
-		      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
-		      532)                      # CPU_PA_RISC2_0
-			case "$sc_kernel_bits" in
-			  32) HP_ARCH=hppa2.0n ;;
-			  64) HP_ARCH=hppa2.0w ;;
-			  '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
-			esac ;;
-		    esac
-		fi
-		if test "$HP_ARCH" = ""; then
-		    set_cc_for_build
-		    sed 's/^		//' << EOF > "$dummy.c"
-
-		#define _HPUX_SOURCE
-		#include <stdlib.h>
-		#include <unistd.h>
-
-		int main ()
-		{
-		#if defined(_SC_KERNEL_BITS)
-		    long bits = sysconf(_SC_KERNEL_BITS);
-		#endif
-		    long cpu  = sysconf (_SC_CPU_VERSION);
-
-		    switch (cpu)
-			{
-			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
-			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
-			case CPU_PA_RISC2_0:
-		#if defined(_SC_KERNEL_BITS)
-			    switch (bits)
-				{
-				case 64: puts ("hppa2.0w"); break;
-				case 32: puts ("hppa2.0n"); break;
-				default: puts ("hppa2.0"); break;
-				} break;
-		#else  /* !defined(_SC_KERNEL_BITS) */
-			    puts ("hppa2.0"); break;
-		#endif
-			default: puts ("hppa1.0"); break;
-			}
-		    exit (0);
-		}
-EOF
-		    (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=$("$dummy")
-		    test -z "$HP_ARCH" && HP_ARCH=hppa
-		fi ;;
-	esac
-	if test "$HP_ARCH" = hppa2.0w
-	then
-	    set_cc_for_build
-
-	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
-	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
-	    # generating 64-bit code.  GNU and HP use different nomenclature:
-	    #
-	    # $ CC_FOR_BUILD=cc ./config.guess
-	    # => hppa2.0w-hp-hpux11.23
-	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
-	    # => hppa64-hp-hpux11.23
-
-	    if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
-		grep -q __LP64__
-	    then
-		HP_ARCH=hppa2.0w
-	    else
-		HP_ARCH=hppa64
-	    fi
-	fi
-	echo "$HP_ARCH"-hp-hpux"$HPUX_REV"
-	exit ;;
-    ia64:HP-UX:*:*)
-	HPUX_REV=$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*.[0B]*//')
-	echo ia64-hp-hpux"$HPUX_REV"
-	exit ;;
-    3050*:HI-UX:*:*)
-	set_cc_for_build
-	sed 's/^	//' << EOF > "$dummy.c"
-	#include <unistd.h>
-	int
-	main ()
-	{
-	  long cpu = sysconf (_SC_CPU_VERSION);
-	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
-	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
-	     results, however.  */
-	  if (CPU_IS_PA_RISC (cpu))
-	    {
-	      switch (cpu)
-		{
-		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
-		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
-		  default: puts ("hppa-hitachi-hiuxwe2"); break;
-		}
-	    }
-	  else if (CPU_IS_HP_MC68K (cpu))
-	    puts ("m68k-hitachi-hiuxwe2");
-	  else puts ("unknown-hitachi-hiuxwe2");
-	  exit (0);
-	}
-EOF
-	$CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=$("$dummy") &&
-		{ echo "$SYSTEM_NAME"; exit; }
-	echo unknown-hitachi-hiuxwe2
-	exit ;;
-    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*)
-	echo hppa1.1-hp-bsd
-	exit ;;
-    9000/8??:4.3bsd:*:*)
-	echo hppa1.0-hp-bsd
-	exit ;;
-    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
-	echo hppa1.0-hp-mpeix
-	exit ;;
-    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*)
-	echo hppa1.1-hp-osf
-	exit ;;
-    hp8??:OSF1:*:*)
-	echo hppa1.0-hp-osf
-	exit ;;
-    i*86:OSF1:*:*)
-	if test -x /usr/sbin/sysversion ; then
-	    echo "$UNAME_MACHINE"-unknown-osf1mk
-	else
-	    echo "$UNAME_MACHINE"-unknown-osf1
-	fi
-	exit ;;
-    parisc*:Lites*:*:*)
-	echo hppa1.1-hp-lites
-	exit ;;
-    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
-	echo c1-convex-bsd
-	exit ;;
-    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
-	if getsysinfo -f scalar_acc
-	then echo c32-convex-bsd
-	else echo c2-convex-bsd
-	fi
-	exit ;;
-    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
-	echo c34-convex-bsd
-	exit ;;
-    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
-	echo c38-convex-bsd
-	exit ;;
-    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
-	echo c4-convex-bsd
-	exit ;;
-    CRAY*Y-MP:*:*:*)
-	echo ymp-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*[A-Z]90:*:*:*)
-	echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \
-	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
-	      -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*TS:*:*:*)
-	echo t90-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*T3E:*:*:*)
-	echo alphaev5-cray-unicosmk"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    CRAY*SV1:*:*:*)
-	echo sv1-cray-unicos"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    *:UNICOS/mp:*:*)
-	echo craynv-cray-unicosmp"$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'
-	exit ;;
-    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
-	FUJITSU_PROC=$(uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz)
-	FUJITSU_SYS=$(uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///')
-	FUJITSU_REL=$(echo "$UNAME_RELEASE" | sed -e 's/ /_/')
-	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    5000:UNIX_System_V:4.*:*)
-	FUJITSU_SYS=$(uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///')
-	FUJITSU_REL=$(echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/')
-	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-	exit ;;
-    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
-	echo "$UNAME_MACHINE"-pc-bsdi"$UNAME_RELEASE"
-	exit ;;
-    sparc*:BSD/OS:*:*)
-	echo sparc-unknown-bsdi"$UNAME_RELEASE"
-	exit ;;
-    *:BSD/OS:*:*)
-	echo "$UNAME_MACHINE"-unknown-bsdi"$UNAME_RELEASE"
-	exit ;;
-    arm:FreeBSD:*:*)
-	UNAME_PROCESSOR=$(uname -p)
-	set_cc_for_build
-	if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
-	    | grep -q __ARM_PCS_VFP
-	then
-	    echo "${UNAME_PROCESSOR}"-unknown-freebsd"$(echo ${UNAME_RELEASE}|sed -e 's/[-(].*//')"-gnueabi
-	else
-	    echo "${UNAME_PROCESSOR}"-unknown-freebsd"$(echo ${UNAME_RELEASE}|sed -e 's/[-(].*//')"-gnueabihf
-	fi
-	exit ;;
-    *:FreeBSD:*:*)
-	UNAME_PROCESSOR=$(/usr/bin/uname -p)
-	case "$UNAME_PROCESSOR" in
-	    amd64)
-		UNAME_PROCESSOR=x86_64 ;;
-	    i386)
-		UNAME_PROCESSOR=i586 ;;
-	esac
-	echo "$UNAME_PROCESSOR"-unknown-freebsd"$(echo "$UNAME_RELEASE"|sed -e 's/[-(].*//')"
-	exit ;;
-    i*:CYGWIN*:*)
-	echo "$UNAME_MACHINE"-pc-cygwin
-	exit ;;
-    *:MINGW64*:*)
-	echo "$UNAME_MACHINE"-pc-mingw64
-	exit ;;
-    *:MINGW*:*)
-	echo "$UNAME_MACHINE"-pc-mingw32
-	exit ;;
-    *:MSYS*:*)
-	echo "$UNAME_MACHINE"-pc-msys
-	exit ;;
-    i*:PW*:*)
-	echo "$UNAME_MACHINE"-pc-pw32
-	exit ;;
-    *:Interix*:*)
-	case "$UNAME_MACHINE" in
-	    x86)
-		echo i586-pc-interix"$UNAME_RELEASE"
-		exit ;;
-	    authenticamd | genuineintel | EM64T)
-		echo x86_64-unknown-interix"$UNAME_RELEASE"
-		exit ;;
-	    IA64)
-		echo ia64-unknown-interix"$UNAME_RELEASE"
-		exit ;;
-	esac ;;
-    i*:UWIN*:*)
-	echo "$UNAME_MACHINE"-pc-uwin
-	exit ;;
-    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
-	echo x86_64-pc-cygwin
-	exit ;;
-    prep*:SunOS:5.*:*)
-	echo powerpcle-unknown-solaris2"$(echo "$UNAME_RELEASE"|sed -e 's/[^.]*//')"
-	exit ;;
-    *:GNU:*:*)
-	# the GNU system
-	echo "$(echo "$UNAME_MACHINE"|sed -e 's,[-/].*$,,')-unknown-$LIBC$(echo "$UNAME_RELEASE"|sed -e 's,/.*$,,')"
-	exit ;;
-    *:GNU/*:*:*)
-	# other systems with GNU libc and userland
-	echo "$UNAME_MACHINE-unknown-$(echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]")$(echo "$UNAME_RELEASE"|sed -e 's/[-(].*//')-$LIBC"
-	exit ;;
-    *:Minix:*:*)
-	echo "$UNAME_MACHINE"-unknown-minix
-	exit ;;
-    aarch64:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    aarch64_be:Linux:*:*)
-	UNAME_MACHINE=aarch64_be
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    alpha:Linux:*:*)
-	case $(sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null) in
-	  EV5)   UNAME_MACHINE=alphaev5 ;;
-	  EV56)  UNAME_MACHINE=alphaev56 ;;
-	  PCA56) UNAME_MACHINE=alphapca56 ;;
-	  PCA57) UNAME_MACHINE=alphapca56 ;;
-	  EV6)   UNAME_MACHINE=alphaev6 ;;
-	  EV67)  UNAME_MACHINE=alphaev67 ;;
-	  EV68*) UNAME_MACHINE=alphaev68 ;;
-	esac
-	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    arc:Linux:*:* | arceb:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    arm*:Linux:*:*)
-	set_cc_for_build
-	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
-	    | grep -q __ARM_EABI__
-	then
-	    echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	else
-	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
-		| grep -q __ARM_PCS_VFP
-	    then
-		echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabi
-	    else
-		echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"eabihf
-	    fi
-	fi
-	exit ;;
-    avr32*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    cris:Linux:*:*)
-	echo "$UNAME_MACHINE"-axis-linux-"$LIBC"
-	exit ;;
-    crisv32:Linux:*:*)
-	echo "$UNAME_MACHINE"-axis-linux-"$LIBC"
-	exit ;;
-    e2k:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    frv:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    hexagon:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    i*86:Linux:*:*)
-	echo "$UNAME_MACHINE"-pc-linux-"$LIBC"
-	exit ;;
-    ia64:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    k1om:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    loongarch32:Linux:*:* | loongarch64:Linux:*:* | loongarchx32:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    m32r*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    m68*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    mips:Linux:*:* | mips64:Linux:*:*)
-	set_cc_for_build
-	IS_GLIBC=0
-	test x"${LIBC}" = xgnu && IS_GLIBC=1
-	sed 's/^	//' << EOF > "$dummy.c"
-	#undef CPU
-	#undef mips
-	#undef mipsel
-	#undef mips64
-	#undef mips64el
-	#if ${IS_GLIBC} && defined(_ABI64)
-	LIBCABI=gnuabi64
-	#else
-	#if ${IS_GLIBC} && defined(_ABIN32)
-	LIBCABI=gnuabin32
-	#else
-	LIBCABI=${LIBC}
-	#endif
-	#endif
-
-	#if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
-	CPU=mipsisa64r6
-	#else
-	#if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
-	CPU=mipsisa32r6
-	#else
-	#if defined(__mips64)
-	CPU=mips64
-	#else
-	CPU=mips
-	#endif
-	#endif
-	#endif
-
-	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	MIPS_ENDIAN=el
-	#else
-	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	MIPS_ENDIAN=
-	#else
-	MIPS_ENDIAN=
-	#endif
-	#endif
-EOF
-	eval "$($CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI')"
-	test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; }
-	;;
-    mips64el:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    openrisc*:Linux:*:*)
-	echo or1k-unknown-linux-"$LIBC"
-	exit ;;
-    or32:Linux:*:* | or1k*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    padre:Linux:*:*)
-	echo sparc-unknown-linux-"$LIBC"
-	exit ;;
-    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-"$LIBC"
-	exit ;;
-    parisc:Linux:*:* | hppa:Linux:*:*)
-	# Look for CPU level
-	case $(grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2) in
-	  PA7*) echo hppa1.1-unknown-linux-"$LIBC" ;;
-	  PA8*) echo hppa2.0-unknown-linux-"$LIBC" ;;
-	  *)    echo hppa-unknown-linux-"$LIBC" ;;
-	esac
-	exit ;;
-    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-"$LIBC"
-	exit ;;
-    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-"$LIBC"
-	exit ;;
-    ppc64le:Linux:*:*)
-	echo powerpc64le-unknown-linux-"$LIBC"
-	exit ;;
-    ppcle:Linux:*:*)
-	echo powerpcle-unknown-linux-"$LIBC"
-	exit ;;
-    riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    s390:Linux:*:* | s390x:Linux:*:*)
-	echo "$UNAME_MACHINE"-ibm-linux-"$LIBC"
-	exit ;;
-    sh64*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    sh*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    tile*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    vax:Linux:*:*)
-	echo "$UNAME_MACHINE"-dec-linux-"$LIBC"
-	exit ;;
-    x86_64:Linux:*:*)
-	set_cc_for_build
-	LIBCABI=$LIBC
-	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __ILP32__'; echo IS_X32; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_X32 >/dev/null
-	    then
-		LIBCABI="$LIBC"x32
-	    fi
-	fi
-	echo "$UNAME_MACHINE"-pc-linux-"$LIBCABI"
-	exit ;;
-    xtensa*:Linux:*:*)
-	echo "$UNAME_MACHINE"-unknown-linux-"$LIBC"
-	exit ;;
-    i*86:DYNIX/ptx:4*:*)
-	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
-	# earlier versions are messed up and put the nodename in both
-	# sysname and nodename.
-	echo i386-sequent-sysv4
-	exit ;;
-    i*86:UNIX_SV:4.2MP:2.*)
-	# Unixware is an offshoot of SVR4, but it has its own version
-	# number series starting with 2...
-	# I am not positive that other SVR4 systems won't match this,
-	# I just have to hope.  -- rms.
-	# Use sysv4.2uw... so that sysv4* matches it.
-	echo "$UNAME_MACHINE"-pc-sysv4.2uw"$UNAME_VERSION"
-	exit ;;
-    i*86:OS/2:*:*)
-	# If we were able to find `uname', then EMX Unix compatibility
-	# is probably installed.
-	echo "$UNAME_MACHINE"-pc-os2-emx
-	exit ;;
-    i*86:XTS-300:*:STOP)
-	echo "$UNAME_MACHINE"-unknown-stop
-	exit ;;
-    i*86:atheos:*:*)
-	echo "$UNAME_MACHINE"-unknown-atheos
-	exit ;;
-    i*86:syllable:*:*)
-	echo "$UNAME_MACHINE"-pc-syllable
-	exit ;;
-    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
-	echo i386-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    i*86:*DOS:*:*)
-	echo "$UNAME_MACHINE"-pc-msdosdjgpp
-	exit ;;
-    i*86:*:4.*:*)
-	UNAME_REL=$(echo "$UNAME_RELEASE" | sed 's/\/MP$//')
-	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-		echo "$UNAME_MACHINE"-univel-sysv"$UNAME_REL"
-	else
-		echo "$UNAME_MACHINE"-pc-sysv"$UNAME_REL"
-	fi
-	exit ;;
-    i*86:*:5:[678]*)
-	# UnixWare 7.x, OpenUNIX and OpenServer 6.
-	case $(/bin/uname -X | grep "^Machine") in
-	    *486*)	     UNAME_MACHINE=i486 ;;
-	    *Pentium)	     UNAME_MACHINE=i586 ;;
-	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
-	esac
-	echo "$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}"
-	exit ;;
-    i*86:*:3.2:*)
-	if test -f /usr/options/cb.name; then
-		UNAME_REL=$(sed -n 's/.*Version //p' </usr/options/cb.name)
-		echo "$UNAME_MACHINE"-pc-isc"$UNAME_REL"
-	elif /bin/uname -X 2>/dev/null >/dev/null ; then
-		UNAME_REL=$( (/bin/uname -X|grep Release|sed -e 's/.*= //'))
-		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
-		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
-			&& UNAME_MACHINE=i586
-		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
-			&& UNAME_MACHINE=i686
-		echo "$UNAME_MACHINE"-pc-sco"$UNAME_REL"
-	else
-		echo "$UNAME_MACHINE"-pc-sysv32
-	fi
-	exit ;;
-    pc:*:*:*)
-	# Left here for compatibility:
-	# uname -m prints for DJGPP always 'pc', but it prints nothing about
-	# the processor, so we play safe by assuming i586.
-	# Note: whatever this is, it MUST be the same as what config.sub
-	# prints for the "djgpp" host, or else GDB configure will decide that
-	# this is a cross-build.
-	echo i586-pc-msdosdjgpp
-	exit ;;
-    Intel:Mach:3*:*)
-	echo i386-pc-mach3
-	exit ;;
-    paragon:*:*:*)
-	echo i860-intel-osf1
-	exit ;;
-    i860:*:4.*:*) # i860-SVR4
-	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
-	  echo i860-stardent-sysv"$UNAME_RELEASE" # Stardent Vistra i860-SVR4
-	else # Add other i860-SVR4 vendors below as they are discovered.
-	  echo i860-unknown-sysv"$UNAME_RELEASE"  # Unknown i860-SVR4
-	fi
-	exit ;;
-    mini*:CTIX:SYS*5:*)
-	# "miniframe"
-	echo m68010-convergent-sysv
-	exit ;;
-    mc68k:UNIX:SYSTEM5:3.51m)
-	echo m68k-convergent-sysv
-	exit ;;
-    M680?0:D-NIX:5.3:*)
-	echo m68k-diab-dnix
-	exit ;;
-    M68*:*:R3V[5678]*:*)
-	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
-    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
-	OS_REL=''
-	test -r /etc/.relid \
-	&& OS_REL=.$(sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	  && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
-    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && { echo i486-ncr-sysv4; exit; } ;;
-    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
-	OS_REL='.3'
-	test -r /etc/.relid \
-	    && OS_REL=.$(sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid)
-	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	    && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; }
-	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
-	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
-    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
-	echo m68k-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    mc68030:UNIX_System_V:4.*:*)
-	echo m68k-atari-sysv4
-	exit ;;
-    TSUNAMI:LynxOS:2.*:*)
-	echo sparc-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    rs6000:LynxOS:2.*:*)
-	echo rs6000-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
-	echo powerpc-unknown-lynxos"$UNAME_RELEASE"
-	exit ;;
-    SM[BE]S:UNIX_SV:*:*)
-	echo mips-dde-sysv"$UNAME_RELEASE"
-	exit ;;
-    RM*:ReliantUNIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    RM*:SINIX-*:*:*)
-	echo mips-sni-sysv4
-	exit ;;
-    *:SINIX-*:*:*)
-	if uname -p 2>/dev/null >/dev/null ; then
-		UNAME_MACHINE=$( (uname -p) 2>/dev/null)
-		echo "$UNAME_MACHINE"-sni-sysv4
-	else
-		echo ns32k-sni-sysv
-	fi
-	exit ;;
-    PENTIUM:*:4.0*:*)	# Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-			# says <Richard.M.Bartel@ccMail.Census.GOV>
-	echo i586-unisys-sysv4
-	exit ;;
-    *:UNIX_System_V:4*:FTX*)
-	# From Gerald Hewes <hewes@openmarket.com>.
-	# How about differentiating between stratus architectures? -djm
-	echo hppa1.1-stratus-sysv4
-	exit ;;
-    *:*:*:FTX*)
-	# From seanf@swdc.stratus.com.
-	echo i860-stratus-sysv4
-	exit ;;
-    i*86:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo "$UNAME_MACHINE"-stratus-vos
-	exit ;;
-    *:VOS:*:*)
-	# From Paul.Green@stratus.com.
-	echo hppa1.1-stratus-vos
-	exit ;;
-    mc68*:A/UX:*:*)
-	echo m68k-apple-aux"$UNAME_RELEASE"
-	exit ;;
-    news*:NEWS-OS:6*:*)
-	echo mips-sony-newsos6
-	exit ;;
-    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
-	if test -d /usr/nec; then
-		echo mips-nec-sysv"$UNAME_RELEASE"
-	else
-		echo mips-unknown-sysv"$UNAME_RELEASE"
-	fi
-	exit ;;
-    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
-	echo powerpc-be-beos
-	exit ;;
-    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
-	echo powerpc-apple-beos
-	exit ;;
-    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
-	echo i586-pc-beos
-	exit ;;
-    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
-	echo i586-pc-haiku
-	exit ;;
-    x86_64:Haiku:*:*)
-	echo x86_64-unknown-haiku
-	exit ;;
-    SX-4:SUPER-UX:*:*)
-	echo sx4-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-5:SUPER-UX:*:*)
-	echo sx5-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-6:SUPER-UX:*:*)
-	echo sx6-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-7:SUPER-UX:*:*)
-	echo sx7-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-8:SUPER-UX:*:*)
-	echo sx8-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-8R:SUPER-UX:*:*)
-	echo sx8r-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    SX-ACE:SUPER-UX:*:*)
-	echo sxace-nec-superux"$UNAME_RELEASE"
-	exit ;;
-    Power*:Rhapsody:*:*)
-	echo powerpc-apple-rhapsody"$UNAME_RELEASE"
-	exit ;;
-    *:Rhapsody:*:*)
-	echo "$UNAME_MACHINE"-apple-rhapsody"$UNAME_RELEASE"
-	exit ;;
-    arm64:Darwin:*:*)
-	echo aarch64-apple-darwin"$UNAME_RELEASE"
-	exit ;;
-    *:Darwin:*:*)
-	UNAME_PROCESSOR=$(uname -p)
-	case $UNAME_PROCESSOR in
-	    unknown) UNAME_PROCESSOR=powerpc ;;
-	esac
-	if command -v xcode-select > /dev/null 2> /dev/null && \
-		! xcode-select --print-path > /dev/null 2> /dev/null ; then
-	    # Avoid executing cc if there is no toolchain installed as
-	    # cc will be a stub that puts up a graphical alert
-	    # prompting the user to install developer tools.
-	    CC_FOR_BUILD=no_compiler_found
-	else
-	    set_cc_for_build
-	fi
-	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		   grep IS_64BIT_ARCH >/dev/null
-	    then
-		case $UNAME_PROCESSOR in
-		    i386) UNAME_PROCESSOR=x86_64 ;;
-		    powerpc) UNAME_PROCESSOR=powerpc64 ;;
-		esac
-	    fi
-	    # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc
-	    if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \
-		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		   grep IS_PPC >/dev/null
-	    then
-		UNAME_PROCESSOR=powerpc
-	    fi
-	elif test "$UNAME_PROCESSOR" = i386 ; then
-	    # uname -m returns i386 or x86_64
-	    UNAME_PROCESSOR=$UNAME_MACHINE
-	fi
-	echo "$UNAME_PROCESSOR"-apple-darwin"$UNAME_RELEASE"
-	exit ;;
-    *:procnto*:*:* | *:QNX:[0123456789]*:*)
-	UNAME_PROCESSOR=$(uname -p)
-	if test "$UNAME_PROCESSOR" = x86; then
-		UNAME_PROCESSOR=i386
-		UNAME_MACHINE=pc
-	fi
-	echo "$UNAME_PROCESSOR"-"$UNAME_MACHINE"-nto-qnx"$UNAME_RELEASE"
-	exit ;;
-    *:QNX:*:4*)
-	echo i386-pc-qnx
-	exit ;;
-    NEO-*:NONSTOP_KERNEL:*:*)
-	echo neo-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSE-*:NONSTOP_KERNEL:*:*)
-	echo nse-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSR-*:NONSTOP_KERNEL:*:*)
-	echo nsr-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSV-*:NONSTOP_KERNEL:*:*)
-	echo nsv-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    NSX-*:NONSTOP_KERNEL:*:*)
-	echo nsx-tandem-nsk"$UNAME_RELEASE"
-	exit ;;
-    *:NonStop-UX:*:*)
-	echo mips-compaq-nonstopux
-	exit ;;
-    BS2000:POSIX*:*:*)
-	echo bs2000-siemens-sysv
-	exit ;;
-    DS/*:UNIX_System_V:*:*)
-	echo "$UNAME_MACHINE"-"$UNAME_SYSTEM"-"$UNAME_RELEASE"
-	exit ;;
-    *:Plan9:*:*)
-	# "uname -m" is not consistent, so use $cputype instead. 386
-	# is converted to i386 for consistency with other x86
-	# operating systems.
-	# shellcheck disable=SC2154
-	if test "$cputype" = 386; then
-	    UNAME_MACHINE=i386
-	else
-	    UNAME_MACHINE="$cputype"
-	fi
-	echo "$UNAME_MACHINE"-unknown-plan9
-	exit ;;
-    *:TOPS-10:*:*)
-	echo pdp10-unknown-tops10
-	exit ;;
-    *:TENEX:*:*)
-	echo pdp10-unknown-tenex
-	exit ;;
-    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
-	echo pdp10-dec-tops20
-	exit ;;
-    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
-	echo pdp10-xkl-tops20
-	exit ;;
-    *:TOPS-20:*:*)
-	echo pdp10-unknown-tops20
-	exit ;;
-    *:ITS:*:*)
-	echo pdp10-unknown-its
-	exit ;;
-    SEI:*:*:SEIUX)
-	echo mips-sei-seiux"$UNAME_RELEASE"
-	exit ;;
-    *:DragonFly:*:*)
-	echo "$UNAME_MACHINE"-unknown-dragonfly"$(echo "$UNAME_RELEASE"|sed -e 's/[-(].*//')"
-	exit ;;
-    *:*VMS:*:*)
-	UNAME_MACHINE=$( (uname -p) 2>/dev/null)
-	case "$UNAME_MACHINE" in
-	    A*) echo alpha-dec-vms ; exit ;;
-	    I*) echo ia64-dec-vms ; exit ;;
-	    V*) echo vax-dec-vms ; exit ;;
-	esac ;;
-    *:XENIX:*:SysV)
-	echo i386-pc-xenix
-	exit ;;
-    i*86:skyos:*:*)
-	echo "$UNAME_MACHINE"-pc-skyos"$(echo "$UNAME_RELEASE" | sed -e 's/ .*$//')"
-	exit ;;
-    i*86:rdos:*:*)
-	echo "$UNAME_MACHINE"-pc-rdos
-	exit ;;
-    i*86:AROS:*:*)
-	echo "$UNAME_MACHINE"-pc-aros
-	exit ;;
-    x86_64:VMkernel:*:*)
-	echo "$UNAME_MACHINE"-unknown-esx
-	exit ;;
-    amd64:Isilon\ OneFS:*:*)
-	echo x86_64-unknown-onefs
-	exit ;;
-    *:Unleashed:*:*)
-	echo "$UNAME_MACHINE"-unknown-unleashed"$UNAME_RELEASE"
-	exit ;;
-esac
-
-# No uname command or uname output not recognized.
-set_cc_for_build
-cat > "$dummy.c" <<EOF
-#ifdef _SEQUENT_
-#include <sys/types.h>
-#include <sys/utsname.h>
-#endif
-#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
-#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
-#include <signal.h>
-#if defined(_SIZE_T_) || defined(SIGLOST)
-#include <sys/utsname.h>
-#endif
-#endif
-#endif
-main ()
-{
-#if defined (sony)
-#if defined (MIPSEB)
-  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
-     I don't know....  */
-  printf ("mips-sony-bsd\n"); exit (0);
-#else
-#include <sys/param.h>
-  printf ("m68k-sony-newsos%s\n",
-#ifdef NEWSOS4
-  "4"
-#else
-  ""
-#endif
-  ); exit (0);
-#endif
-#endif
-
-#if defined (NeXT)
-#if !defined (__ARCHITECTURE__)
-#define __ARCHITECTURE__ "m68k"
-#endif
-  int version;
-  version=$( (hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null);
-  if (version < 4)
-    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
-  else
-    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
-  exit (0);
-#endif
-
-#if defined (MULTIMAX) || defined (n16)
-#if defined (UMAXV)
-  printf ("ns32k-encore-sysv\n"); exit (0);
-#else
-#if defined (CMU)
-  printf ("ns32k-encore-mach\n"); exit (0);
-#else
-  printf ("ns32k-encore-bsd\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (__386BSD__)
-  printf ("i386-pc-bsd\n"); exit (0);
-#endif
-
-#if defined (sequent)
-#if defined (i386)
-  printf ("i386-sequent-dynix\n"); exit (0);
-#endif
-#if defined (ns32000)
-  printf ("ns32k-sequent-dynix\n"); exit (0);
-#endif
-#endif
-
-#if defined (_SEQUENT_)
-  struct utsname un;
-
-  uname(&un);
-  if (strncmp(un.version, "V2", 2) == 0) {
-    printf ("i386-sequent-ptx2\n"); exit (0);
-  }
-  if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
-    printf ("i386-sequent-ptx1\n"); exit (0);
-  }
-  printf ("i386-sequent-ptx\n"); exit (0);
-#endif
-
-#if defined (vax)
-#if !defined (ultrix)
-#include <sys/param.h>
-#if defined (BSD)
-#if BSD == 43
-  printf ("vax-dec-bsd4.3\n"); exit (0);
-#else
-#if BSD == 199006
-  printf ("vax-dec-bsd4.3reno\n"); exit (0);
-#else
-  printf ("vax-dec-bsd\n"); exit (0);
-#endif
-#endif
-#else
-  printf ("vax-dec-bsd\n"); exit (0);
-#endif
-#else
-#if defined(_SIZE_T_) || defined(SIGLOST)
-  struct utsname un;
-  uname (&un);
-  printf ("vax-dec-ultrix%s\n", un.release); exit (0);
-#else
-  printf ("vax-dec-ultrix\n"); exit (0);
-#endif
-#endif
-#endif
-#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
-#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
-#if defined(_SIZE_T_) || defined(SIGLOST)
-  struct utsname *un;
-  uname (&un);
-  printf ("mips-dec-ultrix%s\n", un.release); exit (0);
-#else
-  printf ("mips-dec-ultrix\n"); exit (0);
-#endif
-#endif
-#endif
-
-#if defined (alliant) && defined (i860)
-  printf ("i860-alliant-bsd\n"); exit (0);
-#endif
-
-  exit (1);
-}
-EOF
-
-$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=$($dummy) &&
-	{ echo "$SYSTEM_NAME"; exit; }
-
-# Apollos put the system type in the environment.
-test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; }
-
-echo "$0: unable to guess system type" >&2
-
-case "$UNAME_MACHINE:$UNAME_SYSTEM" in
-    mips:Linux | mips64:Linux)
-	# If we got here on MIPS GNU/Linux, output extra information.
-	cat >&2 <<EOF
-
-NOTE: MIPS GNU/Linux systems require a C compiler to fully recognize
-the system type. Please install a C compiler and try again.
-EOF
-	;;
-esac
-
-cat >&2 <<EOF
-
-This script (version $timestamp), has failed to recognize the
-operating system you are using. If your script is old, overwrite *all*
-copies of config.guess and config.sub with the latest versions from:
-
-  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
-and
-  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
-EOF
-
-year=$(echo $timestamp | sed 's,-.*,,')
-# shellcheck disable=SC2003
-if test "$(expr "$(date +%Y)" - "$year")" -lt 3 ; then
-   cat >&2 <<EOF
-
-If $0 has already been updated, send the following data and any
-information you think might be pertinent to config-patches@gnu.org to
-provide the necessary information to handle your system.
-
-config.guess timestamp = $timestamp
-
-uname -m = $( (uname -m) 2>/dev/null || echo unknown)
-uname -r = $( (uname -r) 2>/dev/null || echo unknown)
-uname -s = $( (uname -s) 2>/dev/null || echo unknown)
-uname -v = $( (uname -v) 2>/dev/null || echo unknown)
-
-/usr/bin/uname -p = $( (/usr/bin/uname -p) 2>/dev/null)
-/bin/uname -X     = $( (/bin/uname -X) 2>/dev/null)
-
-hostinfo               = $( (hostinfo) 2>/dev/null)
-/bin/universe          = $( (/bin/universe) 2>/dev/null)
-/usr/bin/arch -k       = $( (/usr/bin/arch -k) 2>/dev/null)
-/bin/arch              = $( (/bin/arch) 2>/dev/null)
-/usr/bin/oslevel       = $( (/usr/bin/oslevel) 2>/dev/null)
-/usr/convex/getsysinfo = $( (/usr/convex/getsysinfo) 2>/dev/null)
-
-UNAME_MACHINE = "$UNAME_MACHINE"
-UNAME_RELEASE = "$UNAME_RELEASE"
-UNAME_SYSTEM  = "$UNAME_SYSTEM"
-UNAME_VERSION = "$UNAME_VERSION"
-EOF
-fi
-
-exit 1
-
-# Local variables:
-# eval: (add-hook 'before-save-hook 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/config.sub b/config.sub
deleted file mode 100644
index 0cbdae682..000000000
--- a/config.sub
+++ /dev/null
@@ -1,1855 +0,0 @@
-#! /bin/sh
-# Configuration validation subroutine script.
-#   Copyright 1992-2021 Free Software Foundation, Inc.
-
-timestamp='2021-01-01'
-
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <https://www.gnu.org/licenses/>.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
-
-
-# Please send patches to <config-patches@gnu.org>.
-#
-# Configuration subroutine to validate and canonicalize a configuration type.
-# Supply the specified configuration type as an argument.
-# If it is invalid, we print an error message on stderr and exit with code 1.
-# Otherwise, we print the canonical config type on stdout and succeed.
-
-# You can get the latest version of this script from:
-# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
-
-# This file is supposed to be the same for all GNU packages
-# and recognize all the CPU types, system types and aliases
-# that are meaningful with *any* GNU software.
-# Each package is responsible for reporting which valid configurations
-# it does not support.  The user should be able to distinguish
-# a failure to support a valid configuration from a meaningless
-# configuration.
-
-# The goal of this file is to map all the various variations of a given
-# machine specification into a single specification in the form:
-#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
-# or in some cases, the newer four-part form:
-#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
-# It is wrong to echo any other type of specification.
-
-me=$(echo "$0" | sed -e 's,.*/,,')
-
-usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
-
-Canonicalize a configuration name.
-
-Options:
-  -h, --help         print this help, then exit
-  -t, --time-stamp   print date of last modification, then exit
-  -v, --version      print version number, then exit
-
-Report bugs and patches to <config-patches@gnu.org>."
-
-version="\
-GNU config.sub ($timestamp)
-
-Copyright 1992-2021 Free Software Foundation, Inc.
-
-This is free software; see the source for copying conditions.  There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
-
-help="
-Try \`$me --help' for more information."
-
-# Parse command line
-while test $# -gt 0 ; do
-  case $1 in
-    --time-stamp | --time* | -t )
-       echo "$timestamp" ; exit ;;
-    --version | -v )
-       echo "$version" ; exit ;;
-    --help | --h* | -h )
-       echo "$usage"; exit ;;
-    -- )     # Stop option processing
-       shift; break ;;
-    - )	# Use stdin as input.
-       break ;;
-    -* )
-       echo "$me: invalid option $1$help" >&2
-       exit 1 ;;
-
-    *local*)
-       # First pass through any local machine types.
-       echo "$1"
-       exit ;;
-
-    * )
-       break ;;
-  esac
-done
-
-case $# in
- 0) echo "$me: missing argument$help" >&2
-    exit 1;;
- 1) ;;
- *) echo "$me: too many arguments$help" >&2
-    exit 1;;
-esac
-
-# Split fields of configuration type
-# shellcheck disable=SC2162
-IFS="-" read field1 field2 field3 field4 <<EOF
-$1
-EOF
-
-# Separate into logical components for further validation
-case $1 in
-	*-*-*-*-*)
-		echo Invalid configuration \`"$1"\': more than four components >&2
-		exit 1
-		;;
-	*-*-*-*)
-		basic_machine=$field1-$field2
-		basic_os=$field3-$field4
-		;;
-	*-*-*)
-		# Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two
-		# parts
-		maybe_os=$field2-$field3
-		case $maybe_os in
-			nto-qnx* | linux-* | uclinux-uclibc* \
-			| uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \
-			| netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \
-			| storm-chaos* | os2-emx* | rtmk-nova*)
-				basic_machine=$field1
-				basic_os=$maybe_os
-				;;
-			android-linux)
-				basic_machine=$field1-unknown
-				basic_os=linux-android
-				;;
-			*)
-				basic_machine=$field1-$field2
-				basic_os=$field3
-				;;
-		esac
-		;;
-	*-*)
-		# A lone config we happen to match not fitting any pattern
-		case $field1-$field2 in
-			decstation-3100)
-				basic_machine=mips-dec
-				basic_os=
-				;;
-			*-*)
-				# Second component is usually, but not always the OS
-				case $field2 in
-					# Prevent following clause from handling this valid os
-					sun*os*)
-						basic_machine=$field1
-						basic_os=$field2
-						;;
-					# Manufacturers
-					dec* | mips* | sequent* | encore* | pc533* | sgi* | sony* \
-					| att* | 7300* | 3300* | delta* | motorola* | sun[234]* \
-					| unicom* | ibm* | next | hp | isi* | apollo | altos* \
-					| convergent* | ncr* | news | 32* | 3600* | 3100* \
-					| hitachi* | c[123]* | convex* | sun | crds | omron* | dg \
-					| ultra | tti* | harris | dolphin | highlevel | gould \
-					| cbm | ns | masscomp | apple | axis | knuth | cray \
-					| microblaze* | sim | cisco \
-					| oki | wec | wrs | winbond)
-						basic_machine=$field1-$field2
-						basic_os=
-						;;
-					*)
-						basic_machine=$field1
-						basic_os=$field2
-						;;
-				esac
-			;;
-		esac
-		;;
-	*)
-		# Convert single-component short-hands not valid as part of
-		# multi-component configurations.
-		case $field1 in
-			386bsd)
-				basic_machine=i386-pc
-				basic_os=bsd
-				;;
-			a29khif)
-				basic_machine=a29k-amd
-				basic_os=udi
-				;;
-			adobe68k)
-				basic_machine=m68010-adobe
-				basic_os=scout
-				;;
-			alliant)
-				basic_machine=fx80-alliant
-				basic_os=
-				;;
-			altos | altos3068)
-				basic_machine=m68k-altos
-				basic_os=
-				;;
-			am29k)
-				basic_machine=a29k-none
-				basic_os=bsd
-				;;
-			amdahl)
-				basic_machine=580-amdahl
-				basic_os=sysv
-				;;
-			amiga)
-				basic_machine=m68k-unknown
-				basic_os=
-				;;
-			amigaos | amigados)
-				basic_machine=m68k-unknown
-				basic_os=amigaos
-				;;
-			amigaunix | amix)
-				basic_machine=m68k-unknown
-				basic_os=sysv4
-				;;
-			apollo68)
-				basic_machine=m68k-apollo
-				basic_os=sysv
-				;;
-			apollo68bsd)
-				basic_machine=m68k-apollo
-				basic_os=bsd
-				;;
-			aros)
-				basic_machine=i386-pc
-				basic_os=aros
-				;;
-			aux)
-				basic_machine=m68k-apple
-				basic_os=aux
-				;;
-			balance)
-				basic_machine=ns32k-sequent
-				basic_os=dynix
-				;;
-			blackfin)
-				basic_machine=bfin-unknown
-				basic_os=linux
-				;;
-			cegcc)
-				basic_machine=arm-unknown
-				basic_os=cegcc
-				;;
-			convex-c1)
-				basic_machine=c1-convex
-				basic_os=bsd
-				;;
-			convex-c2)
-				basic_machine=c2-convex
-				basic_os=bsd
-				;;
-			convex-c32)
-				basic_machine=c32-convex
-				basic_os=bsd
-				;;
-			convex-c34)
-				basic_machine=c34-convex
-				basic_os=bsd
-				;;
-			convex-c38)
-				basic_machine=c38-convex
-				basic_os=bsd
-				;;
-			cray)
-				basic_machine=j90-cray
-				basic_os=unicos
-				;;
-			crds | unos)
-				basic_machine=m68k-crds
-				basic_os=
-				;;
-			da30)
-				basic_machine=m68k-da30
-				basic_os=
-				;;
-			decstation | pmax | pmin | dec3100 | decstatn)
-				basic_machine=mips-dec
-				basic_os=
-				;;
-			delta88)
-				basic_machine=m88k-motorola
-				basic_os=sysv3
-				;;
-			dicos)
-				basic_machine=i686-pc
-				basic_os=dicos
-				;;
-			djgpp)
-				basic_machine=i586-pc
-				basic_os=msdosdjgpp
-				;;
-			ebmon29k)
-				basic_machine=a29k-amd
-				basic_os=ebmon
-				;;
-			es1800 | OSE68k | ose68k | ose | OSE)
-				basic_machine=m68k-ericsson
-				basic_os=ose
-				;;
-			gmicro)
-				basic_machine=tron-gmicro
-				basic_os=sysv
-				;;
-			go32)
-				basic_machine=i386-pc
-				basic_os=go32
-				;;
-			h8300hms)
-				basic_machine=h8300-hitachi
-				basic_os=hms
-				;;
-			h8300xray)
-				basic_machine=h8300-hitachi
-				basic_os=xray
-				;;
-			h8500hms)
-				basic_machine=h8500-hitachi
-				basic_os=hms
-				;;
-			harris)
-				basic_machine=m88k-harris
-				basic_os=sysv3
-				;;
-			hp300 | hp300hpux)
-				basic_machine=m68k-hp
-				basic_os=hpux
-				;;
-			hp300bsd)
-				basic_machine=m68k-hp
-				basic_os=bsd
-				;;
-			hppaosf)
-				basic_machine=hppa1.1-hp
-				basic_os=osf
-				;;
-			hppro)
-				basic_machine=hppa1.1-hp
-				basic_os=proelf
-				;;
-			i386mach)
-				basic_machine=i386-mach
-				basic_os=mach
-				;;
-			isi68 | isi)
-				basic_machine=m68k-isi
-				basic_os=sysv
-				;;
-			m68knommu)
-				basic_machine=m68k-unknown
-				basic_os=linux
-				;;
-			magnum | m3230)
-				basic_machine=mips-mips
-				basic_os=sysv
-				;;
-			merlin)
-				basic_machine=ns32k-utek
-				basic_os=sysv
-				;;
-			mingw64)
-				basic_machine=x86_64-pc
-				basic_os=mingw64
-				;;
-			mingw32)
-				basic_machine=i686-pc
-				basic_os=mingw32
-				;;
-			mingw32ce)
-				basic_machine=arm-unknown
-				basic_os=mingw32ce
-				;;
-			monitor)
-				basic_machine=m68k-rom68k
-				basic_os=coff
-				;;
-			morphos)
-				basic_machine=powerpc-unknown
-				basic_os=morphos
-				;;
-			moxiebox)
-				basic_machine=moxie-unknown
-				basic_os=moxiebox
-				;;
-			msdos)
-				basic_machine=i386-pc
-				basic_os=msdos
-				;;
-			msys)
-				basic_machine=i686-pc
-				basic_os=msys
-				;;
-			mvs)
-				basic_machine=i370-ibm
-				basic_os=mvs
-				;;
-			nacl)
-				basic_machine=le32-unknown
-				basic_os=nacl
-				;;
-			ncr3000)
-				basic_machine=i486-ncr
-				basic_os=sysv4
-				;;
-			netbsd386)
-				basic_machine=i386-pc
-				basic_os=netbsd
-				;;
-			netwinder)
-				basic_machine=armv4l-rebel
-				basic_os=linux
-				;;
-			news | news700 | news800 | news900)
-				basic_machine=m68k-sony
-				basic_os=newsos
-				;;
-			news1000)
-				basic_machine=m68030-sony
-				basic_os=newsos
-				;;
-			necv70)
-				basic_machine=v70-nec
-				basic_os=sysv
-				;;
-			nh3000)
-				basic_machine=m68k-harris
-				basic_os=cxux
-				;;
-			nh[45]000)
-				basic_machine=m88k-harris
-				basic_os=cxux
-				;;
-			nindy960)
-				basic_machine=i960-intel
-				basic_os=nindy
-				;;
-			mon960)
-				basic_machine=i960-intel
-				basic_os=mon960
-				;;
-			nonstopux)
-				basic_machine=mips-compaq
-				basic_os=nonstopux
-				;;
-			os400)
-				basic_machine=powerpc-ibm
-				basic_os=os400
-				;;
-			OSE68000 | ose68000)
-				basic_machine=m68000-ericsson
-				basic_os=ose
-				;;
-			os68k)
-				basic_machine=m68k-none
-				basic_os=os68k
-				;;
-			paragon)
-				basic_machine=i860-intel
-				basic_os=osf
-				;;
-			parisc)
-				basic_machine=hppa-unknown
-				basic_os=linux
-				;;
-			psp)
-				basic_machine=mipsallegrexel-sony
-				basic_os=psp
-				;;
-			pw32)
-				basic_machine=i586-unknown
-				basic_os=pw32
-				;;
-			rdos | rdos64)
-				basic_machine=x86_64-pc
-				basic_os=rdos
-				;;
-			rdos32)
-				basic_machine=i386-pc
-				basic_os=rdos
-				;;
-			rom68k)
-				basic_machine=m68k-rom68k
-				basic_os=coff
-				;;
-			sa29200)
-				basic_machine=a29k-amd
-				basic_os=udi
-				;;
-			sei)
-				basic_machine=mips-sei
-				basic_os=seiux
-				;;
-			sequent)
-				basic_machine=i386-sequent
-				basic_os=
-				;;
-			sps7)
-				basic_machine=m68k-bull
-				basic_os=sysv2
-				;;
-			st2000)
-				basic_machine=m68k-tandem
-				basic_os=
-				;;
-			stratus)
-				basic_machine=i860-stratus
-				basic_os=sysv4
-				;;
-			sun2)
-				basic_machine=m68000-sun
-				basic_os=
-				;;
-			sun2os3)
-				basic_machine=m68000-sun
-				basic_os=sunos3
-				;;
-			sun2os4)
-				basic_machine=m68000-sun
-				basic_os=sunos4
-				;;
-			sun3)
-				basic_machine=m68k-sun
-				basic_os=
-				;;
-			sun3os3)
-				basic_machine=m68k-sun
-				basic_os=sunos3
-				;;
-			sun3os4)
-				basic_machine=m68k-sun
-				basic_os=sunos4
-				;;
-			sun4)
-				basic_machine=sparc-sun
-				basic_os=
-				;;
-			sun4os3)
-				basic_machine=sparc-sun
-				basic_os=sunos3
-				;;
-			sun4os4)
-				basic_machine=sparc-sun
-				basic_os=sunos4
-				;;
-			sun4sol2)
-				basic_machine=sparc-sun
-				basic_os=solaris2
-				;;
-			sun386 | sun386i | roadrunner)
-				basic_machine=i386-sun
-				basic_os=
-				;;
-			sv1)
-				basic_machine=sv1-cray
-				basic_os=unicos
-				;;
-			symmetry)
-				basic_machine=i386-sequent
-				basic_os=dynix
-				;;
-			t3e)
-				basic_machine=alphaev5-cray
-				basic_os=unicos
-				;;
-			t90)
-				basic_machine=t90-cray
-				basic_os=unicos
-				;;
-			toad1)
-				basic_machine=pdp10-xkl
-				basic_os=tops20
-				;;
-			tpf)
-				basic_machine=s390x-ibm
-				basic_os=tpf
-				;;
-			udi29k)
-				basic_machine=a29k-amd
-				basic_os=udi
-				;;
-			ultra3)
-				basic_machine=a29k-nyu
-				basic_os=sym1
-				;;
-			v810 | necv810)
-				basic_machine=v810-nec
-				basic_os=none
-				;;
-			vaxv)
-				basic_machine=vax-dec
-				basic_os=sysv
-				;;
-			vms)
-				basic_machine=vax-dec
-				basic_os=vms
-				;;
-			vsta)
-				basic_machine=i386-pc
-				basic_os=vsta
-				;;
-			vxworks960)
-				basic_machine=i960-wrs
-				basic_os=vxworks
-				;;
-			vxworks68)
-				basic_machine=m68k-wrs
-				basic_os=vxworks
-				;;
-			vxworks29k)
-				basic_machine=a29k-wrs
-				basic_os=vxworks
-				;;
-			xbox)
-				basic_machine=i686-pc
-				basic_os=mingw32
-				;;
-			ymp)
-				basic_machine=ymp-cray
-				basic_os=unicos
-				;;
-			*)
-				basic_machine=$1
-				basic_os=
-				;;
-		esac
-		;;
-esac
-
-# Decode 1-component or ad-hoc basic machines
-case $basic_machine in
-	# Here we handle the default manufacturer of certain CPU types.  It is in
-	# some cases the only manufacturer, in others, it is the most popular.
-	w89k)
-		cpu=hppa1.1
-		vendor=winbond
-		;;
-	op50n)
-		cpu=hppa1.1
-		vendor=oki
-		;;
-	op60c)
-		cpu=hppa1.1
-		vendor=oki
-		;;
-	ibm*)
-		cpu=i370
-		vendor=ibm
-		;;
-	orion105)
-		cpu=clipper
-		vendor=highlevel
-		;;
-	mac | mpw | mac-mpw)
-		cpu=m68k
-		vendor=apple
-		;;
-	pmac | pmac-mpw)
-		cpu=powerpc
-		vendor=apple
-		;;
-
-	# Recognize the various machine names and aliases which stand
-	# for a CPU type and a company and sometimes even an OS.
-	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
-		cpu=m68000
-		vendor=att
-		;;
-	3b*)
-		cpu=we32k
-		vendor=att
-		;;
-	bluegene*)
-		cpu=powerpc
-		vendor=ibm
-		basic_os=cnk
-		;;
-	decsystem10* | dec10*)
-		cpu=pdp10
-		vendor=dec
-		basic_os=tops10
-		;;
-	decsystem20* | dec20*)
-		cpu=pdp10
-		vendor=dec
-		basic_os=tops20
-		;;
-	delta | 3300 | motorola-3300 | motorola-delta \
-	      | 3300-motorola | delta-motorola)
-		cpu=m68k
-		vendor=motorola
-		;;
-	dpx2*)
-		cpu=m68k
-		vendor=bull
-		basic_os=sysv3
-		;;
-	encore | umax | mmax)
-		cpu=ns32k
-		vendor=encore
-		;;
-	elxsi)
-		cpu=elxsi
-		vendor=elxsi
-		basic_os=${basic_os:-bsd}
-		;;
-	fx2800)
-		cpu=i860
-		vendor=alliant
-		;;
-	genix)
-		cpu=ns32k
-		vendor=ns
-		;;
-	h3050r* | hiux*)
-		cpu=hppa1.1
-		vendor=hitachi
-		basic_os=hiuxwe2
-		;;
-	hp3k9[0-9][0-9] | hp9[0-9][0-9])
-		cpu=hppa1.0
-		vendor=hp
-		;;
-	hp9k2[0-9][0-9] | hp9k31[0-9])
-		cpu=m68000
-		vendor=hp
-		;;
-	hp9k3[2-9][0-9])
-		cpu=m68k
-		vendor=hp
-		;;
-	hp9k6[0-9][0-9] | hp6[0-9][0-9])
-		cpu=hppa1.0
-		vendor=hp
-		;;
-	hp9k7[0-79][0-9] | hp7[0-79][0-9])
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k78[0-9] | hp78[0-9])
-		# FIXME: really hppa2.0-hp
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
-		# FIXME: really hppa2.0-hp
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k8[0-9][13679] | hp8[0-9][13679])
-		cpu=hppa1.1
-		vendor=hp
-		;;
-	hp9k8[0-9][0-9] | hp8[0-9][0-9])
-		cpu=hppa1.0
-		vendor=hp
-		;;
-	i*86v32)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=sysv32
-		;;
-	i*86v4*)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=sysv4
-		;;
-	i*86v)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=sysv
-		;;
-	i*86sol2)
-		cpu=$(echo "$1" | sed -e 's/86.*/86/')
-		vendor=pc
-		basic_os=solaris2
-		;;
-	j90 | j90-cray)
-		cpu=j90
-		vendor=cray
-		basic_os=${basic_os:-unicos}
-		;;
-	iris | iris4d)
-		cpu=mips
-		vendor=sgi
-		case $basic_os in
-		    irix*)
-			;;
-		    *)
-			basic_os=irix4
-			;;
-		esac
-		;;
-	miniframe)
-		cpu=m68000
-		vendor=convergent
-		;;
-	*mint | mint[0-9]* | *MiNT | *MiNT[0-9]*)
-		cpu=m68k
-		vendor=atari
-		basic_os=mint
-		;;
-	news-3600 | risc-news)
-		cpu=mips
-		vendor=sony
-		basic_os=newsos
-		;;
-	next | m*-next)
-		cpu=m68k
-		vendor=next
-		case $basic_os in
-		    openstep*)
-		        ;;
-		    nextstep*)
-			;;
-		    ns2*)
-		      basic_os=nextstep2
-			;;
-		    *)
-		      basic_os=nextstep3
-			;;
-		esac
-		;;
-	np1)
-		cpu=np1
-		vendor=gould
-		;;
-	op50n-* | op60c-*)
-		cpu=hppa1.1
-		vendor=oki
-		basic_os=proelf
-		;;
-	pa-hitachi)
-		cpu=hppa1.1
-		vendor=hitachi
-		basic_os=hiuxwe2
-		;;
-	pbd)
-		cpu=sparc
-		vendor=tti
-		;;
-	pbb)
-		cpu=m68k
-		vendor=tti
-		;;
-	pc532)
-		cpu=ns32k
-		vendor=pc532
-		;;
-	pn)
-		cpu=pn
-		vendor=gould
-		;;
-	power)
-		cpu=power
-		vendor=ibm
-		;;
-	ps2)
-		cpu=i386
-		vendor=ibm
-		;;
-	rm[46]00)
-		cpu=mips
-		vendor=siemens
-		;;
-	rtpc | rtpc-*)
-		cpu=romp
-		vendor=ibm
-		;;
-	sde)
-		cpu=mipsisa32
-		vendor=sde
-		basic_os=${basic_os:-elf}
-		;;
-	simso-wrs)
-		cpu=sparclite
-		vendor=wrs
-		basic_os=vxworks
-		;;
-	tower | tower-32)
-		cpu=m68k
-		vendor=ncr
-		;;
-	vpp*|vx|vx-*)
-		cpu=f301
-		vendor=fujitsu
-		;;
-	w65)
-		cpu=w65
-		vendor=wdc
-		;;
-	w89k-*)
-		cpu=hppa1.1
-		vendor=winbond
-		basic_os=proelf
-		;;
-	none)
-		cpu=none
-		vendor=none
-		;;
-	leon|leon[3-9])
-		cpu=sparc
-		vendor=$basic_machine
-		;;
-	leon-*|leon[3-9]-*)
-		cpu=sparc
-		vendor=$(echo "$basic_machine" | sed 's/-.*//')
-		;;
-
-	*-*)
-		# shellcheck disable=SC2162
-		IFS="-" read cpu vendor <<EOF
-$basic_machine
-EOF
-		;;
-	# We use `pc' rather than `unknown'
-	# because (1) that's what they normally are, and
-	# (2) the word "unknown" tends to confuse beginning users.
-	i*86 | x86_64)
-		cpu=$basic_machine
-		vendor=pc
-		;;
-	# These rules are duplicated from below for sake of the special case above;
-	# i.e. things that normalized to x86 arches should also default to "pc"
-	pc98)
-		cpu=i386
-		vendor=pc
-		;;
-	x64 | amd64)
-		cpu=x86_64
-		vendor=pc
-		;;
-	# Recognize the basic CPU types without company name.
-	*)
-		cpu=$basic_machine
-		vendor=unknown
-		;;
-esac
-
-unset -v basic_machine
-
-# Decode basic machines in the full and proper CPU-Company form.
-case $cpu-$vendor in
-	# Here we handle the default manufacturer of certain CPU types in canonical form. It is in
-	# some cases the only manufacturer, in others, it is the most popular.
-	craynv-unknown)
-		vendor=cray
-		basic_os=${basic_os:-unicosmp}
-		;;
-	c90-unknown | c90-cray)
-		vendor=cray
-		basic_os=${Basic_os:-unicos}
-		;;
-	fx80-unknown)
-		vendor=alliant
-		;;
-	romp-unknown)
-		vendor=ibm
-		;;
-	mmix-unknown)
-		vendor=knuth
-		;;
-	microblaze-unknown | microblazeel-unknown)
-		vendor=xilinx
-		;;
-	rs6000-unknown)
-		vendor=ibm
-		;;
-	vax-unknown)
-		vendor=dec
-		;;
-	pdp11-unknown)
-		vendor=dec
-		;;
-	we32k-unknown)
-		vendor=att
-		;;
-	cydra-unknown)
-		vendor=cydrome
-		;;
-	i370-ibm*)
-		vendor=ibm
-		;;
-	orion-unknown)
-		vendor=highlevel
-		;;
-	xps-unknown | xps100-unknown)
-		cpu=xps100
-		vendor=honeywell
-		;;
-
-	# Here we normalize CPU types with a missing or matching vendor
-	dpx20-unknown | dpx20-bull)
-		cpu=rs6000
-		vendor=bull
-		basic_os=${basic_os:-bosx}
-		;;
-
-	# Here we normalize CPU types irrespective of the vendor
-	amd64-*)
-		cpu=x86_64
-		;;
-	blackfin-*)
-		cpu=bfin
-		basic_os=linux
-		;;
-	c54x-*)
-		cpu=tic54x
-		;;
-	c55x-*)
-		cpu=tic55x
-		;;
-	c6x-*)
-		cpu=tic6x
-		;;
-	e500v[12]-*)
-		cpu=powerpc
-		basic_os=${basic_os}"spe"
-		;;
-	mips3*-*)
-		cpu=mips64
-		;;
-	ms1-*)
-		cpu=mt
-		;;
-	m68knommu-*)
-		cpu=m68k
-		basic_os=linux
-		;;
-	m9s12z-* | m68hcs12z-* | hcs12z-* | s12z-*)
-		cpu=s12z
-		;;
-	openrisc-*)
-		cpu=or32
-		;;
-	parisc-*)
-		cpu=hppa
-		basic_os=linux
-		;;
-	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
-		cpu=i586
-		;;
-	pentiumpro-* | p6-* | 6x86-* | athlon-* | athalon_*-*)
-		cpu=i686
-		;;
-	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
-		cpu=i686
-		;;
-	pentium4-*)
-		cpu=i786
-		;;
-	pc98-*)
-		cpu=i386
-		;;
-	ppc-* | ppcbe-*)
-		cpu=powerpc
-		;;
-	ppcle-* | powerpclittle-*)
-		cpu=powerpcle
-		;;
-	ppc64-*)
-		cpu=powerpc64
-		;;
-	ppc64le-* | powerpc64little-*)
-		cpu=powerpc64le
-		;;
-	sb1-*)
-		cpu=mipsisa64sb1
-		;;
-	sb1el-*)
-		cpu=mipsisa64sb1el
-		;;
-	sh5e[lb]-*)
-		cpu=$(echo "$cpu" | sed 's/^\(sh.\)e\(.\)$/\1\2e/')
-		;;
-	spur-*)
-		cpu=spur
-		;;
-	strongarm-* | thumb-*)
-		cpu=arm
-		;;
-	tx39-*)
-		cpu=mipstx39
-		;;
-	tx39el-*)
-		cpu=mipstx39el
-		;;
-	x64-*)
-		cpu=x86_64
-		;;
-	xscale-* | xscalee[bl]-*)
-		cpu=$(echo "$cpu" | sed 's/^xscale/arm/')
-		;;
-	arm64-*)
-		cpu=aarch64
-		;;
-
-	# Recognize the canonical CPU Types that limit and/or modify the
-	# company names they are paired with.
-	cr16-*)
-		basic_os=${basic_os:-elf}
-		;;
-	crisv32-* | etraxfs*-*)
-		cpu=crisv32
-		vendor=axis
-		;;
-	cris-* | etrax*-*)
-		cpu=cris
-		vendor=axis
-		;;
-	crx-*)
-		basic_os=${basic_os:-elf}
-		;;
-	neo-tandem)
-		cpu=neo
-		vendor=tandem
-		;;
-	nse-tandem)
-		cpu=nse
-		vendor=tandem
-		;;
-	nsr-tandem)
-		cpu=nsr
-		vendor=tandem
-		;;
-	nsv-tandem)
-		cpu=nsv
-		vendor=tandem
-		;;
-	nsx-tandem)
-		cpu=nsx
-		vendor=tandem
-		;;
-	mipsallegrexel-sony)
-		cpu=mipsallegrexel
-		vendor=sony
-		;;
-	tile*-*)
-		basic_os=${basic_os:-linux-gnu}
-		;;
-
-	*)
-		# Recognize the canonical CPU types that are allowed with any
-		# company name.
-		case $cpu in
-			1750a | 580 \
-			| a29k \
-			| aarch64 | aarch64_be \
-			| abacus \
-			| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \
-			| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \
-			| alphapca5[67] | alpha64pca5[67] \
-			| am33_2.0 \
-			| amdgcn \
-			| arc | arceb \
-			| arm | arm[lb]e | arme[lb] | armv* \
-			| avr | avr32 \
-			| asmjs \
-			| ba \
-			| be32 | be64 \
-			| bfin | bpf | bs2000 \
-			| c[123]* | c30 | [cjt]90 | c4x \
-			| c8051 | clipper | craynv | csky | cydra \
-			| d10v | d30v | dlx | dsp16xx \
-			| e2k | elxsi | epiphany \
-			| f30[01] | f700 | fido | fr30 | frv | ft32 | fx80 \
-			| h8300 | h8500 \
-			| hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
-			| hexagon \
-			| i370 | i*86 | i860 | i960 | ia16 | ia64 \
-			| ip2k | iq2000 \
-			| k1om \
-			| le32 | le64 \
-			| lm32 \
-			| loongarch32 | loongarch64 | loongarchx32 \
-			| m32c | m32r | m32rle \
-			| m5200 | m68000 | m680[012346]0 | m68360 | m683?2 | m68k \
-			| m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x \
-			| m88110 | m88k | maxq | mb | mcore | mep | metag \
-			| microblaze | microblazeel \
-			| mips | mipsbe | mipseb | mipsel | mipsle \
-			| mips16 \
-			| mips64 | mips64eb | mips64el \
-			| mips64octeon | mips64octeonel \
-			| mips64orion | mips64orionel \
-			| mips64r5900 | mips64r5900el \
-			| mips64vr | mips64vrel \
-			| mips64vr4100 | mips64vr4100el \
-			| mips64vr4300 | mips64vr4300el \
-			| mips64vr5000 | mips64vr5000el \
-			| mips64vr5900 | mips64vr5900el \
-			| mipsisa32 | mipsisa32el \
-			| mipsisa32r2 | mipsisa32r2el \
-			| mipsisa32r6 | mipsisa32r6el \
-			| mipsisa64 | mipsisa64el \
-			| mipsisa64r2 | mipsisa64r2el \
-			| mipsisa64r6 | mipsisa64r6el \
-			| mipsisa64sb1 | mipsisa64sb1el \
-			| mipsisa64sr71k | mipsisa64sr71kel \
-			| mipsr5900 | mipsr5900el \
-			| mipstx39 | mipstx39el \
-			| mmix \
-			| mn10200 | mn10300 \
-			| moxie \
-			| mt \
-			| msp430 \
-			| nds32 | nds32le | nds32be \
-			| nfp \
-			| nios | nios2 | nios2eb | nios2el \
-			| none | np1 | ns16k | ns32k | nvptx \
-			| open8 \
-			| or1k* \
-			| or32 \
-			| orion \
-			| picochip \
-			| pdp10 | pdp11 | pj | pjl | pn | power \
-			| powerpc | powerpc64 | powerpc64le | powerpcle | powerpcspe \
-			| pru \
-			| pyramid \
-			| riscv | riscv32 | riscv32be | riscv64 | riscv64be \
-			| rl78 | romp | rs6000 | rx \
-			| s390 | s390x \
-			| score \
-			| sh | shl \
-			| sh[1234] | sh[24]a | sh[24]ae[lb] | sh[23]e | she[lb] | sh[lb]e \
-			| sh[1234]e[lb] |  sh[12345][lb]e | sh[23]ele | sh64 | sh64le \
-			| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet \
-			| sparclite \
-			| sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \
-			| spu \
-			| tahoe \
-			| thumbv7* \
-			| tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \
-			| tron \
-			| ubicom32 \
-			| v70 | v850 | v850e | v850e1 | v850es | v850e2 | v850e2v3 \
-			| vax \
-			| visium \
-			| w65 \
-			| wasm32 | wasm64 \
-			| we32k \
-			| x86 | x86_64 | xc16x | xgate | xps100 \
-			| xstormy16 | xtensa* \
-			| ymp \
-			| z8k | z80)
-				;;
-
-			*)
-				echo Invalid configuration \`"$1"\': machine \`"$cpu-$vendor"\' not recognized 1>&2
-				exit 1
-				;;
-		esac
-		;;
-esac
-
-# Here we canonicalize certain aliases for manufacturers.
-case $vendor in
-	digital*)
-		vendor=dec
-		;;
-	commodore*)
-		vendor=cbm
-		;;
-	*)
-		;;
-esac
-
-# Decode manufacturer-specific aliases for certain operating systems.
-
-if test x$basic_os != x
-then
-
-# First recognize some ad-hoc caes, or perhaps split kernel-os, or else just
-# set os.
-case $basic_os in
-	gnu/linux*)
-		kernel=linux
-		os=$(echo $basic_os | sed -e 's|gnu/linux|gnu|')
-		;;
-	os2-emx)
-		kernel=os2
-		os=$(echo $basic_os | sed -e 's|os2-emx|emx|')
-		;;
-	nto-qnx*)
-		kernel=nto
-		os=$(echo $basic_os | sed -e 's|nto-qnx|qnx|')
-		;;
-	*-*)
-		# shellcheck disable=SC2162
-		IFS="-" read kernel os <<EOF
-$basic_os
-EOF
-		;;
-	# Default OS when just kernel was specified
-	nto*)
-		kernel=nto
-		os=$(echo $basic_os | sed -e 's|nto|qnx|')
-		;;
-	linux*)
-		kernel=linux
-		os=$(echo $basic_os | sed -e 's|linux|gnu|')
-		;;
-	*)
-		kernel=
-		os=$basic_os
-		;;
-esac
-
-# Now, normalize the OS (knowing we just have one component, it's not a kernel,
-# etc.)
-case $os in
-	# First match some system type aliases that might get confused
-	# with valid system types.
-	# solaris* is a basic system type, with this one exception.
-	auroraux)
-		os=auroraux
-		;;
-	bluegene*)
-		os=cnk
-		;;
-	solaris1 | solaris1.*)
-		os=$(echo $os | sed -e 's|solaris1|sunos4|')
-		;;
-	solaris)
-		os=solaris2
-		;;
-	unixware*)
-		os=sysv4.2uw
-		;;
-	# es1800 is here to avoid being matched by es* (a different OS)
-	es1800*)
-		os=ose
-		;;
-	# Some version numbers need modification
-	chorusos*)
-		os=chorusos
-		;;
-	isc)
-		os=isc2.2
-		;;
-	sco6)
-		os=sco5v6
-		;;
-	sco5)
-		os=sco3.2v5
-		;;
-	sco4)
-		os=sco3.2v4
-		;;
-	sco3.2.[4-9]*)
-		os=$(echo $os | sed -e 's/sco3.2./sco3.2v/')
-		;;
-	sco*v* | scout)
-		# Don't match below
-		;;
-	sco*)
-		os=sco3.2v2
-		;;
-	psos*)
-		os=psos
-		;;
-	qnx*)
-		os=qnx
-		;;
-	hiux*)
-		os=hiuxwe2
-		;;
-	lynx*178)
-		os=lynxos178
-		;;
-	lynx*5)
-		os=lynxos5
-		;;
-	lynxos*)
-		# don't get caught up in next wildcard
-		;;
-	lynx*)
-		os=lynxos
-		;;
-	mac[0-9]*)
-		os=$(echo "$os" | sed -e 's|mac|macos|')
-		;;
-	opened*)
-		os=openedition
-		;;
-	os400*)
-		os=os400
-		;;
-	sunos5*)
-		os=$(echo "$os" | sed -e 's|sunos5|solaris2|')
-		;;
-	sunos6*)
-		os=$(echo "$os" | sed -e 's|sunos6|solaris3|')
-		;;
-	wince*)
-		os=wince
-		;;
-	utek*)
-		os=bsd
-		;;
-	dynix*)
-		os=bsd
-		;;
-	acis*)
-		os=aos
-		;;
-	atheos*)
-		os=atheos
-		;;
-	syllable*)
-		os=syllable
-		;;
-	386bsd)
-		os=bsd
-		;;
-	ctix* | uts*)
-		os=sysv
-		;;
-	nova*)
-		os=rtmk-nova
-		;;
-	ns2)
-		os=nextstep2
-		;;
-	# Preserve the version number of sinix5.
-	sinix5.*)
-		os=$(echo $os | sed -e 's|sinix|sysv|')
-		;;
-	sinix*)
-		os=sysv4
-		;;
-	tpf*)
-		os=tpf
-		;;
-	triton*)
-		os=sysv3
-		;;
-	oss*)
-		os=sysv3
-		;;
-	svr4*)
-		os=sysv4
-		;;
-	svr3)
-		os=sysv3
-		;;
-	sysvr4)
-		os=sysv4
-		;;
-	ose*)
-		os=ose
-		;;
-	*mint | mint[0-9]* | *MiNT | MiNT[0-9]*)
-		os=mint
-		;;
-	dicos*)
-		os=dicos
-		;;
-	pikeos*)
-		# Until real need of OS specific support for
-		# particular features comes up, bare metal
-		# configurations are quite functional.
-		case $cpu in
-		    arm*)
-			os=eabi
-			;;
-		    *)
-			os=elf
-			;;
-		esac
-		;;
-	*)
-		# No normalization, but not necessarily accepted, that comes below.
-		;;
-esac
-
-else
-
-# Here we handle the default operating systems that come with various machines.
-# The value should be what the vendor currently ships out the door with their
-# machine or put another way, the most popular os provided with the machine.
-
-# Note that if you're going to try to match "-MANUFACTURER" here (say,
-# "-sun"), then you have to tell the case statement up towards the top
-# that MANUFACTURER isn't an operating system.  Otherwise, code above
-# will signal an error saying that MANUFACTURER isn't an operating
-# system, and we'll never get to this point.
-
-kernel=
-case $cpu-$vendor in
-	score-*)
-		os=elf
-		;;
-	spu-*)
-		os=elf
-		;;
-	*-acorn)
-		os=riscix1.2
-		;;
-	arm*-rebel)
-		kernel=linux
-		os=gnu
-		;;
-	arm*-semi)
-		os=aout
-		;;
-	c4x-* | tic4x-*)
-		os=coff
-		;;
-	c8051-*)
-		os=elf
-		;;
-	clipper-intergraph)
-		os=clix
-		;;
-	hexagon-*)
-		os=elf
-		;;
-	tic54x-*)
-		os=coff
-		;;
-	tic55x-*)
-		os=coff
-		;;
-	tic6x-*)
-		os=coff
-		;;
-	# This must come before the *-dec entry.
-	pdp10-*)
-		os=tops20
-		;;
-	pdp11-*)
-		os=none
-		;;
-	*-dec | vax-*)
-		os=ultrix4.2
-		;;
-	m68*-apollo)
-		os=domain
-		;;
-	i386-sun)
-		os=sunos4.0.2
-		;;
-	m68000-sun)
-		os=sunos3
-		;;
-	m68*-cisco)
-		os=aout
-		;;
-	mep-*)
-		os=elf
-		;;
-	mips*-cisco)
-		os=elf
-		;;
-	mips*-*)
-		os=elf
-		;;
-	or32-*)
-		os=coff
-		;;
-	*-tti)	# must be before sparc entry or we get the wrong os.
-		os=sysv3
-		;;
-	sparc-* | *-sun)
-		os=sunos4.1.1
-		;;
-	pru-*)
-		os=elf
-		;;
-	*-be)
-		os=beos
-		;;
-	*-ibm)
-		os=aix
-		;;
-	*-knuth)
-		os=mmixware
-		;;
-	*-wec)
-		os=proelf
-		;;
-	*-winbond)
-		os=proelf
-		;;
-	*-oki)
-		os=proelf
-		;;
-	*-hp)
-		os=hpux
-		;;
-	*-hitachi)
-		os=hiux
-		;;
-	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
-		os=sysv
-		;;
-	*-cbm)
-		os=amigaos
-		;;
-	*-dg)
-		os=dgux
-		;;
-	*-dolphin)
-		os=sysv3
-		;;
-	m68k-ccur)
-		os=rtu
-		;;
-	m88k-omron*)
-		os=luna
-		;;
-	*-next)
-		os=nextstep
-		;;
-	*-sequent)
-		os=ptx
-		;;
-	*-crds)
-		os=unos
-		;;
-	*-ns)
-		os=genix
-		;;
-	i370-*)
-		os=mvs
-		;;
-	*-gould)
-		os=sysv
-		;;
-	*-highlevel)
-		os=bsd
-		;;
-	*-encore)
-		os=bsd
-		;;
-	*-sgi)
-		os=irix
-		;;
-	*-siemens)
-		os=sysv4
-		;;
-	*-masscomp)
-		os=rtu
-		;;
-	f30[01]-fujitsu | f700-fujitsu)
-		os=uxpv
-		;;
-	*-rom68k)
-		os=coff
-		;;
-	*-*bug)
-		os=coff
-		;;
-	*-apple)
-		os=macos
-		;;
-	*-atari*)
-		os=mint
-		;;
-	*-wrs)
-		os=vxworks
-		;;
-	*)
-		os=none
-		;;
-esac
-
-fi
-
-# Now, validate our (potentially fixed-up) OS.
-case $os in
-	# Sometimes we do "kernel-abi", so those need to count as OSes.
-	musl* | newlib* | uclibc*)
-		;;
-	# Likewise for "kernel-libc"
-	eabi | eabihf | gnueabi | gnueabihf)
-		;;
-	# Now accept the basic system types.
-	# The portable systems comes first.
-	# Each alternative MUST end in a * to match a version number.
-	gnu* | android* | bsd* | mach* | minix* | genix* | ultrix* | irix* \
-	     | *vms* | esix* | aix* | cnk* | sunos | sunos[34]* \
-	     | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \
-	     | sym* |  plan9* | psp* | sim* | xray* | os68k* | v88r* \
-	     | hiux* | abug | nacl* | netware* | windows* \
-	     | os9* | macos* | osx* | ios* \
-	     | mpw* | magic* | mmixware* | mon960* | lnews* \
-	     | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \
-	     | aos* | aros* | cloudabi* | sortix* | twizzler* \
-	     | nindy* | vxsim* | vxworks* | ebmon* | hms* | mvs* \
-	     | clix* | riscos* | uniplus* | iris* | isc* | rtu* | xenix* \
-	     | mirbsd* | netbsd* | dicos* | openedition* | ose* \
-	     | bitrig* | openbsd* | solidbsd* | libertybsd* | os108* \
-	     | ekkobsd* | freebsd* | riscix* | lynxos* | os400* \
-	     | bosx* | nextstep* | cxux* | aout* | elf* | oabi* \
-	     | ptx* | coff* | ecoff* | winnt* | domain* | vsta* \
-	     | udi* | lites* | ieee* | go32* | aux* | hcos* \
-	     | chorusrdb* | cegcc* | glidix* \
-	     | cygwin* | msys* | pe* | moss* | proelf* | rtems* \
-	     | midipix* | mingw32* | mingw64* | mint* \
-	     | uxpv* | beos* | mpeix* | udk* | moxiebox* \
-	     | interix* | uwin* | mks* | rhapsody* | darwin* \
-	     | openstep* | oskit* | conix* | pw32* | nonstopux* \
-	     | storm-chaos* | tops10* | tenex* | tops20* | its* \
-	     | os2* | vos* | palmos* | uclinux* | nucleus* | morphos* \
-	     | scout* | superux* | sysv* | rtmk* | tpf* | windiss* \
-	     | powermax* | dnix* | nx6 | nx7 | sei* | dragonfly* \
-	     | skyos* | haiku* | rdos* | toppers* | drops* | es* \
-	     | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \
-	     | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \
-	     | nsk* | powerunix* | genode* | zvmoe* | qnx* | emx*)
-		;;
-	# This one is extra strict with allowed versions
-	sco3.2v2 | sco3.2v[4-9]* | sco5v6*)
-		# Don't forget version if it is 3.2v4 or newer.
-		;;
-	none)
-		;;
-	*)
-		echo Invalid configuration \`"$1"\': OS \`"$os"\' not recognized 1>&2
-		exit 1
-		;;
-esac
-
-# As a final step for OS-related things, validate the OS-kernel combination
-# (given a valid OS), if there is a kernel.
-case $kernel-$os in
-	linux-gnu* | linux-dietlibc* | linux-android* | linux-newlib* | linux-musl* | linux-uclibc* )
-		;;
-	uclinux-uclibc* )
-		;;
-	-dietlibc* | -newlib* | -musl* | -uclibc* )
-		# These are just libc implementations, not actual OSes, and thus
-		# require a kernel.
-		echo "Invalid configuration \`$1': libc \`$os' needs explicit kernel." 1>&2
-		exit 1
-		;;
-	kfreebsd*-gnu* | kopensolaris*-gnu*)
-		;;
-	nto-qnx*)
-		;;
-	os2-emx)
-		;;
-	*-eabi* | *-gnueabi*)
-		;;
-	-*)
-		# Blank kernel with real OS is always fine.
-		;;
-	*-*)
-		echo "Invalid configuration \`$1': Kernel \`$kernel' not known to work with OS \`$os'." 1>&2
-		exit 1
-		;;
-esac
-
-# Here we handle the case where we know the os, and the CPU type, but not the
-# manufacturer.  We pick the logical manufacturer.
-case $vendor in
-	unknown)
-		case $cpu-$os in
-			*-riscix*)
-				vendor=acorn
-				;;
-			*-sunos*)
-				vendor=sun
-				;;
-			*-cnk* | *-aix*)
-				vendor=ibm
-				;;
-			*-beos*)
-				vendor=be
-				;;
-			*-hpux*)
-				vendor=hp
-				;;
-			*-mpeix*)
-				vendor=hp
-				;;
-			*-hiux*)
-				vendor=hitachi
-				;;
-			*-unos*)
-				vendor=crds
-				;;
-			*-dgux*)
-				vendor=dg
-				;;
-			*-luna*)
-				vendor=omron
-				;;
-			*-genix*)
-				vendor=ns
-				;;
-			*-clix*)
-				vendor=intergraph
-				;;
-			*-mvs* | *-opened*)
-				vendor=ibm
-				;;
-			*-os400*)
-				vendor=ibm
-				;;
-			s390-* | s390x-*)
-				vendor=ibm
-				;;
-			*-ptx*)
-				vendor=sequent
-				;;
-			*-tpf*)
-				vendor=ibm
-				;;
-			*-vxsim* | *-vxworks* | *-windiss*)
-				vendor=wrs
-				;;
-			*-aux*)
-				vendor=apple
-				;;
-			*-hms*)
-				vendor=hitachi
-				;;
-			*-mpw* | *-macos*)
-				vendor=apple
-				;;
-			*-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*)
-				vendor=atari
-				;;
-			*-vos*)
-				vendor=stratus
-				;;
-		esac
-		;;
-esac
-
-echo "$cpu-$vendor-${kernel:+$kernel-}$os"
-exit
-
-# Local variables:
-# eval: (add-hook 'before-save-hook 'time-stamp)
-# time-stamp-start: "timestamp='"
-# time-stamp-format: "%:y-%02m-%02d"
-# time-stamp-end: "'"
-# End:
diff --git a/configure.in b/configure.in
deleted file mode 100644
index faf2ac35a..000000000
--- a/configure.in
+++ /dev/null
@@ -1,737 +0,0 @@
-#
-# Process this file with autoconf to produce a configure script
-#
-AC_PREREQ(2.59)
-AC_INIT(tmLQCD, 6.0.0, curbach@gmx.de)
-AC_CONFIG_HEADER(include/tmlqcd_config_internal.h)
-AC_CONFIG_SRCDIR([hmc_tm.c])
-AC_CANONICAL_HOST()
-AC_PREFIX_DEFAULT($HOME)
-AC_ARG_PROGRAM
-
-if test "$host_vendor" = "cray"; then
-  ac_cv_c_bigendian=yes
-fi
-
-AC_PROG_CC
-AC_PROG_CC_C99
-dnl AC_PROG_CC_STDC
-AC_C_CONST
-AC_C_INLINE
-AC_C_RESTRICT
-AC_F77_LIBRARY_LDFLAGS
-AC_CHECK_TOOL(AR, ar, [ar])
-LIBS="$LIBS $FLIBS -lm"
-
-AC_PROG_LEX
-dnl AC_PROG_LEX sets $LEX to ":" if neither lex nor flex are found! 
-if test "$LEX" = ":"; then
-  AC_MSG_ERROR([(F)LEX is required for building read_input.c. Please install it and run configure again.])
-fi
-
-AC_PROG_MAKE_SET
-AC_PROG_RANLIB
-AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC")
-AC_CHECK_PROG(CXXDEP, g++, "g++", "$CXX")
-#(endian="", AC_DEFINE(LITTLE_ENDIAN,1,The endian of the architechture))
-
-# AC_PROG_FC([ifort gfortran])
-# AC_FC_FUNC(testfunc, )
-
-LDFLAGS="$LDFLAGS -L\${HOME}/lib -L\${top_builddir}/lib"
-CCLD=${CC}
-
-# compilation in operator is slowest so we do it first, saves time in parallel compiles
-USESUBDIRS="operator linalg solver monomial buffers cu io meas xchange init rational smearing wrapper"
-
-AC_CHECK_HEADERS([stdint.h],
-[ dnl for inttypes.h and stdint.h for uint_xxx types
-  dnl if successful check for the actual types too
-  AC_CHECK_TYPES([uint16_t, uint32_t, uint64_t],
-                 [],
-                 [AC_MSG_ERROR([stdint.h found but either uint16_t, uint32_t or uint64_t not found]) ]
-                )
-],
-[
-  dnl no inttypes.h or stdint.h found check common unsigned types
-  dnl for sizes and make appropriate decisions in the lime_fixed_types.h file
-  AC_CHECK_SIZEOF(unsigned char)
-  AC_CHECK_SIZEOF(unsigned short)
-  AC_CHECK_SIZEOF(unsigned int)
-  AC_CHECK_SIZEOF(unsigned long)
-  AC_CHECK_SIZEOF(unsigned long long)
-]
-)
-
-AC_MSG_CHECKING(where to find lime)
-AC_ARG_WITH(limedir,
-  AS_HELP_STRING([--with-limedir[=dir]], [search lime in dir [default=./lime]]),
-  lime_dir=$withval, lime_dir="./lime")
-AC_MSG_RESULT($lime_dir)
-LDFLAGS="$LDFLAGS -L${lime_dir}/lib/"
-AC_CHECK_LIB([lime], [limeReaderNextRecord],[],
-              [AC_MSG_ERROR([library liblime is missing or needed function is not available])])
-
-#LIBS="$LIBS $FLIBS -lm"
-
-AC_MSG_CHECKING(whether we want to use lemon)
-AC_ARG_WITH(lemondir,
-            AS_HELP_STRING([--with-lemondir[=dir]], [use lemon, to be found in dir]),
-             [echo $withval
-              LEMON_AVAILABLE=1
-              lemon_dir=$withval
-              LDFLAGS="$LDFLAGS -L${lemon_dir}/lib"
-              AC_CHECK_LIB([lemon],
-                           [lemonReaderNextRecord],
-                           [],
-                           [AC_MSG_ERROR([library liblemon was not found])])],
-             [echo no
-              LEMON_AVAILABLE=0])
-
-AC_MSG_CHECKING(whether we want to use MPI)
-AC_ARG_ENABLE(mpi,
-  AS_HELP_STRING([--enable-mpi], [enable use of mpi [default=yes]]),
-  enable_mpi=$enableval, enable_mpi=yes)
-if test $enable_mpi = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_USE_MPI,1,Compile with MPI support)
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use DDalphaAMG)
-AC_ARG_WITH(DDalphaAMG,
-            AS_HELP_STRING([--with-DDalphaAMG[=dir]], [use DDalphaAMG, to be found in dir]),
-             [echo $withval
-              DDalphaAMG_AVAILABLE=1
-              DDalphaAMG_INTERFACE="DDalphaAMG_interface"
-              AC_DEFINE(DDalphaAMG,1,Using DDalphaAMG)
-              DDalphaAMG_dir=$withval
-              LDFLAGS="$LDFLAGS -L${DDalphaAMG_dir}/lib"
-              INCLUDES="$INCLUDES -I${DDalphaAMG_dir}/include/"
-              AC_CHECK_LIB([DDalphaAMG],
-                           [DDalphaAMG_finalize],
-                           [],
-                           [AC_MSG_ERROR([library DDalphaAMG was not found])])],
-             [echo no
-              DDalphaAMG_AVAILABLE=0
-              DDalphaAMG_INTERFACE="DDalphaAMG_interface"
-              ])
-
-AC_MSG_CHECKING(whether we want to use OpenMP)
-AC_ARG_ENABLE(omp,
-  AS_HELP_STRING([--enable-omp], [enable use of OpenMP [default=yes]]),
-  enable_omp=$enableval, enable_omp=yes)
-if test $enable_omp = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_USE_OMP,1,Compile with OpenMP support)
-  AC_CHECK_HEADERS([omp.h],,[AC_MSG_ERROR([Cannot find OpenMP headers!])])
-  AC_OPENMP
-# -- AC_OPENMP provides a compiler-dependent OPENMP_CFLAGS so we can set it here
-    CFLAGS="$CFLAGS $OPENMP_CFLAGS"
-    CPPFLAGS="$CPPFLAGS $OPENMP_CFLAGS"
-    LDFLAGS="$LDFLAGS $OPENMP_CFLAGS"
-else
-  AC_MSG_RESULT(no)
-fi
-
-fftw_lib=/usr
-AC_MSG_CHECKING(whether we want to use FFTW)
-AC_ARG_ENABLE(fftw,
-  AS_HELP_STRING([--enable-fftw], [enable use of fftw [default=no]]),
-  enable_fftw=$enableval, enable_fftw=no)
-if test $enable_fftw = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(HAVE_FFTW,1,Compile with FFTW support)
-  LIBS="-lfftw3 ${LIBS}"
-elif test $enable_fftw = no; then
-  AC_MSG_RESULT(no)
-else
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(HAVE_FFTW,1,Compile with FFTW support)
-  fftw_lib=${enable_fftw}
-  LDFLAGS="$LDFLAGS -L${fftw_lib}/lib64"
-  LIBS="-lfftw3 ${LIBS}"
-  INCLUDES="-I${fftw_lib}/include ${INCLUDES}"
-fi
-
-if test $enable_mpi = yes; then
-  AC_MSG_CHECKING(which parallelisation to use for MPI)
-  AC_ARG_WITH(mpidimension,
-    AS_HELP_STRING([--with-mpidimension[=n]], [use n dimensional parallelisation [default=1]]),
-    withmpidimension=$withval, withmpidimension=1)
-  if test $withmpidimension = 1; then
-    AC_MSG_RESULT(n=1 [t])
-    AC_DEFINE(PARALLELT,1,One dimensional parallelisation)
-  elif test $withmpidimension = 2; then
-    AC_MSG_RESULT(n=2 [xt])
-    AC_DEFINE(PARALLELXT,1,Two dimensional parallelisation)
-  elif test $withmpidimension = 3; then
-    AC_MSG_RESULT(n=3 [xyt])
-    AC_DEFINE(PARALLELXYT,1,Three dimensional parallelisation)
-  elif test $withmpidimension = 4; then
-    AC_MSG_RESULT(n=4 [xyzt])
-    AC_DEFINE(PARALLELXYZT,1,Four dimensional parallelisation)
-  elif test $withmpidimension = X; then
-    AC_MSG_RESULT(n=1 [x])
-    AC_DEFINE(PARALLELX,1, X parallelisation)
-  elif test $withmpidimension = XY; then
-    AC_MSG_RESULT(n=2 [xy])
-    AC_DEFINE(PARALLELXY,1, XY parallelisation)
-  elif test $withmpidimension = XYZ; then
-    AC_MSG_RESULT(n=3 [xyz])
-    AC_DEFINE(PARALLELXYZ,1, XYZ parallelisation)
-  elif test $withmpidimension = T; then
-    AC_MSG_RESULT(n=1 [t])
-    AC_DEFINE(PARALLELT,1, T parallelisation)
-  elif test $withmpidimension = XT; then
-    AC_MSG_RESULT(n=2 [xt])
-    AC_DEFINE(PARALLELXT,1, XT parallelisation)
-  elif test $withmpidimension = XYT; then
-    AC_MSG_RESULT(n=3 [xyt])
-    AC_DEFINE(PARALLELXYT,1, XYT parallelisation)
-  elif test $withmpidimension = XYZT; then
-    AC_MSG_RESULT(n=4 [xyzt])
-    AC_DEFINE(PARALLELXYZT,1, XYZT parallelisation)
-  else
-    AC_MSG_RESULT(unknown)
-    AC_MSG_ERROR([Only t, xt, xyt, xyzt, x, xy, xyz parallelisation available])
-  fi
-
-  AC_MSG_CHECKING(whether we shall use persistent MPI calls for halfspinor)
-  AC_ARG_WITH([persistentmpi],
-    AS_HELP_STRING([--with-persistentmpi], [use persistent MPI calls for halfspinor [default=no]]),
-    withpersistent=$withval, withpersistent=no)
-  if test $withpersistent = yes; then
-    AC_MSG_RESULT(yes)
-    AC_DEFINE(_PERSISTENT,1,use persistent MPI calls for halfspinor)
-  else
-    AC_MSG_RESULT(no)
-  fi
-
-  AC_MSG_CHECKING(whether we shall use non-blocking MPI calls)
-  AC_ARG_WITH([nonblockingmpi],
-    AS_HELP_STRING([--with-nonblockingmpi], [use non-blocking MPI calls for spinor and gauge [default=yes]]),
-    withnonblock=$withval, withnonblock=yes)
-  if test $withnonblock = yes; then
-    AC_MSG_RESULT(yes)
-    AC_DEFINE(_NON_BLOCKING,1,use non-blocking MPI calls for spinor ang gauge)
-  else
-    AC_MSG_RESULT(no)
-  fi
-fi
-
-AC_MSG_CHECKING([whether we want to fix volume at compiletime])
-AC_ARG_WITH([fixedvolume],
-  AS_HELP_STRING([--with-fixedvolume], [fix volume at compiletime [default=no]]),
-  with_fixvol=$withval, with_fixvol=no)
-if test $with_fixvol = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(FIXEDVOLUME,1,Fixed volume at compiletime)
-  AC_CONFIG_FILES([fixed_volume.h])
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING([whether we want to use KOJAK instrumentalisation])
-AC_ARG_WITH([kojakinst],
-  AS_HELP_STRING([--with-kojakinst], [instrumentalise for KOJAK [default=no]]),
-  with_kojakinst=$withval, with_kojakinst=no)
-if test $with_kojakinst = yes; then
-  AC_MSG_RESULT(yes)
-  CC="kinst-pomp ${CC}"
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use lapack and blas)
-AC_ARG_WITH(lapack,
-  AS_HELP_STRING([--with-lapack], [enable use of lapack [default=yes]]),
-  with_lapack=$withval, with_lapack=yes)
-if test "$with_lapack" = yes; then
-  AC_MSG_RESULT(yes)
-  LAPACKLIB=
-  AC_DEFINE(HAVE_LAPACK,1,lapack available)
-elif test "$with_lapack" != no; then
-  AC_MSG_RESULT(yes)
-  LIBS="$withval $LIBS"
-  with_lapack=yes
-  AC_DEFINE(HAVE_LAPACK,1,lapack available)
-else
-  AC_MSG_RESULT(no)
-  AC_MSG_ERROR([lapack is needed! Will stop here.])
-fi
-
-if test $enable_mpi = yes; then
-  dnl In general one cannot run mpi programs directly
-  dnl thats why we need here cross_compiling=yes
-  dnl for non CRAY
-  if test "$host_vendor" != "cray"; then
-    cross_compiling=yes
-  fi
-fi
-
-dnl for the case of other configure scripts
-dnl AC_CONFIG_SUBDIRS( rng )
-
-dnl check for clock_gettime and set correct library flag if one is required
-dnl (this is done by AC_CHECK_LIB)
-AC_CHECK_FUNCS(clock_gettime, [], [AC_CHECK_LIB(rt, clock_gettime)])
-
-dnl in principle clock_gettime and CLOCK_MONOTONIC/CLOCK_REALTIME should be available
-dnl only when using POSIX 199309, we set this explicitly here
-dnl this should not cause problems on any relatively modern (post y2k) machine!
-if ( test "$ac_cv_lib_rt_clock_gettime" = "yes" || test "$ac_cv_func_clock_gettime" = "yes" ); then
-  AC_DEFINE(HAVE_CLOCK_GETTIME,1)
-dnl  we set this in gettime.c explicitly for the time being 
-dnl  due to endian problem on BG/Q
-dnl  CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=199309L"
-  AC_MSG_NOTICE([Instructing the compiler to use POSIX 199309L])
-fi
-
-dnl Checks for lapack and defines proper name mangling scheme for
-dnl linking with f77 code
-AC_F77_FUNC(zheev)
-if test "$zheev" = "zheev"; then
-  AC_DEFINE(NOF77_,1,Fortran has no extra _)
-fi
-AC_SEARCH_LIBS([$zheev],[lapack], [], [AC_MSG_ERROR([Cannot find lapack])])
-
-dnl Checks for header files.
-AC_HEADER_STDC
-AC_CHECK_HEADERS([float.h libintl.h limits.h stdint.h stdlib.h string.h strings.h sys/time.h unistd.h endian.h])
-AC_CHECK_HEADER( getopt.h, [])
-
-dnl Checks for typedefs, structures, and compiler characteristics.
-AC_C_CONST
-AC_TYPE_OFF_T
-AC_TYPE_SIZE_T
-AC_HEADER_TIME
-
-dnl Checks for library functions.
-AC_SYS_LARGEFILE
-AC_FUNC_FSEEKO
-AC_FUNC_MALLOC
-AC_TYPE_SIGNAL
-AC_CHECK_FUNCS([gettimeofday pow sqrt])
-
-dnl We now define some replacement variables
-AC_SUBST(OPTARGS)
-AC_SUBST(SOPTARGS)
-AC_SUBST(INCLUDES)
-AC_SUBST(AUTOCONF)
-AC_SUBST(SOLVEROUT)
-AC_SUBST(CCDEP)
-AC_SUBST(CXXDEP)
-AC_SUBST(CCLD)
-AC_SUBST(DEPFLAGS)
-AC_SUBST(CXXDEPFLAGS)
-AC_SUBST(DEBUG_FLAG)
-AC_SUBST(PROFILE_FLAG)
-AC_SUBST(XCHANGELIB)
-AC_SUBST(XCHANGEDIR)
-AC_SUBST(MEASDIR)
-AC_SUBST(XLIB)
-AC_SUBST([LEMON_AVAILABLE])
-AC_SUBST(QUDA_INTERFACE)
-AC_SUBST(QPHIX_INTERFACE)
-AC_SUBST(QPHIX_PROGRAMS)
-AC_SUBST(DDalphaAMG_INTERFACE)
-
-INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/  -I\${abs_top_builddir}/include/ -I\${abs_top_srcdir}/ -I\${abs_top_srcdir}/include/ -I${lime_dir}/include/ -I${lemon_dir}/include/"
-DEPFLAGS="$DEPFLAGS"
-
-AC_MSG_CHECKING(what alignment we want for arrays)
-AC_ARG_ENABLE(alignment,
-  [AS_HELP_STRING([--enable-alignment[=n]], [Automatically or expliclty align arrays to byte number: auto, none, 16, 32, 64 [default=auto]])],
-  withalign=$enableval, withalign=auto)
-if test "$withalign" = "none"; then
-  AC_MSG_RESULT(none)
-  withalign=1
-  AC_DEFINE(ALIGN_BASE, 0x00, [Align base])
-  AC_DEFINE(ALIGN, [])
-  AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32])
-  AC_DEFINE(ALIGN32, [], [])
-elif test $withalign = 16; then
-  AC_MSG_RESULT(16 bytes)
-  AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-  AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
-  AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
-  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))], [])
-elif test $withalign = 32; then
-  AC_MSG_RESULT(32 bytes)
-  AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
-  AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
-  AC_DEFINE(ALIGN_BASE32, 0x1F, [Align base32])
-  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (32)))], [])
-elif test $withalign = 64; then
-  AC_MSG_RESULT(64 bytes)
-  AC_DEFINE(ALIGN_BASE, 0x3F, [Align base])
-  AC_DEFINE(ALIGN, [__attribute__ ((aligned (64)))])
-  AC_DEFINE(ALIGN_BASE32, 0x3F, [Align base32])
-  AC_DEFINE(ALIGN32, [__attribute__ ((aligned (64)))], [])
-elif test $withalign = auto; then
-  withautoalign=1
-  AC_MSG_RESULT(auto)
-  AC_DEFINE(ALIGN_BASE, 0x00, [Align base])
-  AC_DEFINE(ALIGN, [], [])
-  AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32])
-  AC_DEFINE(ALIGN32, [], [])
-else
-  AC_MSG_RESULT(Unusable value for array alignment)
-  AC_MSG_ERROR([Allowed values are: auto, none, 16, 32, 64])
-fi
-
-dnl We here check for alignment issues with QPX instructions -- this flag has been set earlier
-if test $enable_qpx = yes; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 32; then
-      AC_MSG_RESULT(increasing array alignment to 32 bytes for use of QPX instructions on BG/Q)
-      AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
-      AC_MSG_RESULT(increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q)
-      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
-      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
-      withautoalign=32
-    fi
-  elif test $withalign -lt 32; then
-    AC_MSG_ERROR([alignment incompatible with QPX instructions (32 bytes required)])
-  fi
-fi
-
-dnl Check for alignment associated with (non-QPX) BG optimization.
-dnl This will also result in using 32 byte alignment on MareNostrum, but that should be fairly innocuous.
-if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 16; then
-      AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/L optimization)
-      AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
-      withautoalign=16
-    fi
-  fi
-elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 16; then
-      AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/P optimization)
-      AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
-      withautoalign=16
-    fi
-  fi
-elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then
-  if test $withalign = auto; then
-    if test $withautoalign -lt 32; then
-      AC_MSG_RESULT(increasing array alignment to 32 bytes for BG/Q and generic POWER optimization)
-      AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
-      AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
-      AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization)
-      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base])
-      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
-      withautoalign=32
-    fi
-  fi
-fi
-
-AC_MSG_CHECKING(whether we want to use gprof as profiler)
-AC_ARG_WITH(gprof,
-  AS_HELP_STRING([--with-gprof], [use of gprof profiler [default=no]]),
-  enable_gprof=$withval, enable_gprof=no)
-if test $enable_gprof = yes; then
-  AC_MSG_RESULT(yes)
-    if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
-      PROFILE_FLAG="-pg -qfullpath -g"
-    else
-      PROFILE_FLAG="-pg -g"
-    fi
-else
-  AC_MSG_RESULT(no)
-  PROFILE_FLAG=
-fi
-
-dnl Now we have to set all Flags and compiler properly
-PGCC=`$CC -V 2>&1 | grep pgcc`
-ICC=`$CC -V 2>&1 | grep -i intel`
-
-dnl first for PC's
-if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
-dnl the GNU compiler
-  if test "$GCC" = yes && test "$ICC" = ""; then
-    DEPFLAGS="-MM"
-    CFLAGS="$CFLAGS -pedantic -Wall"
-    OPTARGS='-O'
-    SOPTARGS='-O'
-
-    if test "$host_cpu" = "x86_64"; then
-      AC_DEFINE(_x86_64,1,x86 64 Bit architecture)
-    fi
-    CCDEP="$CC"
-    if test $enable_mpi = yes; then
-      CCDEP="gcc"
-    fi
-    CXXDEP="$CXX"
-    if test $enable_mpi = yes; then
-      CXXDEP="g++"
-    fi
-    DEBUG_FLAG="-g"
-dnl other compilers
-  else
-dnl check for pgcc
-    if test "$PGCC" != ""; then
-      DEPFLAGS="-M"
-      echo "We are using the Portland Group C compiler!"
-      OPTARGS="-O2"
-      SOPTARGS="-O2"
-      DEBUG_FLAG="-g"
-      PROFILE_FLAG="-p -g"
-      CCDEP="$CC"
-
-dnl check for icc
-    elif test "$ICC" != ""; then
-      echo "We are using the Intel C compiler!"
-      DEPFLAGS="-M"
-      OPTARGS="-O3"
-      SOPTARGS="-O3"
-      DEBUG_FLAG="-g"
-      PROFILE_FLAG="-p -g"
-      CCDEP="$CC"
-      CXXDEP="$CXX"
-    else
-      DEPFLAGS="-M"
-      CFLAGS="$CFLAGS -O"
-      DEBUG_FLAG="-g"
-      CCDEP="$CC"
-      CXXDEP="$CXX"
-    fi
-  fi
-# The CRAY
-elif test "$host_vendor" = "cray"; then
-  echo
-  echo "Hey, we are on a cray, you should take some time for this..."
-  echo "get yourself a coffee or so!"
-  echo
-  CFLAGS="$CFLAGS -dp"
-  AC_DEFINE(CRAY,1,We are on a CRAY)
-  OPTARGS="-O3"
-  SOPTARGS="-O3"
-  DEBUG_FLAG="-g"
-  CCDEP="$CC"
-  DEPFLAGS="-M"
-else
-  AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC")
-  if test "$CCDEP" = "gcc"; then
-    DEPFLAGS="-MM"
-  else
-    DEPFLAGS="-M"
-  fi
-  OPTARGS=
-  SOPTARGS=
-fi
-
-CXXDEPFLAGS="$DEPFLAGS --std=c++11"
-
-AC_MSG_CHECKING(whether we want to switch on optimisation)
-AC_ARG_ENABLE(optimize,
-  AS_HELP_STRING([--enable-optimize], [enable optimisation [default=yes]]),
-  enable_optimize=$enableval, enable_optimize=yes)
-if test $enable_optimize = no; then
-  AC_MSG_RESULT(no)
-  OPTARGS=
-  SOPTARGS=
-else
-  AC_MSG_RESULT(yes)
-fi
-
-AC_MSG_CHECKING(whether we want to use a copy of the gauge field)
-AC_ARG_ENABLE(gaugecopy,
-  AS_HELP_STRING([--enable-gaugecopy], [enable use of a copy of the gauge field [default=yes]]),
-  enable_gaugecopy=$enableval, enable_gaugecopy=yes)
-if test $enable_gaugecopy = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(_GAUGE_COPY,1,Construct an extra copy of the gauge fields)
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use a Dirac Op. with halfspinor exchange)
-AC_ARG_ENABLE(halfspinor,
-  AS_HELP_STRING([--enable-halfspinor], [use a Dirac Op. with halfspinor exchange [default=yes]]),
-  enable_halfspinor=$enableval, enable_halfspinor=yes)
-if test $enable_halfspinor = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(_USE_HALFSPINOR,1,Exchange only a halfspinor in the Dirac Operator)
-  if test $enable_gaugecopy = no; then
-    AC_MSG_WARN([switching on gaugecopy for Dirac operator with halfspinor!])
-    AC_DEFINE(_GAUGE_COPY,1,Construct an extra copy of the gauge fields)
-  fi
-else
-  AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(whether we want to use shmem API)
-AC_ARG_ENABLE(shmem,
-  AS_HELP_STRING([--enable-shmem],[use shmem API [default=no]]),
-  enable_shmem=$enableval, enable_shmem=no)
-if test $enable_shmem = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(_USE_SHMEM,1,Use shmem API)
-  LIBS="$LIBS -lsma"
-else
-  AC_MSG_RESULT(no)
-fi
-
-
-AC_SUBST(USESUBDIRS)
-
-AC_MSG_CHECKING(whether we want to use CUDA)
-AC_ARG_WITH(cudadir,
-            AS_HELP_STRING([--with-cudadir[=dir]], [use CUDA library (specify 'lib' directory)]),
-             [AC_MSG_RESULT($withval)
-              CUDA_AVAILABLE=1
-              cuda_dir=$withval
-              LDFLAGS="$LDFLAGS -L${cuda_dir} -lcuda"
-              AC_CHECK_LIB([cudart],
-                           [cudaMalloc],
-                           [],
-                           [AC_MSG_ERROR([Can't link a simple program against library cudart.])])],
-             [AC_MSG_RESULT(no)
-              CUDA_AVAILABLE=0])
-
-AC_MSG_CHECKING(whether we want to use HIP)
-AC_ARG_WITH(hipdir,
-            AS_HELP_STRING([--with-hipdir[=dir]], [use HIP library (specify 'lib' directory)]),
-             [AC_MSG_RESULT($withval)
-              HIP_AVAILABLE=1
-              hip_dir=$withval
-              LDFLAGS="$LDFLAGS -L${hip_dir} -lamdhip64"
-              AC_CHECK_LIB([amdhip64],
-                           [hipMalloc],
-                           [],
-                           [AC_MSG_ERROR([Can't link a simple program against library amdhip64.])])],
-             [AC_MSG_RESULT(no)
-              HIP_AVAILABLE=0])
-
-
-# QUDA library for GPUs
-AC_MSG_CHECKING(whether we want to use QUDA)
-AC_ARG_WITH(qudadir,
-            AS_HELP_STRING([--with-qudadir[=dir]], [use QUDA library (specify directory which contains 'include' and 'lib' subdirs)]),
-             [AC_MSG_RESULT($withval)
-              if test $CUDA_AVAILABLE -ne 1 && test $HIP_AVAILABLE -ne 1; then
-                AC_MSG_ERROR([Need either CUDA or HIP to link against QUDA!])
-              fi
-              QUDA_AVAILABLE=1
-              AC_DEFINE(TM_USE_QUDA,1,Using QUDA GPU)
-              quda_dir=$withval
-              LDFLAGS="$LDFLAGS -L${quda_dir}/lib"
-              INCLUDES="$INCLUDES -I${quda_dir}/include/"
-              QUDA_INTERFACE="quda_interface"
-              AC_CHECK_LIB([quda],
-                           [freeGaugeQuda],
-                           [],
-                           [AC_MSG_ERROR([Can't link a simple program against library libquda. (Did you set CXX properly?)])]
-                           )
-              #QUDA needs to be linked with C++ linker
-              CCLD=${CXX}
-             ],
-             [AC_MSG_RESULT(no)
-              QUDA_AVAILABLE=0
-              QUDA_INTERFACE=""
-              ]
-            )
-AC_SUBST([QUDA_AVAILABLE])
-
-AC_MSG_CHECKING(whether the QUDA version is experimental)
-AC_ARG_ENABLE(quda_experimental,
-  AS_HELP_STRING([--enable-quda_experimental], [enable support for experimental QUDA versions [default=no]]),
-  enable_quda_experimental=$enableval, enable_quda_experimental=no)
-if test $enable_quda_experimental = yes; then
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_QUDA_EXPERIMENTAL,1,Experimental QUDA version in use)
-else
-  AC_MSG_RESULT(no)
-fi
-AC_MSG_CHECKING(whether the QUDA force is enabled)
-AC_ARG_ENABLE(quda_fermionic_forces,
-  AS_HELP_STRING([--enable-quda_fermionic_forces], [enable support for fermionic forces using QUDA [default=yes]]),
-  enable_quda_fermionic_forces=$enableval, enable_quda_fermionic_forces=yes)
-if test $enable_quda_fermionic_forces = no; then
-  AC_MSG_RESULT(no)
-else
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(TM_QUDA_FERMIONIC_FORCES,1, fermionic forces with QUDA are enabled)
-fi
-
-# QPhiX library for Intel Xeon and Xeon Phis
-AC_MSG_CHECKING(whether we want to use QPhiX)
-AC_ARG_WITH(qphixdir,
-            AS_HELP_STRING([--with-qphixdir[=dir]], [use QPhiX, to be found in dir]),
-             [echo yes
-              QPHIX_AVAILABLE=1
-              AC_DEFINE(TM_USE_QPHIX,1,Using QPhiX)
-              qphix_dir=$withval
-              LDFLAGS="$LDFLAGS -L${qphix_dir}/lib -lqphix_solver -lqphix_codegen"
-              INCLUDES="$INCLUDES -I${qphix_dir}/include/" 
-              QPHIX_INTERFACE="qphix_interface"
-              QPHIX_PROGRAMS=""
-              # Due to github issue #404, the qphix test_Dslash code has been disabled by BaKo
-              # for the time being
-              # it should be updated to make use of the QPhiX internal interfaces
-              # for passing full lattice spinors
-              # "qphix_test_Dslash"
-
-              # QMP: TODO AC_CHECK_LIB
-              AC_MSG_CHECKING([where to search for QMP libs])
-              AC_ARG_WITH(qmpdir,
-                          AS_HELP_STRING([--with-qmpdir[=dir]], [if using QPhiX, then set QMP lib dir]),
-                          qmp_dir=$withval
-                          LDFLAGS="$LDFLAGS -L${qmp_dir}/lib -lqmp"
-                          INCLUDES="$INCLUDES -I${qmp_dir}/include/"
-                          )
-              AC_MSG_RESULT($qmp_dir)
-
-              AC_MSG_CHECKING([Setting QPhiX SOALEN])
-              AC_ARG_ENABLE(qphix-soalen,
-                            AS_HELP_STRING([--enable-qphix-soalen], [if using QPhiX, set SOALEN [default=4]]),
-                            enable_qphix_soalen=$enableval, enable_qphix_soalen=4)
-              AC_MSG_RESULT($enable_qphix_soalen)
-              AC_DEFINE_UNQUOTED(QPHIX_SOALEN, ${enable_qphix_soalen}, Structure of Array length to use with QPhiX)
-
-              AC_PROG_CXX
-              #QPhiX needs to be linked with C++ linker
-              CCLD=${CXX}
-             ],
-             [echo no
-              QPHIX_AVAILABLE=0
-              QPHIX_INTERFACE=""])
-AC_SUBST([QPHIX_AVAILABLE])
-
-if test ! -e lib; then
-  mkdir lib
-fi
-
-dnl create the test and tests directory here
-if test ! -e test; then
-  mkdir test
-fi
-
-if test ! -e tests; then
-  mkdir tests
-fi
-
-if test ! -e tests/regressions; then
-  mkdir tests/regressions
-fi
-
-
-LIBS="-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio $LIBS"
-AUTOCONF=autoconf
-
-for i in $USESUBDIRS
-do
-  make_files="$make_files $i/Makefile"
-done
-
-AC_CONFIG_FILES([Makefile $make_files])
-
-AC_OUTPUT
diff --git a/src/lib/profiling/hmc/Readme.md b/profiling/hmc/Readme.md
similarity index 100%
rename from src/lib/profiling/hmc/Readme.md
rename to profiling/hmc/Readme.md
diff --git a/src/lib/profiling/hmc/example_profile.pdf b/profiling/hmc/example_profile.pdf
similarity index 100%
rename from src/lib/profiling/hmc/example_profile.pdf
rename to profiling/hmc/example_profile.pdf
diff --git a/src/lib/profiling/hmc/profile.Rmd b/profiling/hmc/profile.Rmd
similarity index 100%
rename from src/lib/profiling/hmc/profile.Rmd
rename to profiling/hmc/profile.Rmd
diff --git a/src/lib/profiling/hmc/timing.R b/profiling/hmc/timing.R
similarity index 100%
rename from src/lib/profiling/hmc/timing.R
rename to profiling/hmc/timing.R
diff --git a/src/lib/profiling/hmc_mk2/.gitignore b/profiling/hmc_mk2/.gitignore
similarity index 100%
rename from src/lib/profiling/hmc_mk2/.gitignore
rename to profiling/hmc_mk2/.gitignore
diff --git a/src/lib/profiling/hmc_mk2/README.md b/profiling/hmc_mk2/README.md
similarity index 100%
rename from src/lib/profiling/hmc_mk2/README.md
rename to profiling/hmc_mk2/README.md
diff --git a/src/lib/profiling/hmc_mk2/logs/example_log.out b/profiling/hmc_mk2/logs/example_log.out
similarity index 99%
rename from src/lib/profiling/hmc_mk2/logs/example_log.out
rename to profiling/hmc_mk2/logs/example_log.out
index faf4874bf..22ec86ec9 100644
--- a/src/lib/profiling/hmc_mk2/logs/example_log.out
+++ b/profiling/hmc_mk2/logs/example_log.out
@@ -270,8 +270,8 @@ operator 0 parsed line 229
 This is the hmc code for twisted mass Wilson QCD
 
 Version 5.2.0, commit 51cf008a89944ecdd9345cdb62aaf0a203a7f306
-# The code is compiled with -D_GAUGE_COPY
-# The code is compiled with -D_USE_HALFSPINOR
+# The code is compiled with -DTM_GAUGE_COPY
+# The code is compiled with -DTM_USE_HALFSPINOR
 # the code is compiled for non-blocking MPI calls (spinor and gauge)
 # the code is compiled with openMP support
 # Non-Schroedinger (anti-periodic, periodic or twisted) boundary conditions are used
diff --git a/src/lib/profiling/hmc_mk2/make_profile.R b/profiling/hmc_mk2/make_profile.R
similarity index 100%
rename from src/lib/profiling/hmc_mk2/make_profile.R
rename to profiling/hmc_mk2/make_profile.R
diff --git a/src/lib/profiling/hmc_mk2/profile.Rmd b/profiling/hmc_mk2/profile.Rmd
similarity index 100%
rename from src/lib/profiling/hmc_mk2/profile.Rmd
rename to profiling/hmc_mk2/profile.Rmd
diff --git a/qphix_base_classes.hpp b/qphix_base_classes.hpp
deleted file mode 100644
index 26015e3a2..000000000
--- a/qphix_base_classes.hpp
+++ /dev/null
@@ -1,771 +0,0 @@
-// Copyright © 2017 Martin Ueding <dev@martin-ueding.de>
-// Licensed unter the [BSD-3-Clause](https://opensource.org/licenses/BSD-3-Clause).
-
-// Due to github issue #404, the helper functions to apply the full QPhiX operator
-// are currently disabled because they conflict with the new interfaces in QPhiX
-// itself. If required, these should be rewritten to use these interfaces
-// rather than the base classes in qphix_base_classes.hpp
-
-// This file should be deprecated or updated to provide any functionality
-// not covered by QPhiX itself.
-
-/**
-  \file Additions to QPhiX that are only needed for tmLQCD.
-
-  In the original QPhiX, there are only Wilson fermions and Wilson clover
-  fermions. The Dslash operators have a different call signature (the latter
-  requiring a clover term), so there is no common base class. With the addition
-  of Wilson twisted mass (Mario) and Wilson twisted clover (Peter), there are
-  now two instances of the Dslash that have the same signature. In order to
-  write a more general even-odd source preparation and solution reconstruction
-  code, a common base class for non-clover and clover is desired. In order to
-  leave the QPhiX code untouched (for now), this code lives here in tmLQCD.
-  */
-
-#pragma once
-
-#include <qphix/blas_new_c.h>
-#include <qphix/clover_dslash_def.h>
-#include <qphix/dslash_def.h>
-#include <qphix/geometry.h>
-#include <qphix/tm_clov_dslash_def.h>
-#include <qphix/tm_dslash_def.h>
-
-#include <cassert>
-
-namespace tmlqcd {
-
-namespace {
-size_t constexpr re = 0;
-size_t constexpr im = 1;
-int const n_blas_simt = 1;
-
-// The even checkerboard is given by ( (x + y + z + t ) & 1 == 0 ) -> cb0 is even
-int constexpr cb_even = 0;
-int constexpr cb_odd = 1;
-}
-
-/**
-  Complex multiplication accumulate.
-
-  Computes \f$ (r + \mathrm i i) += (a + \mathrm i b) * (c + \mathrm i d) \f$.
-  */
-template <typename FT>
-void cplx_mul_acc(FT &r_out, FT &i_out, FT const &a, FT const &b, FT const &c, FT const &d) {
-  r_out += a * c - b * d;
-  i_out += a * d + b * c;
-}
-
-/**
-  Wrapper for the clover multiplication function.
-
-  The `struct` is needed in order to allow for partial template specialization in the `Clover`
-  parameter.
-
-  \tparam Clover Type of clover block to use, must be a type from Geometry such that there exists a
-  specialization for it.
-  */
-template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
-struct InnerCloverProduct {
-  /**
-  Multiplies the clover term for a single lattice size to a spinor.
-
-  This function is intended to be used in a loop over all lattice sites. It is expected from the
-  caller to have figured out all the correct indices. There are template specializations for the two
-  different types of clover term that are used in QPhiX.
-
-  \param[out] out Output spinor block. It is assumed to be zeroed properly, the function will just
-  accumulate values into that output variable. Use \ref QPhiX::zeroSpinor for that.
-  \param[in] in Input spinor block.
-  \param[in] clover Single clover block that contains the lattice site of the spinor.
-  \param[in] xi SIMD index for the arrays with length `soalen`, as in the spinors.
-  \param[in] veclen_idx SIMD index for the arrays with length `veclen`, as in the clover term.
-  */
-  static void multiply(
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &out,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &in,
-      Clover const &clover, int const xi, int const veclen_idx);
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-struct InnerCloverProduct<FT, veclen, soalen, compress12,
-                          typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock> {
-  static void multiply(
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock const &clov_block,
-      int const xi, int const veclen_idx) {
-    // The clover term is block-diagonal in spin. Therefore we need
-    // to iterate over the two blocks of spin.
-    for (auto s_block : {0, 1}) {
-      // Extract the diagonal and triangular parts.
-      auto const &diag_in = s_block == 0 ? clov_block.diag1 : clov_block.diag2;
-      auto const &off_diag_in = s_block == 0 ? clov_block.off_diag1 : clov_block.off_diag2;
-      // Input two-spinor component.
-      for (auto two_s_in : {0, 1}) {
-        // Reconstruct four spinor index.
-        auto const four_s_in = 2 * s_block + two_s_in;
-        // Output two-spinor component.
-        for (auto two_s_out : {0, 1}) {
-          // Reconstruct four spinor index.
-          auto const four_s_out = 2 * s_block + two_s_out;
-          // Input color.
-          for (auto c_in : {0, 1, 2}) {
-            // Spin-color index (0, ..., 5).
-            auto const sc_in = 3 * two_s_in + c_in;
-            // Output color.
-            for (auto c_out : {0, 1, 2}) {
-              // Spin-color index (0, ..., 5).
-              auto const sc_out = 3 * two_s_out + c_out;
-
-              // See `qphix-codegen` file `dslash_common.cc`
-              // function
-              // `clover_term` for the index manipulations done
-              // here.
-
-              // Using separate loops over the actual indices is
-              // probably
-              // faster than the branching in the innermost loop.
-
-              if (sc_out == sc_in) {
-                cplx_mul_acc(spinor_out[c_out][four_s_out][re][xi],
-                             spinor_out[c_out][four_s_out][im][xi], diag_in[sc_in][veclen_idx],
-                             QPhiX::rep<FT,double>(0.0), spinor_in[c_in][four_s_in][re][xi],
-                             spinor_in[c_in][four_s_in][im][xi]);
-              } else if (sc_out < sc_in) {
-                auto const idx15 = sc_in * (sc_in - 1) / 2 + sc_out;
-                cplx_mul_acc(
-                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
-                    off_diag_in[idx15][re][veclen_idx],
-                    // aww hell, maybe one should just add negation to QPhiX::half ?
-                    QPhiX::rep<FT,double>(-QPhiX::rep<double,FT>(off_diag_in[idx15][im][veclen_idx])),
-                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
-              } else {
-                auto const idx15 = sc_out * (sc_out - 1) / 2 + sc_in;
-                cplx_mul_acc(
-                    spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
-                    off_diag_in[idx15][re][veclen_idx], off_diag_in[idx15][im][veclen_idx],
-                    spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-struct InnerCloverProduct<
-    FT, veclen, soalen, compress12,
-    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock> {
-  static void multiply(
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock &spinor_out,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const &spinor_in,
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock const &clov_block,
-      int const xi, int const veclen_idx) {
-    // The clover term is block-diagonal in spin. Therefore we need
-    // to iterate over the two blocks of spin.
-    for (auto s_block : {0, 1}) {
-      // handy reference to half-spinor block
-      auto const &block_in = s_block == 0 ? clov_block.block1 : clov_block.block2;
-      // Input two-spinor component.
-      for (auto two_s_in : {0, 1}) {
-        // Reconstruct four spinor index.
-        auto const four_s_in = 2 * s_block + two_s_in;
-        // Output two-spinor component.
-        for (auto two_s_out : {0, 1}) {
-          // Reconstruct four spinor index.
-          auto const four_s_out = 2 * s_block + two_s_out;
-          // Input color.
-          for (auto c_in : {0, 1, 2}) {
-            // Spin-color index (0, ..., 5).
-            auto const sc_in = 3 * two_s_in + c_in;
-            // Output color.
-            for (auto c_out : {0, 1, 2}) {
-              // Spin-color index (0, ..., 5).
-              auto const sc_out = 3 * two_s_out + c_out;
-
-              cplx_mul_acc(
-                  spinor_out[c_out][four_s_out][re][xi], spinor_out[c_out][four_s_out][im][xi],
-                  block_in[sc_out][sc_in][re][veclen_idx], block_in[sc_out][sc_in][im][veclen_idx],
-                  spinor_in[c_in][four_s_in][re][xi], spinor_in[c_in][four_s_in][im][xi]);
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/**
-  Multiplies a checkerboarded QPhiX Clover term with a checkerboarded QPhiX spinor.
-
-  Padding is taken care of. A test case for (a copy of) this function exists in QPhiX.
-
-  If the preprocessor macro `PRINT_MAPPING` is defined, it will print out the mapping of `(x, y, z,
-  t)` coordinates to block indices. Also it will check that each block is accessed the proper number
-  of times, that is `soalen` for spinors and `veclen` for clover blocks.
-
-  \param[out] out Output spinor
-  \param[in] in Input spinor
-  \param[in] clover Clover block
-  \param[in] geom Geometry object holding the dimension of clover and spinor
-  */
-template <typename FT, int veclen, int soalen, bool compress12, typename Clover>
-void clover_product(
-    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock *const out,
-    typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock const *const in,
-    Clover *clover, ::QPhiX::Geometry<FT, veclen, soalen, compress12> &geom) {
-  ::QPhiX::zeroSpinor<FT, veclen, soalen, compress12>(out, geom, n_blas_simt);
-
-#ifdef PRINT_MAPPING
-  std::vector<int> spin_touches(geom.getPxyz() * geom.Nt(), 0);
-  std::vector<int> clover_touches(geom.getPxyz() * geom.Nt() * soalen / veclen, 0);
-
-  std::cout << std::setw(3) << "x" << std::setw(3) << "y" << std::setw(3) << "z" << std::setw(3)
-            << "t"
-            << ":" << std::setw(5) << "spin" << std::setw(5) << "clov"
-            << "\n";
-#endif
-
-  // Iterate through all the block.
-  for (int t = 0; t < geom.Nt(); ++t) {
-    for (int z = 0; z < geom.Nz(); ++z) {
-      for (int y = 0; y < geom.Ny(); ++y) {
-        for (int x = 0; x < geom.Nxh(); ++x) {
-          // First element in the current XY plane at desired Z and T.
-          auto const xyBase = t * geom.getPxyz() + z * geom.getPxy();
-          // Index of the SoA along the X direction.
-          auto const xb = x / soalen;
-          // Index within the SoA.
-          auto const xi = x % soalen;
-          // Global spin block index.
-          auto const spin_block_idx = xb + geom.Nxh() / soalen * y + xyBase;
-          // Global clover/gauge block index.
-          auto const clov_block_idx =
-              xb + (y / geom.nGY()) * geom.Nxh() / soalen + xyBase / geom.nGY();
-          // Index of the SoA structure within the current tile.
-          // auto const tile = (geom.Nxh() / soalen * y + xyBase) % geom.nGY();
-          auto const tile = y % geom.nGY();
-          // Vector index for clover/gauge. The SoA index only runs to
-          // `soalen`, this index needs to run to `veclen`, that is across the
-          // various SoA within the tile.
-          auto const veclen_idx = soalen * tile + xi;
-
-#ifdef PRINT_MAPPING
-          ++spin_touches[spin_block_idx];
-          ++clover_touches[clov_block_idx];
-
-          std::cout << std::setw(3) << x << std::setw(3) << y << std::setw(3) << z << std::setw(3)
-                    << t << ":" << std::setw(5) << spin_block_idx << std::setw(5) << clov_block_idx
-                    << "\n";
-#endif
-
-          assert(xi + xb * soalen == x);
-
-          // References to the objects at desired block.
-          auto const &clov_block = clover[clov_block_idx];
-          auto const &spinor_in = in[spin_block_idx];
-          auto &spinor_out = out[spin_block_idx];
-
-          InnerCloverProduct<FT, veclen, soalen, compress12, Clover>::multiply(
-              spinor_out, spinor_in, clov_block, xi, veclen_idx);
-        }
-      }
-    }
-  }
-
-#ifdef PRINT_MAPPING
-  std::cout << std::flush;
-
-  // Make sure that each block got touched the correct number of times.
-  for (int i = 0; i != spin_touches.size(); ++i) {
-    if (spin_touches[i] != soalen) {
-      std::cout << "Spin missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
-                << spin_touches[i] << " times instead of " << soalen << "\n";
-    }
-  }
-
-  for (int i = 0; i != clover_touches.size(); ++i) {
-    if (clover_touches[i] != veclen) {
-      std::cout << "Clover missmatch: Block " << std::setw(4) << i << " accessed " << std::setw(4)
-                << clover_touches[i] << " times instead of " << veclen << "\n";
-    }
-  }
-
-  std::cout << std::flush;
-#endif
-}
-
-/**
-  Abstract base class for all single-flavor Dslash variants.
-
-  There are four Dslash operators which are implemented in QPhiX:
-
-  - Wilson
-  - Wilson clover
-  - Wilson twisted mass
-  - Wilson clover with twisted mass
-
-  Each of these has a the actual Dslash operation and a so-called “achimbdpsi” operation. These act
-  on four-spinors given a gauge field. This base class provides a uniform interface to all four
-  kinds.
-
-  This code should eventually be migrated into the QPhiX repository. Currently these classes are
-  mere delegators. In the QPhiX repository, the actual classes there should be used as concrete
-  classes.
-  */
-template <typename FT, int veclen, int soalen, bool compress12>
-class Dslash {
- public:
-  typedef ::QPhiX::Geometry<FT, veclen, soalen, compress12> Geom;
-  typedef typename Geom::FourSpinorBlock Spinor;
-  typedef typename Geom::SU3MatrixBlock SU3MatrixBlock;
-
-  explicit Dslash(Geom *geom, double const t_boundary_, double const aniso_coeff_S_,
-                  double const aniso_coeff_T_, double const mass_, bool use_tbc_[4] = nullptr,
-                  double tbc_phases_[4][2] = nullptr)
-      : geom(geom),
-        t_boundary(t_boundary_),
-        aniso_coeff_S(aniso_coeff_S_),
-        aniso_coeff_T(aniso_coeff_T_),
-        mass(mass_) {}
-
-  /**
-    Computes \f$ \psi_\mathrm o = A_\mathrm{oo} \chi_\mathrm o \f$.
-
-    The actual definition of the matrix \f$ A_\mathrm{oo} \f$ is
-    implementation dependent and can be the mass factor \f$ \alpha = 4 + m
-    \f$ for plain Wilson or something more complicated for twisted mass.
-
-    \param[out] out Output spinor \f$ \psi \f$.
-    \param[in] in Input spinor \f$ \chi \f$.
-    */
-  virtual void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) = 0;
-
-  /**
-    Computes \f$ \psi_\mathrm e = A_\mathrm{ee}^{-1} \chi_\mathrm e \f$.
-
-    \param[out] out Output spinor \f$ \psi \f$.
-    \param[in] in Input spinor \f$ \chi \f$.
-    */
-  virtual void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
-                         int const cb) = 0;
-
-  /**
-    Forwarder for the `dslash`.
-
-    This will call the `dslash` function of the respective QPhiX dslash class. There is a subtle
-    difference between the Wilson and all other cases. The Wilson dslash is just the hopping matrix,
-    just the operator \f$ D \f$. For every other case (clover, twisted mass, twisted mass clover),
-    the `dslash` member function will compute \f$ A^{-1} D \f$. In the Wilson case, this \f$ A =
-    \alpha = 4 + m = 1/(2 \kappa) \f$. Since that is _not_ included in the Wilson `dslash`, you will
-    obtain different results when using WilsonDslash::dslash and WilsonTMDslash::dslash with \f$
-    \mu = 0 \f$.
-
-    \todo Make this member function `const`. For this the member function in
-    QPhiX that is called internally must be marked `const` as well.
-    */
-  virtual void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-                      int const isign, int const cb) = 0;
-
-  /**
-    Always plain Wilson dslash.
-
-    In contrast to the \ref dslash member function which just forwards the implementation of QPhiX,
-    this will always give you the “naked” plain Wilson dslash without any factors of \f$ A^{-1} \f$
-    applied.
-    */
-  virtual void plain_dslash(Spinor *const res, const Spinor *const psi,
-                            const SU3MatrixBlock *const u, int const isign, int const cb) {
-    // XXX Perhaps rather implement this with an instance of the WilsonDslash instead?
-
-    auto tmp = QPhiX::makeFourSpinorHandle(*geom);
-    dslash(tmp.get(), psi, u, isign, cb);
-    A_chi(res, tmp.get(), isign, cb);
-  };
-
-  /**
-    Always “dressed” dslash.
-
-    This computes \f$ A^{-1} D \f$ for all variants. In the Wilson case, this will give \f$
-    \alpha^{-1} D \f$.
-    */
-  virtual void A_inv_dslash(Spinor *const res, const Spinor *const psi,
-                            const SU3MatrixBlock *const u, int const isign, int const cb) {
-    dslash(res, psi, u, isign, cb);
-  };
-
-  /**
-    Forwarder for the `achimbdpsi`.
-
-    \todo Make this member function `const`. For this the member function in QPhiX that is called
-    internally must be marked `const` as well.
-    */
-  virtual void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                          const SU3MatrixBlock *const u, double const alpha, double const beta,
-                          int const isign, int const cb) = 0;
-
-  /**
-    Prepares the sources on the odd checkerboard.
-
-    This computes
-    \f[
-        \tilde b_o = \frac 12 D_{oe} M_{ee}^{-1} b_e + b_o \,.
-    \f]
-
-    \param[out] tilde_b_odd Prepared source
-    \param[in] b_even Source (right hand side) on the even lattice sites
-    \param]in] b_odd Source on the odd lattice sites
-    \param[in] u Gauge field on the odd lattice sites
-    */
-  virtual void prepare_source(Spinor *const tilde_b_odd, Spinor const *const b_even,
-                              Spinor const *const b_odd, SU3MatrixBlock const *const u);
-
-  /**
-    Reconstructs the solution on the even lattices sites.
-
-    This computes
-    \f[
-        x_e = M_{ee}^{-1} \left( b_e - \frac 12 D_{eo} x_o \right) \,.
-    \f]
-
-    \param[out] x_even Solution on the even lattices sites
-    \param[in] b_even Source (right hand side) on the even lattice sites
-    \param[in] x_odd Solution on the odd lattices sites
-    \param[in] u Gauge field on the even lattice sites
-    */
-  virtual void reconstruct_solution(Spinor *const x_even, Spinor const *const b_even,
-                                    Spinor const *const x_odd, SU3MatrixBlock const *const u);
-
-  Geom *getGeometry() const { return geom; }
-
- private:
-  Geom *const geom;
-
-  double const t_boundary;
-  double const aniso_coeff_S;
-  double const aniso_coeff_T;
-  double const mass;
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-
-  WilsonDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
-               double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
-               bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {}
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-             int const cb_ignored) override {
-    int const n_blas_simt = 1;
-    ::QPhiX::axy(mass_factor_alpha, in, out, upstream_dslash.getGeometry(), n_blas_simt);
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-                 int const cb_ignored) override {
-    int const n_blas_simt = 1;
-    ::QPhiX::axy(1.0 / mass_factor_alpha, in, out, upstream_dslash.getGeometry(), n_blas_simt);
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, isign, cb);
-  }
-
-  void plain_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-                    int const isign, int const cb) override {
-    dslash(res, psi, u, isign, cb);
-  };
-
-  void A_inv_dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-                    int const isign, int const cb) override {
-    auto tmp = QPhiX::makeFourSpinorHandle(upstream_dslash.getGeometry());
-    dslash(tmp.get(), psi, u, isign, cb);
-    A_inv_chi(res, tmp.get(), isign, cb);
-  };
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
-  }
-
- private:
-  ::QPhiX::Dslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonTMDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-
-  WilsonTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_, double const t_boundary_,
-                 double const aniso_coeff_S_, double const aniso_coeff_T_, double const mass_,
-                 double const twisted_mass_, bool use_tbc_[4] = nullptr,
-                 double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, mass_, twisted_mass_,
-                        use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(0.25),
-        derived_mu(twisted_mass_ / mass_factor_alpha),
-        derived_mu_inv(mass_factor_alpha /
-                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {}
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign,
-             int const cb_ignored) override {
-    helper_A_chi(out, in, -derived_mu * isign, mass_factor_alpha);
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
-                 int const cb_ignored) override {
-    helper_A_chi(out, in, derived_mu * isign, derived_mu_inv);
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, isign, cb);
-  }
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, alpha, beta, isign, cb);
-  }
-
- private:
-  void helper_A_chi(Spinor *const out, Spinor const *const in, double const factor_a,
-                    double const factor_b);
-
-  ::QPhiX::TMDslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-  double const derived_mu;
-  double const derived_mu_inv;
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonClovDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock CloverBlock;
-
-  WilsonClovDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
-                   double const t_boundary_, double const aniso_coeff_S_,
-                   double const aniso_coeff_T_, double const mass_,
-                   CloverBlock *const (&clover_)[2], CloverBlock *const (&inv_clover_)[2],
-                   bool use_tbc_[4] = nullptr, double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(1.0 / (4.0 * mass_factor_alpha)) {
-    for (int cb : {0, 1}) {
-      clover[cb] = clover_[cb];
-      inv_clover[cb] = inv_clover_[cb];
-    }
-  }
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-             int const cb) override {
-    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign_ignored,
-                 int const cb) override {
-    clover_product(out, in, inv_clover[cb], upstream_dslash.getGeometry());
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, inv_clover[cb], isign, cb);
-  }
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb], mass_factor_beta, isign, cb);
-  }
-
- private:
-  ::QPhiX::ClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-
-  /**
-    Reference to the clover term.
-
-    This class has to provide a `dslash` and `achimbdpsi` member function with the prescribed
-    argument list which does not contain the clover term. The user of these classes should not have
-    to differentiate between non-clover and clover variants. In order to provide the function
-    signature, the clover term is a member. This means that the user has to construct a new operator
-    if the pointers to the clover field need to be changed. Seperate pointers are kept for the fields
-    on the even and odd checkerboards, hence the array dimension.
-    */
-  CloverBlock *clover[2];
-
-  /// See \ref clover.
-  CloverBlock *inv_clover[2];
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-class WilsonClovTMDslash : public Dslash<FT, veclen, soalen, compress12> {
- public:
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FourSpinorBlock Spinor;
-  typedef typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::SU3MatrixBlock SU3MatrixBlock;
-  typedef
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::FullCloverBlock FullCloverBlock;
-  typedef
-      typename ::QPhiX::Geometry<FT, veclen, soalen, compress12>::CloverBlock CloverBlock;
-
-  WilsonClovTMDslash(::QPhiX::Geometry<FT, veclen, soalen, compress12> *geom_,
-                     double const t_boundary_, double const aniso_coeff_S_,
-                     double const aniso_coeff_T_, double const mass_, double const twisted_mass_,
-                     CloverBlock *const (&clover_)[2],
-                     FullCloverBlock *const (&inv_clover_)[2][2], bool use_tbc_[4] = nullptr,
-                     double tbc_phases_[4][2] = nullptr)
-      : Dslash<FT, veclen, soalen, compress12>(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_,
-                                               mass_, use_tbc_, tbc_phases_),
-        upstream_dslash(geom_, t_boundary_, aniso_coeff_S_, aniso_coeff_T_, use_tbc_, tbc_phases_),
-        mass_factor_alpha(4.0 + mass_),
-        mass_factor_beta(0.25),
-        derived_mu(twisted_mass_ / mass_factor_alpha),
-        derived_mu_inv(mass_factor_alpha /
-                       (mass_factor_alpha * mass_factor_alpha + twisted_mass_ * twisted_mass_)) {
-    for (int cb : {0, 1}) {
-      clover[cb] = clover_[cb];
-      for (int fl : {0, 1}) {
-        inv_clover[cb][fl] = inv_clover_[cb][fl];
-      }
-    }
-  }
-
-  void A_chi(Spinor *const out, Spinor const *const in, int const isign, int const cb) override {
-    clover_product(out, in, clover[cb], upstream_dslash.getGeometry());
-    // TODO: add twisted mass here
-  }
-
-  void A_inv_chi(Spinor *const out, Spinor const *const in, int const isign,
-                 int const cb) override {
-    if (isign == -1) {
-      clover_product(out, in, inv_clover[cb][1], upstream_dslash.getGeometry());
-    } else {
-      clover_product(out, in, inv_clover[cb][0], upstream_dslash.getGeometry());
-    }
-  }
-
-  void dslash(Spinor *const res, const Spinor *const psi, const SU3MatrixBlock *const u,
-              int const isign, int const cb) override {
-    upstream_dslash.dslash(res, psi, u, (const FullCloverBlock **)inv_clover[cb], isign, cb);
-  }
-
-  void achimbdpsi(Spinor *const res, const Spinor *const psi, const Spinor *const chi,
-                  const SU3MatrixBlock *const u, double const alpha, double const beta,
-                  int const isign, int const cb) override {
-    upstream_dslash.dslashAChiMinusBDPsi(res, psi, chi, u, clover[cb],
-                                         mass_factor_beta, isign, cb);
-  }
-
- private:
-  ::QPhiX::TMClovDslash<FT, veclen, soalen, compress12> upstream_dslash;
-
-  double const mass_factor_alpha;
-  double const mass_factor_beta;
-  double const derived_mu;
-  double const derived_mu_inv;
-
-  CloverBlock *clover[2];
-  /* For twisted clover, there are two fields on each checkerboard which differ in the sign
-   * of the twisted quark mass. In effect then, the inner index can be thought of as being
-   * in flavour space while the outer index is the checkerboard index. 
-   */
-  FullCloverBlock *inv_clover[2][2];
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-void WilsonTMDslash<FT, veclen, soalen, compress12>::helper_A_chi(Spinor *const out,
-                                                                  Spinor const *const in,
-                                                                  double const factor_a,
-                                                                  double const factor_b) {
-  auto const nVecs = upstream_dslash.getGeometry().nVecs();
-  auto const Pxy = upstream_dslash.getGeometry().getPxy();
-  auto const Pxyz = upstream_dslash.getGeometry().getPxyz();
-
-  for (uint64_t t = 0; t < T; t++)
-    for (uint64_t x = 0; x < LX / 2; x++)
-      for (uint64_t y = 0; y < LY; y++)
-        for (uint64_t z = 0; z < LZ; z++) {
-          uint64_t const SIMD_vector = x / soalen;
-          uint64_t const x_internal = x % soalen;
-          uint64_t const qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
-
-          for (int color = 0; color < 3; ++color) {
-            for (int spin_block = 0; spin_block < 2; ++spin_block) {
-              // Implement the $\gamma_5$ structure.
-              auto const signed_factor_a = factor_a * (spin_block == 0 ? 1.0 : -1.0);
-
-              for (int half_spin = 0; half_spin < 2; ++half_spin) {
-                auto const four_spin = 2 * spin_block + half_spin;
-                for (int v = 0; v < soalen; ++v) {
-                  auto &out_bcs = out[qphix_idx][color][four_spin];
-                  auto const &in_bcs = in[qphix_idx][color][four_spin];
-
-                  out_bcs[re][v] = factor_b * (in_bcs[re][v] + signed_factor_a * in_bcs[im][v]);
-                  out_bcs[im][v] = factor_b * (in_bcs[im][v] - signed_factor_a * in_bcs[re][v]);
-                }
-              }
-            }
-          }
-
-        }  // volume
-};
-
-template <typename FT, int veclen, int soalen, bool compress12>
-void Dslash<FT, veclen, soalen, compress12>::prepare_source(Spinor *const tilde_b_odd,
-                                                            Spinor const *const b_even,
-                                                            Spinor const *const b_odd,
-                                                            SU3MatrixBlock const *const u) {
-  auto Mee_be = QPhiX::makeFourSpinorHandle(*geom);
-  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
-                                                            aniso_coeff_T, mass);
-
-  A_inv_chi(Mee_be.get(), b_even, 1, cb_even);
-
-  plain_dslash.dslash(tilde_b_odd, Mee_be.get(), u, 1, cb_odd);
-
-  // FIXME Perhaps use a variable number of BLAS threads here (last parameter).
-  QPhiX::aypx(0.5, Mee_be.get(), tilde_b_odd, *geom, 1);
-}
-
-template <typename FT, int veclen, int soalen, bool compress12>
-void Dslash<FT, veclen, soalen, compress12>::reconstruct_solution(Spinor *const x_even,
-                                                                  Spinor const *const b_even,
-                                                                  Spinor const *const x_odd,
-                                                                  SU3MatrixBlock const *const u) {
-  auto tmp = QPhiX::makeFourSpinorHandle(*geom);
-  WilsonDslash<FT, veclen, soalen, compress12> plain_dslash(geom, t_boundary, aniso_coeff_S,
-                                                            aniso_coeff_T, mass);
-
-  plain_dslash.dslash(tmp.get(), x_odd, u, 1, cb_even);
-  QPhiX::aypx(0.5, b_even, tmp.get(), *geom, 1);
-  A_inv_chi(x_even, tmp.get(), 1, cb_even);
-}
-}
diff --git a/qphix_interface.cpp b/qphix_interface.cpp
deleted file mode 100644
index 2c61427dd..000000000
--- a/qphix_interface.cpp
+++ /dev/null
@@ -1,2192 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2015 Mario Schroeck
- *               2016 Peter Labus
- *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- ***********************************************************************/
-
-#include "qphix_interface.h"
-#include "qphix_interface.hpp"
-#include "qphix_interface_utils.hpp"
-#include "qphix_types.h"
-#include "qphix_veclen.h"
-
-#ifdef TM_USE_MPI
-#include <mpi.h>
-#endif
-
-extern "C" {
-#ifdef HAVE_CONFIG_H
-#include "tmlqcd_config.h"
-#endif
-#include "boundary.h"
-#include "geometry_eo.h"
-#include "gettime.h"
-#include "global.h"
-#include "linalg/convert_eo_to_lexic.h"
-#include "linalg/diff.h"
-#include "linalg/square_norm.h"
-#include "misc_types.h"
-#include "operator/Hopping_Matrix.h"
-#include "operator/clover_leaf.h"
-#include "operator/clovertm_operators.h"
-#include "operator_types.h"
-#include "struct_accessors.h"
-
-// for the normalisation of the heavy doublet when running
-// RHMC
-#include "phmc.h"
-
-#include "solver/matrix_mult_typedef.h"
-#include "solver/solver.h"
-#include "solver/solver_field.h"
-#include "solver/solver_params.h"
-#include "solver/solver_types.h"
-#include "start.h"
-#include "xchange/xchange_gauge.h"
-}
-#ifdef TM_USE_OMP
-#include <omp.h>
-#endif
-#include <qphix/blas_new_c.h>
-#include <qphix/clover.h>
-#include <qphix/inv_dummy_hermtest.h>
-#include <qphix/inv_richardson_multiprec.h>
-#include <qphix/invbicgstab.h>
-#include <qphix/invcg.h>
-#include <qphix/minvcg.h>
-#include <qphix/ndtm_reuse_operator.h>
-#include <qphix/ndtm_reuse_operator_clover.h>
-#include <qphix/print_utils.h>
-#include <qphix/qphix_config.h>
-#include <qphix/twisted_mass.h>
-#include <qphix/twisted_mass_clover.h>
-#include <qphix/wilson.h>
-#include <cfloat>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-
-using namespace tmlqcd;
-
-tm_QPhiXParams_t qphix_input;
-
-int By;
-int Bz;
-int NCores;
-int Sy;
-int Sz;
-int PadXY;
-int PadXYZ;
-int MinCt;
-int N_simt;
-bool compress12;
-QphixPrec_t qphix_precision;
-QphixPrec_t qphix_inner_precision;
-
-int subLattSize[4];
-int lattSize[4];
-int qmp_geom[4];
-int qmp_tm_map[4];
-
-// angles for boundary phases, values come from read_input
-extern double X0, X1, X2, X3;
-
-bool use_tbc[4];
-double tbc_phases[4][2];
-// we always use twisted boundary conditions, which means that we are always
-// periodic in time and any possible anti-periodicity is implemented via
-// the phase
-double constexpr t_boundary = 1.0;
-
-template <typename T>
-struct rsdTarget {
-  static const double value;
-};
-
-template <>
-const double rsdTarget<QPhiX::half>::value = 1.0e-3;
-
-template <>
-const double rsdTarget<float>::value = 1.0e-8;
-
-void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_,
-                QphixPrec_t inner_precision_) {
-  static bool qmp_topo_initialised = false;
-
-  // Global Lattice Size
-  lattSize[0] = LX * g_nproc_x;
-  lattSize[1] = LY * g_nproc_y;
-  lattSize[2] = LZ * g_nproc_z;
-  lattSize[3] = T * g_nproc_t;
-
-  // Local Lattice Size
-  subLattSize[0] = LX;
-  subLattSize[1] = LY;
-  subLattSize[2] = LZ;
-  subLattSize[3] = T;
-
-  // extract twisted boundary conditions
-  for (int dim = 0; dim < 4; dim++) {
-    bool dim_tbc = false;
-    double dim_phase[2] = {1.0, 0.0};
-    if (dim == 0) {
-      dim_tbc = (fabs(X1) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_1))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_1))[1] / g_kappa;
-    } else if (dim == 1) {
-      dim_tbc = (fabs(X2) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_2))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_2))[1] / g_kappa;
-    } else if (dim == 2) {
-      dim_tbc = (fabs(X3) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_3))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_3))[1] / g_kappa;
-    } else if (dim == 3) {
-      dim_tbc = (fabs(X0) > DBL_EPSILON);
-      dim_phase[0] = -((double *)(&phase_0))[0] / g_kappa;
-      dim_phase[1] = -((double *)(&phase_0))[1] / g_kappa;
-    }
-    use_tbc[dim] = dim_tbc;
-    tbc_phases[dim][0] = dim_phase[0];
-    tbc_phases[dim][1] = dim_phase[1];
-  }
-
-  By = params.By;
-  Bz = params.Bz;
-  NCores = params.NCores;
-  Sy = params.Sy;
-  Sz = params.Sz;
-  PadXY = params.PadXY;
-  PadXYZ = params.PadXYZ;
-  MinCt = params.MinCt;
-  N_simt = Sy * Sz;
-  if (c12 == 8) {
-    QPhiX::masterPrintf(
-        "# INFO QphiX: 8-parameter gauge compression not supported, using two row compression "
-        "instead!\n");
-    c12 = 12;
-  }
-  compress12 = c12 == 12 ? true : false;
-  qphix_precision = precision_;
-  qphix_inner_precision = inner_precision_;
-
-#ifdef QPHIX_QMP_COMMS
-  // Declare the logical topology
-  if (!qmp_topo_initialised) {
-    // the QMP topology is the one implied by the number of processes in each
-    // dimension as required by QPHIX ( x fastest to t slowest running )
-    qmp_geom[0] = g_nproc_x;
-    qmp_geom[1] = g_nproc_y;
-    qmp_geom[2] = g_nproc_z;
-    qmp_geom[3] = g_nproc_t;
-
-    // in order for the topologies to agree between tmLQCD and QPhiX, the dimensions need to be
-    // permuted
-    // since Z is fastest in tmLQCD and X is second-slowest
-    qmp_tm_map[0] = 2;
-    qmp_tm_map[1] = 1;
-    qmp_tm_map[2] = 0;
-    qmp_tm_map[3] = 3;
-    if (QMP_declare_logical_topology_map(qmp_geom, 4, qmp_tm_map, 4) != QMP_SUCCESS) {
-      QMP_error("Failed to declare QMP Logical Topology\n");
-      abort();
-    }
-    // longish test to check if the logical coordinates are correctly mapped
-    if (g_debug_level >= 5) {
-      for (int proc = 0; proc < g_nproc; proc++) {
-        if (proc == g_proc_id) {
-          const int coordinates[4] = {g_proc_coords[1], g_proc_coords[2], g_proc_coords[3],
-                                      g_proc_coords[0]};
-          int id = QMP_get_node_number_from(coordinates);
-          int *qmp_coords = QMP_get_logical_coordinates_from(id);
-          fflush(stdout);
-          printf("QMP id: %3d x:%3d y:%3d z:%3d t:%3d\n", id, qmp_coords[0], qmp_coords[1],
-                 qmp_coords[2], qmp_coords[3]);
-          printf("MPI id: %3d x:%3d y:%3d z:%3d t:%3d\n\n", g_proc_id, g_proc_coords[1],
-                 g_proc_coords[2], g_proc_coords[3], g_proc_coords[0]);
-          free(qmp_coords);
-          fflush(stdout);
-          MPI_Barrier(MPI_COMM_WORLD);
-        } else {
-          MPI_Barrier(MPI_COMM_WORLD);
-        }
-      }
-    }
-    qmp_topo_initialised = true;
-  }
-#endif
-
-#ifdef QPHIX_QPX_SOURCE
-  if (thread_bind) {
-    QPhiX::setThreadAffinity(NCores_user, Sy_user * Sz_user);
-  }
-  QPhiX::reportAffinity();
-#endif
-}
-
-void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPrec_t precision_) {
-  _initQphix(argc, argv, params, c12, precision_, precision_);
-}
-
-// Finalize the QPhiX library
-void _endQphix() {}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_clover_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::CloverBlock *qphix_clover, int cb,
-    bool inverse, bool fl_offdiag = false) {
-  const double startTime = gettime();
-
-  /* the spin-colour clover term in sw_term and the corresponding inverse
-   * in sw_inv are stored in the tmLQCD gamma basis.
-   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
-   * spinor and then apply the same transformation to the output spinor
-   * ( we have V^dagger = V and V*V = 1 )
-   * Thus, in order to translate the clover field, we need to copy
-   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
-   * This way, the clover term will be in the correct gamma basis.
-   *
-   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
-   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
-   * Below, each sAB represents one 3x3 colour matrix
-   *
-   *                +s33 -s32    0    0
-   *  T' = V*T*V =  -s23 +s22    0    0
-   *                   0    0 +s11 -s10
-   *                   0    0 -s01 +s00
-   *
-   * Such that the half-spinor blocks are inverted and within these, the ordering is
-   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
-   * each other and this is preserved by the transformation.
-   *
-   * The QPhiX (Wilson) clover term is stored as 12 reals on the diagonal
-   * in two 6-element vectors, one for each half-spinor spin pair
-   * and two sets of off-diagonal complex components.
-   *
-   * In addition, colour matrices are transposed in QPhiX.
-   *
-   * The tmLQCD clover term is stored as:
-   *
-   *      s00 s01
-   *          s11
-   * T =          s22 s23
-   *                  s33
-   *
-   * with indexing
-   *
-   *     sw[0][0] sw[1][0]
-   *              sw[2][0]
-   *                       sw[0][1] sw[1][1]
-   *                                sw[2][1]
-   *
-   * The inverse has four su3 blocks instead and is indexed
-   *     sw_inv[0][0] sw_inv[1][0]
-   *     sw_inv[3][0] sw_inv[2][0]
-   *                               sw_inv[0][1] sw_inv[1][1]
-   *                               sw_inv[3][1] sw_inv[2][1]
-   *
-   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0
-   *
-   * There is a special case for the non-degenerate twisted clover operator. The
-   * flavour-off-diagonal components of the inverse clover term do not have an imaginary part on the
-   * spin-colour diagonal. They can thus be stored as CloverBlock, which is done in the QPhiX
-   * implementation of the ND tmclover operator.
-   *
-   * As a hack, this inverse is prepared by sw_invert_epsbar and placed in to the last
-   * VOLUME/2 sites of sw_inv. Reading from there is triggered by the boolean
-   * fl_offdiag.
-   */
-
-  // rescale to get clover term (or its inverse) in the physical normalisation
-  // rather than the kappa normalisation
-  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
-  su3 ***tm_clover = inverse ? sw_inv : sw;
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto ngy = geom.nGY();
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // packer for Wilson clover (real diagonal + complex upper-triangular)
-  /* for the index in the off_diagN arrays, we map to an index in the su3 struct
-   * keeping in mind complex conjugation
-   * The off-diagonal in QPhiX is stored as follows:
-   *
-   * 0 1 3 6 10
-   *   2 4 7 11
-   *     5 8 12
-   *       9 13
-   *         14
-   *
-   * which we are going to map to su3 in blocks
-   *
-   *     0* 1*
-   *        2*
-   *
-   * 3   4  5
-   * 6   7  8
-   * 10 11 12
-   *
-   *   9* 13*
-   *      14*
-   *
-   * where the asterisk indicates complex conjugation. As a linear array then,
-   * these mappings are:
-   *
-   */
-  const int od_su3_offsets[15] = {Nz,
-                                  2 * Nz,            //     0 1
-                                  Nc * Nz + 2 * Nz,  //       2
-
-                                  0,
-                                  Nz,
-                                  2 * Nz,  // 3  4  5
-                                  Nc * Nz,
-                                  Nc * Nz + Nz,
-                                  Nc * Nz + 2 * Nz,  // 6  7  8
-
-                                  Nz,  //     9
-
-                                  2 * Nc * Nz,
-                                  2 * Nc * Nz + Nz,
-                                  2 * Nc * Nz + 2 * Nz,  // 10 11 12
-
-                                  2 * Nz,
-                                  Nc * Nz + 2 * Nz};  // 13 14
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
-
-          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
-            int64_t xx = (y % ngy) * SOALEN + x_soa;
-            int64_t q_cb_x_coord = x_soa + v * SOALEN;
-            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-
-            // the inverse of the clover term is in even-odd ordering
-            // while the clover term itself is lexicographically ordered
-            // for the special case of the nd tmclover operator, the inverse of the flavour
-            // off-diagonal components is stored in the last VOLUME/2 elements of sw_inv
-            int64_t tm_idx =
-                (inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z]) +
-                ((inverse && fl_offdiag) ? VOLUME / 2 : 0);
-
-            int b_idx;
-
-            //             we begin with the diagonal elements in CloverBlock
-            for (int d = 0; d < 6; d++) {
-              //               choose the block in sw which corresponds to the block in T'
-              b_idx = d < 3 ? 2 : 0;
-              //               get the right colour components
-              qphix_clover[block].diag1[d][xx] = QPhiX::rep<FT, double>(
-                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                    (Nc * Nz + Nz) * (d % 3)) *
-                  scale);
-
-              qphix_clover[block].diag2[d][xx] = QPhiX::rep<FT, double>(
-                  *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                    (Nc * Nz + Nz) * (d % 3)) *
-                  scale);
-            }
-
-            b_idx = 2;  // s33 and s11
-            for (int od : {0, 1, 2}) {
-              for (int reim : {0, 1}) {
-                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-
-                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-              }
-            }
-
-            b_idx = 1;  // s32 and s10
-            for (int od : {3, 4, 5, 6, 7, 8, 10, 11, 12}) {
-              for (int reim : {0, 1}) {
-                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                      od_su3_offsets[od] + reim) *
-                    (-scale));
-
-                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                      od_su3_offsets[od] + reim) *
-                    (-scale));
-              }
-            }
-
-            b_idx = 0;  // s22 and s00
-            for (int od : {9, 13, 14}) {
-              for (int reim : {0, 1}) {
-                qphix_clover[block].off_diag1[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][1].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-
-                qphix_clover[block].off_diag2[od][reim][xx] = QPhiX::rep<FT, double>(
-                    (reim == 1 ? -1.0 : 1.0) *
-                    *(reinterpret_cast<double const *const>(&tm_clover[tm_idx][b_idx][0].c00) +
-                      od_su3_offsets[od] + reim) *
-                    scale);
-              }
-            }
-
-          }  // x_soa
-        }  // for(v)
-      }  // for(y)
-    }  // for(z)
-  }  // for(t)
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf(
-        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (CloverBlock): %f secs\n",
-        diffTime);
-  }
-}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_clover_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FullCloverBlock *qphix_clover[2],
-    int cb, bool inverse) {
-  const double startTime = gettime();
-
-  /* the spin-colour clover term in sw_term and the corresponding inverse
-   * in sw_inv are stored in the tmLQCD gamma basis.
-   * When we translate spinors to QPhiX, we apply a transformation V to the tmLQCD
-   * spinor and then apply the same transformation to the output spinor
-   * ( we have V^dagger = V and V*V = 1 )
-   * Thus, in order to translate the clover field, we need to copy
-   *   (1+T)' = V*(1+T)*V, where T is the spin-colour clover-term
-   * This way, the clover term will be in the correct gamma basis.
-   *
-   * The tmLQCD clover term is stored in half-spinor blocks of colour matrices
-   * for which we need to work out what (1+T)'=V*(1+T)*V implies.
-   * Below, each sAB represents one 3x3 colour matrix
-   *
-   *                +s33 -s32    0    0
-   *  T' = V*T*V =  -s23 +s22    0    0
-   *                   0    0 +s11 -s10
-   *                   0    0 -s01 +s00
-   *
-   * Such that the half-spinor blocks are inverted and within these, the ordering is
-   * reversed. Note that the off-diagonal 3x3 colour blocks are hermitian conjugate to
-   * each other and this is preserved by the transformation.
-   *
-   * The QPhiX (tmclover) clover term and its inverse are stored as a pair of full
-   * 6x6 complex matrices which are multiplied with the spinor in exactly the same way
-   * as in tmLQCD.
-   *
-   * The tmLQCD clover term is stored as:
-   *
-   *      s00 s01
-   *          s11
-   * T =          s22 s23
-   *                  s33
-   *
-   * with indexing
-   *
-   *     sw[0][0] sw[1][0]
-   *              sw[2][0]
-   *                       sw[0][1] sw[1][1]
-   *                                sw[2][1]
-   *
-   * The inverse has four su3 blocks instead and is indexed
-   *     sw_inv[0][0] sw_inv[1][0]
-   *     sw_inv[3][0] sw_inv[2][0]
-   *                               sw_inv[0][1] sw_inv[1][1]
-   *                               sw_inv[3][1] sw_inv[2][1]
-   *
-   * where blocks sw_inv[3][0] and sw_inv[3][1] are relevant only when mu > 0   *
-   */
-
-  // rescale to get clover term (or its inverse) in the physical normalisation
-  // rather than the kappa normalisation
-  const double scale = inverse ? 2.0 * g_kappa : 1.0 / (2.0 * g_kappa);
-  su3 ***tm_clover = inverse ? sw_inv : sw;
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  const double amu = g_mu / (2.0 * g_kappa);
-
-  // Geometric parameters for QPhiX data layout
-  const auto ngy = geom.nGY();
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
-
-          for (int64_t x_soa = 0; x_soa < SOALEN; x_soa++) {
-            int64_t xx = (y % ngy) * SOALEN + x_soa;
-            int64_t q_cb_x_coord = x_soa + v * SOALEN;
-            int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-
-            //             the inverse of the clover term is in even-odd ordering
-            //             while the clover term itself is lexicographically ordered
-            int64_t tm_idx =
-                inverse ? g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]] : g_ipt[t][tm_x_coord][y][z];
-
-            for (int fl : {0, 1}) {
-              if (inverse && fl == 1) {
-                // the inverse clover term for the second flavour is stored at an offset
-                tm_idx += VOLUME / 2;
-              }
-              for (int q_hs : {0, 1}) {
-                auto &hs_block =
-                    ((q_hs == 0) ? qphix_clover[fl][block].block1 : qphix_clover[fl][block].block2);
-                for (int q_sc1 = 0; q_sc1 < 6; q_sc1++) {
-                  for (int q_sc2 = 0; q_sc2 < 6; q_sc2++) {
-                    const int q_s1 = q_sc1 / 3;
-                    const int q_s2 = q_sc2 / 3;
-                    const int q_c1 = q_sc1 % 3;
-                    const int q_c2 = q_sc2 % 3;
-
-                    // invert in spin as required by V*T*V
-                    const int t_hs = 1 - q_hs;
-                    // the indices inside the half-spinor are also inverted
-                    // (which transposes them, of course)
-                    const int t_s1 = 1 - q_s1;
-                    const int t_s2 = 1 - q_s2;
-                    // carry out the mapping from T' to T, keeping in mind that for the inverse
-                    // there are four blocks also on the tmLQCD side, otherwise there are just three
-                    const int t_b_idx = t_s1 + t_s2 + ((inverse && t_s1 == 1 && t_s2 == 0) ? 2 : 0);
-                    for (int reim : {0, 1}) {
-                      hs_block[q_sc1][q_sc2][reim][xx] = QPhiX::rep<FT, double>(
-                          scale *
-                              // off-diagonal (odd-numbered) blocks change sign
-                              (t_b_idx & 1 ? (-1.0) : 1.0) *
-                              // if not doing the inverse and in the bottom-left block, need to
-                              // complex conjugate
-                              ((!inverse && (t_s1 == 1 && t_s2 == 0) && reim == 1) ? -1.0 : 1.0) *
-                              *(reinterpret_cast<double const *const>(
-                                    &(tm_clover[tm_idx][t_b_idx][t_hs].c00)) +
-                                // if not doing the inverse and in the bottom-left block, transpose
-                                // in colour
-                                // because we're actually reading out of the top-right block
-                                Nz * ((!inverse && (t_s1 == 1 && t_s2 == 0)) ? Nc * q_c2 + q_c1
-                                                                             : Nc * q_c1 + q_c2) +
-                                reim) +
-                          // in the QPhiX gamma basis, the twisted quark mass enters with the
-                          // opposite
-                          // sign for consistency
-                          ((!inverse && q_sc1 == q_sc2 && q_hs == 0 && reim == 1)
-                               ? -amu * (1 - 2 * fl)
-                               : 0) +
-                          ((!inverse && q_sc1 == q_sc2 && q_hs == 1 && reim == 1)
-                               ? amu * (1 - 2 * fl)
-                               : 0));
-                    }
-                  }  // q_sc2
-                }  // q_sc1
-              }  // q_hs
-            }  // fl
-
-          }  // x_soa
-        }  // for(v)
-      }  // for(y)
-    }  // for(z)
-  }  // for(t)
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf(
-        "# QPHIX-interface: time spent in reorder_clover_to_QPhiX (FullCloverBlock): %f secs\n",
-        diffTime);
-  }
-}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_gauge_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb0,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::SU3MatrixBlock *qphix_gauge_cb1) {
-  const double startTime = gettime();
-
-  // Number of elements in spin, color & complex
-  // Here c1 is QPhiX's outer color, and c2 the inner one
-  const int Ns = 4;
-  const int Nc1 = compress12 ? 2 : 3;
-  const int Nc2 = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto ngy = geom.nGY();
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // This is needed to translate between the different
-  // orderings of the direction index "\mu" in tmlQCD
-  // and QPhiX, respectively
-  // in qphix, the Dirac operator is applied in the order
-  //   -+x -> -+y -> -+z -> -+t
-  // while tmlqcd does
-  //   -+t -> -+x -> -+y -> -+z
-  // same as the lattice ordering
-  // The mappingn between the application dimensions is thus:
-  //  tmlqcd_dim(t(0) -> x(1) -> y(2) -> z(3)) = qphix_dim( t(3) -> x(0) -> y(1) -> z(2) )
-  const int change_dim[4] = {1, 2, 3, 0};
-
-  // Get the base pointer for the (global) tmlQCD gauge field
-  xchange_gauge(g_gauge_field);
-  const double *in = reinterpret_cast<double *>(&g_gauge_field[0][0].c00);
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++)
-    for (int64_t z = 0; z < LZ; z++)
-      for (int64_t y = 0; y < LY; y++)
-        for (int64_t v = 0; v < nVecs; v++) {
-          int64_t block = (t * Pxyz + z * Pxy) / ngy + (y / ngy) * nVecs + v;
-
-          for (int dim = 0; dim < 4; dim++)     // dimension == QPhiX \mu
-            for (int c1 = 0; c1 < Nc1; c1++)    // QPhiX convention color 1 (runs up to 2 or 3)
-              for (int c2 = 0; c2 < Nc2; c2++)  // QPhiX convention color 2 (always runs up to 3)
-                for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
-                  int64_t xx = (y % ngy) * SOALEN + x_soa;
-                  int64_t q_cb_x_coord = x_soa + v * SOALEN;
-                  int64_t tm_x_coord_cb0 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 0);
-                  int64_t tm_x_coord_cb1 = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ 1);
-
-                  int64_t tm_idx_cb0;
-                  int64_t tm_idx_cb1;
-
-                  // backward / forward
-                  for (int dir = 0; dir < 2; dir++) {
-                    if (dir == 0) {
-                      tm_idx_cb0 = g_idn[g_ipt[t][tm_x_coord_cb0][y][z]][change_dim[dim]];
-                      tm_idx_cb1 = g_idn[g_ipt[t][tm_x_coord_cb1][y][z]][change_dim[dim]];
-                    } else {
-                      tm_idx_cb0 = g_ipt[t][tm_x_coord_cb0][y][z];
-                      tm_idx_cb1 = g_ipt[t][tm_x_coord_cb1][y][z];
-                    }
-                    for (int reim = 0; reim < Nz; reim++) {
-                      // Note:
-                      // -----
-                      // 1. \mu in QPhiX runs from 0..7 for all eight neighbouring
-                      // links.
-                      //    Here, the ordering of the direction (backward/forward)
-                      //    is the same
-                      //    for tmlQCD and QPhiX, but we have to change the
-                      //    ordering of the dimensions.
-                      int q_mu = 2 * dim + dir;
-
-                      qphix_gauge_cb0[block][q_mu][c1][c2][reim][xx] =
-                          QPhiX::rep<FT, double>(su3_get_elem(
-                              &(g_gauge_field[tm_idx_cb0][change_dim[dim]]), c2, c1, reim));
-                      qphix_gauge_cb1[block][q_mu][c1][c2][reim][xx] =
-                          QPhiX::rep<FT, double>(su3_get_elem(
-                              &(g_gauge_field[tm_idx_cb1][change_dim[dim]]), c2, c1, reim));
-                    }
-                  }
-                }  // for(dim,c1,c2,x_soa)
-        }  // outer loop (t,z,y,v)
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_gauge_to_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-// Reorder tmLQCD eo-spinor to a FourSpinorBlock QPhiX spinor on the given checkerboard
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_eo_spinor_to_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor const *const tm_eo_spinor,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
-    const int cb) {
-  const double startTime = gettime();
-
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-  const auto Nxh = geom.Nxh();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  // (note, this is a 4x4 matrix with 4 non-zero elements)
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          for (int col = 0; col < Nc; col++) {
-            for (int q_spin = 0; q_spin < Ns; q_spin++) {
-              for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
-                int64_t q_ind = t * Pxyz + z * Pxy + y * nVecs + v;
-                int64_t q_cb_x_coord = v * SOALEN + x_soa;
-                // when t+y+z is odd and we're on an odd (1) checkerboard OR
-                // when t+y+z is even and we're on an even (0) checkerboard
-                // the full x coordinate is 2*x_cb
-                // otherwise, it is 2*x_cb+1
-                int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-                // exchange x and z dimensions
-                int64_t tm_eo_ind = g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]];
-
-                for (int reim = 0; reim < 2; reim++) {
-                  qphix_spinor[q_ind][col][q_spin][reim][x_soa] = QPhiX::rep<FT, double>(
-                      change_sign[q_spin] *
-                      spinor_get_elem(&(tm_eo_spinor[tm_eo_ind]), change_spin[q_spin], col, reim));
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_to_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_eo_spinor_from_QPhiX(
-    QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom, spinor *tm_eo_spinor,
-    typename QPhiX::Geometry<FT, VECLEN, SOALEN, compress12>::FourSpinorBlock *qphix_spinor,
-    const int cb, double normFac = 1.0) {
-  const double startTime = gettime();
-
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-  const auto Nxh = geom.Nxh();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  // (note, this is a 4x4 matrix with 4 non-zero elements)
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-#pragma omp parallel for collapse(4)
-  for (int64_t t = 0; t < T; t++) {
-    for (int64_t z = 0; z < LZ; z++) {
-      for (int64_t y = 0; y < LY; y++) {
-        for (int64_t v = 0; v < nVecs; v++) {
-          for (int col = 0; col < Nc; col++) {
-            for (int q_spin = 0; q_spin < Ns; q_spin++) {
-              for (int x_soa = 0; x_soa < SOALEN; x_soa++) {
-                int64_t q_ind = t * Pxyz + z * Pxy + y * nVecs + v;
-                int64_t q_cb_x_coord = v * SOALEN + x_soa;
-                // when t+y+z is odd and we're on an odd checkerboard (1) OR
-                // when t+y+z is even and we're on an even (0) checkerboard
-                // the full x coordinate is 2*x_cb
-                // otherwise, it is 2*x_cb+1
-                int64_t tm_x_coord = q_cb_x_coord * 2 + (((t + y + z) & 1) ^ cb);
-                // exchange x and z dimensions
-                int64_t tm_eo_ind = g_lexic2eosub[g_ipt[t][tm_x_coord][y][z]];
-
-                spinor_set_elem(
-                    &(tm_eo_spinor[tm_eo_ind]), change_spin[q_spin], col,
-                    change_sign[q_spin] * normFac *
-                        QPhiX::rep<double, FT>(qphix_spinor[q_ind][col][q_spin][0][x_soa]),
-                    change_sign[q_spin] * normFac *
-                        QPhiX::rep<double, FT>(qphix_spinor[q_ind][col][q_spin][1][x_soa]));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_eo_spinor_from_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-// Reorder a full tmLQCD spinor to a cb0 and cb1 QPhiX spinor
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_spinor_to_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-                             double const *tm_spinor, FT *qphix_spinor_cb0, FT *qphix_spinor_cb1) {
-  const double startTime = gettime();
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-// This will loop over the entire lattice and calculate
-// the array and internal indices for both tmlQCD & QPhiX
-#pragma omp parallel for collapse(4)
-  for (uint64_t t = 0; t < T; t++)
-    for (uint64_t x = 0; x < LX; x++)
-      for (uint64_t y = 0; y < LY; y++)
-        for (uint64_t z = 0; z < LZ; z++) {
-          // These are the QPhiX SIMD vector in checkerboarded x direction
-          // (up to LX/2) and the internal position inside the SIMD vector
-          const uint64_t SIMD_vector = (x / 2) / SOALEN;
-          const uint64_t x_internal = (x / 2) % SOALEN;
-
-          // Calculate the array index in tmlQCD & QPhiX,
-          // given a global lattice index (t,x,y,z)
-          const uint64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
-          const uint64_t tm_idx = g_ipt[t][x][y][z];
-
-          // Calculate base point for every spinor field element (tmlQCD) or
-          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
-          // which will depend on the checkerboard (cb)
-          const double *in = tm_spinor + Ns * Nc * Nz * tm_idx;
-          FT *out;
-          if ((t + x + y + z) & 1)
-            out = qphix_spinor_cb1 + SOALEN * Nz * Nc * Ns * qphix_idx;  // odd -> cb1
-          else
-            out = qphix_spinor_cb0 + SOALEN * Nz * Nc * Ns * qphix_idx;  // even -> cb0
-
-          // Copy the internal elements, performing a gamma basis transformation
-          for (int spin = 0; spin < Ns; spin++)  // QPhiX spin index
-            for (int color = 0; color < Nc; color++)
-              for (int z = 0; z < Nz; z++)  // RE or IM
-              {
-                const uint64_t qId =
-                    x_internal + z * SOALEN + spin * SOALEN * Nz + color * SOALEN * Nz * Ns;
-                const uint64_t tId = z + color * Nz + change_spin[spin] * Nz * Nc;
-
-                out[qId] = QPhiX::rep<FT, double>(change_sign[spin] * in[tId]);
-              }
-
-        }  // volume
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_to_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-// Reorder a cb0 and cb1 QPhiX spinor to a full tmLQCD spinor
-template <typename FT, int VECLEN, int SOALEN, bool compress12>
-void reorder_spinor_from_QPhiX(QPhiX::Geometry<FT, VECLEN, SOALEN, compress12> &geom,
-                               double *tm_spinor, FT const *qphix_spinor_cb0,
-                               FT const *qphix_spinor_cb1, double normFac = 1.0) {
-  const double startTime = gettime();
-
-  // Number of elements in spin, color & complex
-  const int Ns = 4;
-  const int Nc = 3;
-  const int Nz = 2;
-
-  // Geometric parameters for QPhiX data layout
-  const auto nVecs = geom.nVecs();
-  const auto Pxy = geom.getPxy();
-  const auto Pxyz = geom.getPxyz();
-
-  // This is needed to translate between the different
-  // gamma bases tmlQCD and QPhiX are using
-  const int change_sign[4] = {1, -1, -1, 1};
-  const int change_spin[4] = {3, 2, 1, 0};
-
-// This will loop over the entire lattice and calculate
-// the array and internal indices for both tmlQCD & QPhiX
-#pragma omp parallel for collapse(4)
-  for (uint64_t t = 0; t < T; t++)
-    for (uint64_t x = 0; x < LX; x++)
-      for (uint64_t y = 0; y < LY; y++)
-        for (uint64_t z = 0; z < LZ; z++) {
-          // These are the QPhiX SIMD vector in checkerboarded x direction
-          // (up to LX/2) and the internal position inside the SIMD vector
-          const uint64_t SIMD_vector = (x / 2) / SOALEN;
-          const uint64_t x_internal = (x / 2) % SOALEN;
-
-          // Calculate the array index in tmlQCD & QPhiX,
-          // given a global lattice index (t,x,y,z)
-          const uint64_t qphix_idx = t * Pxyz + z * Pxy + y * nVecs + SIMD_vector;
-          const uint64_t tm_idx = g_ipt[t][x][y][z];
-
-          // Calculate base point for every spinor field element (tmlQCD) or
-          // for every SIMD vector of spinors, a.k.a FourSpinorBlock (QPhiX),
-          // which will depend on the checkerboard (cb)
-          const FT *in;
-          if ((t + x + y + z) & 1)
-            in = qphix_spinor_cb1 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb1
-          else
-            in = qphix_spinor_cb0 + SOALEN * Nz * Nc * Ns * qphix_idx;  // cb0
-          double *out = tm_spinor + Ns * Nc * Nz * tm_idx;
-
-          // Copy the internal elements, performing a gamma basis transformation
-          for (int spin = 0; spin < Ns; spin++)  // tmlQCD spin index
-            for (int color = 0; color < Nc; color++)
-              for (int z = 0; z < Nz; z++)  // RE or IM
-              {
-                const uint64_t qId = x_internal + z * SOALEN + change_spin[spin] * SOALEN * Nz +
-                                     color * SOALEN * Nz * Ns;
-                const uint64_t tId = z + color * Nz + spin * Nz * Nc;
-
-                out[tId] = QPhiX::rep<double, FT>(normFac * change_sign[spin] * in[qId]);
-              }
-
-        }  // volume
-
-  const double diffTime = gettime() - startTime;
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-interface: time spent in reorder_spinor_from_QPhiX: %f secs\n",
-                        diffTime);
-  }
-}
-
-template <typename FT, int V, int S, bool compress12, typename FT_inner, int V_inner, int S_inner,
-          bool compress12_inner>
-void pack_nd_clover(
-    QPhiX::Geometry<FT, V, S, compress12> &geom,
-    QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner> &geom_inner,
-    typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock *full_invclov[2],
-    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *invclov_odiag,
-    typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock *clov,
-    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock
-        *full_invclov_inner[2],
-    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock
-        *invclov_odiag_inner,
-    typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock *clov_inner,
-    const int cb, bool pack_inner) {
-  typedef typename QPhiX::Geometry<FT, V, S, compress12>::CloverBlock QClover;
-  typedef typename QPhiX::Geometry<FT, V, S, compress12>::FullCloverBlock QFullClover;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::CloverBlock
-      QClover_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress12_inner>::FullCloverBlock
-      QFullClover_inner;
-
-  double start = gettime();
-  reorder_clover_to_QPhiX(geom, clov, cb, false);
-  if (pack_inner) {
-    reorder_clover_to_QPhiX(geom_inner, clov_inner, cb, false);
-  }
-
-  sw_invert_epsbar(g_epsbar);
-  reorder_clover_to_QPhiX(geom, invclov_odiag, 1 - cb, true, true);
-  if (pack_inner) {
-    reorder_clover_to_QPhiX(geom_inner, invclov_odiag_inner, 1 - cb, true, true);
-  }
-
-  // no minus sign here, the difference in the sign of gamma5
-  // is taken care of internally
-  sw_invert_mubar(g_mubar);
-  reorder_clover_to_QPhiX(geom, full_invclov, 1 - cb, true);
-  if (pack_inner) {
-    reorder_clover_to_QPhiX(geom_inner, full_invclov_inner, 1 - cb, true);
-  }
-
-  sw_invert_nd(g_mubar * g_mubar - g_epsbar * g_epsbar);
-
-  if (g_debug_level > 1) {
-    QPhiX::masterPrintf("# QPHIX-inteface: ND TMClover clover-field packing took %.4lf seconds\n",
-                        gettime() - start);
-  }
-}
-
-// Due to github issue #404, the helper functions to apply the full QPhiX operator
-// are currently disabled because they conflict with the new interfaces in QPhiX
-// itself. If required, these should be rewritten to use these interfaces
-// rather than the base classes in qphix_base_classes.hpp
-
-// Apply the full QPhiX fermion matrix to checkerboarded tm spinors
-// template <typename FT, int V, int S, bool compress>
-// void Mfull_helper(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
-//                  const op_type_t op_type) {
-//  // TODO: this should use handles for gauge and spinors because these are definitely temporary
-//  // objects
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
-//  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
-//
-//  if (g_debug_level > 1) tmlqcd::printQphixDiagnostics(V, S, compress, V, S, compress);
-//
-//  double coeff_s = (FT)(1);
-//  double coeff_t = (FT)(1);
-//
-//  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
-//                                           MinCt);
-//
-//  // Wilson mass
-//  double mass = 1 / (2.0 * g_kappa) - 4;
-//
-//  tmlqcd::Dslash<FT, V, S, compress> *polymorphic_dslash;
-//
-//  QGauge *u_packed[2];
-//  QSpinor *qphix_in[2];
-//  QSpinor *qphix_out[2];
-//
-//  QClover *clover[2];
-//  QClover *inv_clover[2];
-//
-//  QFullClover *inv_fullclover[2][2];
-//
-//  QSpinor *tmp_spinor = (QSpinor *)geom.allocCBFourSpinor();
-//  for (int cb : {0, 1}) {
-//    u_packed[cb] = (QGauge *)geom.allocCBGauge();
-//    qphix_in[cb] = (QSpinor *)geom.allocCBFourSpinor();
-//    qphix_out[cb] = (QSpinor *)geom.allocCBFourSpinor();
-//    clover[cb] = nullptr;
-//    inv_clover[cb] = nullptr;
-//    for (int fl : {0, 1}) {
-//      inv_fullclover[cb][fl] = nullptr;
-//    }
-//  }
-//  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
-//
-//  if (op_type == WILSON) {
-//    polymorphic_dslash = new tmlqcd::WilsonDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, use_tbc, tbc_phases);
-//  } else if (op_type == TMWILSON) {
-//    polymorphic_dslash = new tmlqcd::WilsonTMDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), use_tbc, tbc_phases);
-//  } else if (op_type == CLOVER && fabs(g_mu) <= DBL_EPSILON) {
-//    for (int cb : {0, 1}) {
-//      clover[cb] = (QClover *)geom.allocCBClov();
-//      inv_clover[cb] = (QClover *)geom.allocCBClov();
-//
-//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
-//      sw_invert(cb, 0);
-//      reorder_clover_to_QPhiX(geom, inv_clover[cb], cb, true);
-//    }
-//
-//    polymorphic_dslash = new tmlqcd::WilsonClovDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, clover, inv_clover, use_tbc, tbc_phases);
-//
-//  } else if (op_type == CLOVER && fabs(g_mu) > DBL_EPSILON) {
-//    for (int cb : {0, 1}) {
-//      clover[cb] = (QClover *)geom.allocCBClov();
-//      for (int fl : {0, 1}) {
-//        inv_fullclover[cb][fl] = (QFullClover *)geom.allocCBFullClov();
-//      }
-//      reorder_clover_to_QPhiX(geom, clover[cb], cb, false);
-//      sw_invert(cb, g_mu);
-//      reorder_clover_to_QPhiX(geom, inv_fullclover[cb], cb, true);
-//    }
-//
-//    polymorphic_dslash = new tmlqcd::WilsonClovTMDslash<FT, V, S, compress>(
-//        &geom, t_boundary, coeff_s, coeff_t, mass, -g_mu / (2.0 * g_kappa), clover,
-//        inv_fullclover, use_tbc, tbc_phases);
-//
-//  } else {
-//    QPhiX::masterPrintf("tmlqcd::Mfull_helper; No such operator type: %d\n", op_type);
-//    abort();
-//  }
-//
-////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
-////                              qphix_in[cb_even], cb_even);
-////   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in),
-/// qphix_in[cb_odd], /                              cb_odd);
-//  reorder_eo_spinor_to_QPhiX(geom, Even_in,
-//                             qphix_in[cb_even], cb_even);
-//  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_in[cb_odd],
-//                             cb_odd);
-//  // Apply QPhiX Mfull
-//  polymorphic_dslash->plain_dslash(qphix_out[cb_odd], qphix_in[cb_even], u_packed[cb_odd],
-//                                   /* isign == non-conjugate */ 1, cb_odd);
-//  polymorphic_dslash->plain_dslash(qphix_out[cb_even], qphix_in[cb_odd], u_packed[cb_even],
-//                                   /* isign == non-conjugate */ 1, cb_even);
-//  for (int cb : {0, 1}) {
-//    polymorphic_dslash->A_chi(tmp_spinor, qphix_in[cb], 1, cb);
-//    QPhiX::aypx(-0.5, tmp_spinor, qphix_out[cb], geom, 1);
-//  }
-//
-//  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_out[cb_even],
-//                               cb_even, 2.0 * g_kappa);
-//  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_out[cb_odd], cb_odd,
-//                               2.0 * g_kappa);
-//
-//  geom.free(tmp_spinor);
-//  for (int cb : {0, 1}) {
-//    geom.free(u_packed[cb]);
-//    geom.free(qphix_in[cb]);
-//    geom.free(qphix_out[cb]);
-//    geom.free(clover[cb]);
-//    geom.free(inv_clover[cb]);
-//    for (int fl : {0, 1}) {
-//      geom.free(inv_fullclover[cb][fl]);
-//    }
-//  };
-//  delete (polymorphic_dslash);
-//}
-
-// Templated even-odd preconditioned solver using QPhiX Library
-template <typename FT, int V, int S, bool compress, typename FT_inner = FT, int V_inner = V,
-          int S_inner = S, bool compress_inner = compress>
-int invert_eo_qphix_helper(std::vector<std::vector<spinor *> > &tmlqcd_odd_out,
-                           std::vector<std::vector<spinor *> > &tmlqcd_odd_in,
-                           const double target_precision, const int max_iter, const int solver_flag,
-                           solver_params_t solver_params, const int num_flavour) {
-  // TODO: it would perhaps be beneficial to keep the fields resident
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::SU3MatrixBlock QGauge;
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::FourSpinorBlock QSpinor;
-  typedef typename QPhiX::FourSpinorHandle<FT, V, S, compress> QSpinorHandle;
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::CloverBlock QClover;
-  typedef typename QPhiX::Geometry<FT, V, S, compress>::FullCloverBlock QFullClover;
-
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::SU3MatrixBlock
-      QGauge_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FourSpinorBlock
-      QSpinor_inner;
-  typedef typename QPhiX::FourSpinorHandle<FT_inner, V_inner, S_inner, compress_inner>
-      QSpinorHandle_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::CloverBlock
-      QClover_inner;
-  typedef typename QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner>::FullCloverBlock
-      QFullClover_inner;
-
-  /************************
-   *                      *
-   *    SETUP GEOMETRY    *
-   *                      *
-   ************************/
-
-  if (g_debug_level > 1) {
-    tmlqcd::printQphixDiagnostics(V, S, compress, V_inner, S_inner, compress_inner);
-  }
-
-  QPhiX::Geometry<FT, V, S, compress> geom(subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ,
-                                           MinCt);
-
-  // we always create the inner geometry, the overhead should be small...
-  QPhiX::Geometry<FT_inner, V_inner, S_inner, compress_inner> geom_inner(
-      subLattSize, By, Bz, NCores, Sy, Sz, PadXY, PadXYZ, MinCt);
-
-  // Set number of BLAS threads by hand.
-  // In case some implements the tune routines in QPhiX
-  // this may be updated...
-  QPhiX::masterPrintf("# Setting number of BLAS threads...\n");
-  const int n_blas_simt = N_simt;
-  QPhiX::masterPrintf("# ...done.\n");
-
-  // Anisotropy Coefficents
-  const double coeff_s = 1.0;
-  const double coeff_t = 1.0;
-
-  // The Wilson mass
-  const double mass = 1.0 / (2.0 * g_kappa) - 4.0;
-
-  // Set variables need for solve
-  bool verbose = g_debug_level > 2 ? true : false;
-  int niters = -1;
-  int niters2 = 0;
-  double rsd_final = -1.0;
-  uint64_t site_flops = 0;
-  uint64_t site_flops2 = 0;
-  uint64_t mv_apps = 0;
-  uint64_t mv_apps2 = 0;
-
-  double start_time;
-  double end_time;
-
-  // support for multi-shift solves via the length of the output vector,
-  // which counts the shifts on the outer index and the flavour on the inner index
-  const int num_shifts = tmlqcd_odd_out.size();
-  std::vector<double> shifts;
-  shifts.resize(num_shifts);
-  std::vector<double> RsdTargetArr;
-  RsdTargetArr.resize(num_shifts);
-  std::vector<double> RsdFinalArr;
-  RsdFinalArr.resize(num_shifts);
-
-  double rescale = 0.5 / g_kappa;
-  // the inverse of M M^dag, as required for the HMC, comes with a factor of alpha^2
-  if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-    rescale *= rescale;
-  }
-
-  std::vector<QSpinorHandle> q_spinor_handles;
-
-  QGauge *u_packed[2] = {nullptr, nullptr};
-  QGauge_inner *u_packed_inner[2] = {nullptr, nullptr};
-  for (int cb : {0, 1}) {
-    u_packed[cb] = (QGauge *)geom.allocCBGauge();
-  }
-  // Reorder (global) input gauge field from tmLQCD to QPhiX
-  reorder_gauge_to_QPhiX(geom, u_packed[cb_even], u_packed[cb_odd]);
-
-  // for mixed solvers, we also need the gauge field in the inner precision
-  if (solver_is_mixed(solver_flag)) {
-    for (int cb : {0, 1}) {
-      u_packed_inner[cb] = (QGauge_inner *)geom_inner.allocCBGauge();
-    }
-    reorder_gauge_to_QPhiX(geom_inner, u_packed_inner[cb_even], u_packed_inner[cb_odd]);
-  }
-
-  if (num_flavour == 1) {
-    constexpr int nf = 1;
-    std::vector<QSpinor *> qphix_in;
-    qphix_in.resize(1);
-    std::vector<QSpinor *> qphix_out;
-    qphix_out.resize(num_shifts);
-    QSpinor *qphix_buffer;
-
-    QClover *qphix_clover = nullptr;
-    QClover *qphix_inv_clover = nullptr;
-
-    QClover_inner *qphix_clover_inner = nullptr;
-    QClover_inner *qphix_inv_clover_inner = nullptr;
-
-    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
-
-    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
-
-    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-    qphix_in[0] = q_spinor_handles.back().get();
-
-    for (int shift = 0; shift < num_shifts; shift++) {
-      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-      qphix_out[shift] = q_spinor_handles.back().get();
-    }
-
-    q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-    qphix_buffer = q_spinor_handles.back().get();
-
-    QPhiX::EvenOddLinearOperator<FT, V, S, compress> *FermionMatrixQPhiX = nullptr;
-    QPhiX::EvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
-        *InnerFermionMatrixQPhiX = nullptr;
-    if ((fabs(g_mu) > DBL_EPSILON) && g_c_sw > DBL_EPSILON) {  // TWISTED-MASS-CLOVER
-      qphix_clover = (QClover *)geom.allocCBClov();
-      for (int fl : {0, 1}) {
-        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
-      }
-      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
-      reorder_clover_to_QPhiX(geom, qphix_inv_fullclover, cb_even, true);
-
-      QPhiX::masterPrintf("# Creating QPhiX Twisted Clover Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddTMCloverOperator<FT, V, S, compress>(
-          u_packed, qphix_clover, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t,
-          use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
-      if (solver_is_mixed(solver_flag)) {
-        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        for (int fl : {0, 1}) {
-          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
-        }
-        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
-        reorder_clover_to_QPhiX(geom_inner, qphix_inv_fullclover_inner, cb_even, true);
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddTMCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                u_packed_inner, qphix_clover_inner, qphix_inv_fullclover_inner, &geom_inner,
-                t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * (g_mu3 + g_mu) / g_kappa);
-      }
-      QPhiX::masterPrintf("# ...done.\n");
-    } else if (fabs(g_mu) > DBL_EPSILON) {  // TWISTED-MASS
-      const double TwistedMass = -g_mu / (2.0 * g_kappa);
-      QPhiX::masterPrintf("# Creating QPhiX Twisted Mass Wilson Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddTMWilsonOperator<FT, V, S, compress>(
-          mass, TwistedMass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      QPhiX::masterPrintf("# ...done.\n");
-      if (solver_is_mixed(solver_flag)) {
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddTMWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                mass, TwistedMass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t,
-                use_tbc, tbc_phases);
-      }
-    } else if (g_c_sw > DBL_EPSILON) {  // WILSON CLOVER
-      qphix_clover = (QClover *)geom.allocCBClov();
-      qphix_inv_clover = (QClover *)geom.allocCBClov();
-
-      reorder_clover_to_QPhiX(geom, qphix_clover, cb_odd, false);
-      reorder_clover_to_QPhiX(geom, qphix_inv_clover, cb_even, true);
-
-      QPhiX::masterPrintf("# Creating QPhiX Wilson Clover Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddCloverOperator<FT, V, S, compress>(
-          u_packed, qphix_clover, qphix_inv_clover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
-          tbc_phases, -0.5 * g_mu3 / g_kappa);
-      if (solver_is_mixed(solver_flag)) {
-        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        qphix_inv_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        reorder_clover_to_QPhiX(geom_inner, qphix_clover_inner, cb_odd, false);
-        reorder_clover_to_QPhiX(geom_inner, qphix_inv_clover_inner, cb_even, true);
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddCloverOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                u_packed_inner, qphix_clover_inner, qphix_inv_clover_inner, &geom_inner, t_boundary,
-                coeff_s, coeff_t, use_tbc, tbc_phases, -0.5 * g_mu3 / g_kappa);
-      }
-      QPhiX::masterPrintf("# ...done.\n");
-
-    } else {  // WILSON
-      QPhiX::masterPrintf("# Creating QPhiX Wilson Fermion Matrix...\n");
-      FermionMatrixQPhiX = new QPhiX::EvenOddWilsonOperator<FT, V, S, compress>(
-          mass, u_packed, &geom, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      if (solver_is_mixed(solver_flag)) {
-        InnerFermionMatrixQPhiX =
-            new QPhiX::EvenOddWilsonOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                mass, u_packed_inner, &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc,
-                tbc_phases);
-      }
-      QPhiX::masterPrintf("# ...done.\n");
-    }
-
-    // Create a Linear Solver Object
-    QPhiX::AbstractSolver<FT, V, S, compress> *SolverQPhiX = nullptr;
-    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner> *InnerSolverQPhiX = nullptr;
-    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *MultiSolverQPhiX = nullptr;
-    if (solver_flag == DUMMYHERMTEST) {
-      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
-      SolverQPhiX =
-          new QPhiX::InvDummyHermTest<FT, V, S, compress,
-                                      typename QPhiX::EvenOddLinearOperator<FT, V, S, compress> >(
-              *FermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == CG) {
-      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
-      SolverQPhiX = new QPhiX::InvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == BICGSTAB) {
-      QPhiX::masterPrintf("# QPHIX: Creating BiCGStab solver...\n");
-      SolverQPhiX = new QPhiX::InvBiCGStab<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == MIXEDCG) {
-      // TODO: probably need to adjust inner solver iterations here...
-      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
-      InnerSolverQPhiX = new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner>(
-          *InnerFermionMatrixQPhiX, max_iter);
-      const bool MMdag = true;
-      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
-                                                      S_inner, compress_inner, MMdag>(
-          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
-    } else if (solver_flag == MIXEDBICGSTAB) {
-      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision BICGCGSTAB solver...\n");
-      InnerSolverQPhiX = new QPhiX::InvBiCGStab<FT_inner, V_inner, S_inner, compress_inner>(
-          *InnerFermionMatrixQPhiX, max_iter);
-      const bool MMdag = false;
-      SolverQPhiX = new QPhiX::InvRichardsonMultiPrec<FT, V, S, compress, FT_inner, V_inner,
-                                                      S_inner, compress_inner, MMdag>(
-          *FermionMatrixQPhiX, *InnerSolverQPhiX, solver_params.mcg_delta, max_iter);
-    } else if (solver_flag == CGMMS) {
-      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver ...\n");
-      MultiSolverQPhiX =
-          new QPhiX::MInvCG<FT, V, S, compress>(*FermionMatrixQPhiX, max_iter, num_shifts);
-    } else {
-      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
-      QPhiX::masterPrintf(" Aborting...\n");
-      abort();
-    }
-    QPhiX::masterPrintf("# ...done.\n");
-
-    //     reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
-    //     *const>(tmlqcd_odd_in[0][0]),
-    //                                qphix_in[0], cb_odd);
-    reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][0], qphix_in[0], cb_odd);
-    QPhiX::masterPrintf("# Calling the solver...\n");
-
-    // Set the right precision for the QPhiX solver
-    // we get target_precision externally and and is given such, that it's either
-    // already relative or absolute
-    // Most QPhiX solvers allow setting absolute or relative residual
-    // by passing an appropriate flag, but this is not true for the multi-shift solver.
-    // As a result, we follow that solver and call ALL solvers with
-    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
-    double rhs_norm2 = 1.0;
-    QPhiX::norm2Spinor(rhs_norm2, qphix_in[0], geom, n_blas_simt);
-    const double RsdTarget = sqrt(target_precision / rhs_norm2);
-
-    // Calling the solver
-    start_time = gettime();
-    if (solver_flag == DUMMYHERMTEST) {
-      random_spinor_field_eo(tmlqcd_odd_out[0][0], 0, RN_GAUSS);
-      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][0], qphix_buffer, cb_odd);
-      for (int isign : {-1, 1}) {
-        (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
-                       isign, verbose, cb_odd, QPhiX::RELATIVE);
-      }
-      QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-    } else if (solver_flag == CG || solver_flag == MIXEDCG || solver_flag == RGMIXEDCG) {
-      // USING CG:
-      // We are solving
-      //   M M^dagger qphix_buffer = qphix_in_prepared
-      // here, that is, isign = -1 for the QPhiX CG solver.
-      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
-                     -1, verbose, cb_odd, QPhiX::RELATIVE);
-      // After that. if required by the solution type, multiply with M^dagger:
-      //   qphix_out[1] = M^dagger ( M^dagger^-1 M^-1 ) qphix_in_prepared
-      if (solver_params.solution_type == TM_SOLUTION_M) {
-        (*FermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
-        mv_apps++;
-      } else {
-        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    } else if (solver_flag == CGMMS) {
-      // TODO: handle the residuals properly
-      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMS: shifts: \n");
-      for (int shift = 0; shift < num_shifts; shift++) {
-        RsdTargetArr[shift] = RsdTarget;
-        RsdFinalArr[shift] = -1.0;
-        shifts[shift] =
-            solver_params.shifts[shift] * solver_params.shifts[shift] / (4 * g_kappa * g_kappa);
-        if (g_debug_level > 2)
-          QPhiX::masterPrintf("# QPHIX CGMMS: shift[%d] = %.6e\n", shift, shifts[shift]);
-      }
-      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
-      (*MultiSolverQPhiX)(qphix_out.data(), qphix_in[0], num_shifts, shifts.data(),
-                          RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops, mv_apps, -1,
-                          verbose);
-      rsd_final = RsdFinalArr[0];
-    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
-      (*SolverQPhiX)(qphix_buffer, qphix_in[0], RsdTarget, niters, rsd_final, site_flops, mv_apps,
-                     1, verbose, cb_odd, QPhiX::RELATIVE);
-      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
-      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-        (*SolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
-                       mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
-      } else {
-        QPhiX::copySpinor(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    }
-    end_time = gettime();
-
-    for (int shift = 0; shift < num_shifts; shift++) {
-      reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][0], qphix_out[shift], cb_odd,
-                                   rescale);
-    }
-
-    QPhiX::masterPrintf("# QPHIX: ...done.\n");
-    QPhiX::masterPrintf("# QPHIX: Cleaning up\n");
-    delete (FermionMatrixQPhiX);
-    delete (InnerFermionMatrixQPhiX);
-    delete (SolverQPhiX);
-    delete (InnerSolverQPhiX);
-    delete (MultiSolverQPhiX);
-    // on KNL, it seems that munmap is problematic, so we check for nullptr
-    if (qphix_clover) geom.free(qphix_clover);
-    if (qphix_inv_clover) geom.free(qphix_inv_clover);
-    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
-    if (qphix_inv_clover_inner) geom_inner.free(qphix_inv_clover_inner);
-    for (int fl : {0, 1}) {
-      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
-      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
-    }
-    QPhiX::masterPrintf("# QPHIX: ...done.\n\n");
-
-  } else if (num_flavour == 2) {
-    // for explicit template arguments
-    constexpr int nf = 2;
-
-    QSpinor *qphix_in[2];
-    std::vector<QSpinor **> qphix_out;
-    qphix_out.resize(num_shifts);
-    for (int shift = 0; shift < num_shifts; shift++) {
-      qphix_out[shift] = new QSpinor *[2];
-      for (int fl : {0, 1}) {
-        q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-        qphix_out[shift][fl] = q_spinor_handles.back().get();
-      }
-    }
-
-    QSpinor *qphix_buffer[2];
-    for (int fl : {0, 1}) {
-      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-      qphix_in[fl] = q_spinor_handles.back().get();
-      q_spinor_handles.push_back(makeFourSpinorHandle(geom));
-      qphix_buffer[fl] = q_spinor_handles.back().get();
-    }
-
-    QClover *qphix_clover = nullptr;
-    QClover_inner *qphix_clover_inner = nullptr;
-
-    QClover *qphix_invclov_odiag = nullptr;
-    QClover_inner *qphix_invclov_odiag_inner = nullptr;
-
-    QFullClover *qphix_inv_fullclover[2] = {nullptr, nullptr};
-    QFullClover_inner *qphix_inv_fullclover_inner[2] = {nullptr, nullptr};
-
-    QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> *TwoFlavFermionMatrixQPhiX = nullptr;
-    QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner, compress_inner>
-        *InnerTwoFlavFermionMatrixQPhiX = nullptr;
-
-    if (g_c_sw > DBL_EPSILON) {  // DBCLOVER
-      qphix_clover = (QClover *)geom.allocCBClov();
-      qphix_invclov_odiag = (QClover *)geom.allocCBClov();
-      if (solver_is_mixed(solver_flag)) {
-        qphix_clover_inner = (QClover_inner *)geom_inner.allocCBClov();
-        qphix_invclov_odiag_inner = (QClover_inner *)geom_inner.allocCBClov();
-      }
-
-      for (int fl : {0, 1}) {
-        qphix_inv_fullclover[fl] = (QFullClover *)geom.allocCBFullClov();
-        if (solver_is_mixed(solver_flag)) {
-          qphix_inv_fullclover_inner[fl] = (QFullClover_inner *)geom_inner.allocCBFullClov();
-        }
-      }
-
-      pack_nd_clover(geom, geom_inner, qphix_inv_fullclover, qphix_invclov_odiag, qphix_clover,
-                     qphix_inv_fullclover_inner, qphix_invclov_odiag_inner, qphix_clover_inner,
-                     cb_odd, solver_is_mixed(solver_flag));
-
-      QPhiX::masterPrintf(
-          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Clover Fermion Matrix...\n");
-      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMCloverReuseOperator<FT, V, S, compress>(
-          -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, qphix_clover,
-          qphix_invclov_odiag, qphix_inv_fullclover, &geom, t_boundary, coeff_s, coeff_t, use_tbc,
-          tbc_phases);
-      if (solver_is_mixed(solver_flag)) {
-        InnerTwoFlavFermionMatrixQPhiX =
-            new QPhiX::EvenOddNDTMCloverReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
-                qphix_clover_inner, qphix_invclov_odiag_inner, qphix_inv_fullclover_inner,
-                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      }
-    } else {  // DBTMWILSON
-      QPhiX::masterPrintf(
-          "# QPHIX: Creating two-flavour QPhiX Wilson Twisted Mass Fermion Matrix...\n");
-      TwoFlavFermionMatrixQPhiX = new QPhiX::EvenOddNDTMWilsonReuseOperator<FT, V, S, compress>(
-          mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed, &geom, t_boundary,
-          coeff_s, coeff_t, use_tbc, tbc_phases);
-      if (solver_is_mixed(solver_flag)) {
-        InnerTwoFlavFermionMatrixQPhiX =
-            new QPhiX::EvenOddNDTMWilsonReuseOperator<FT_inner, V_inner, S_inner, compress_inner>(
-                mass, -0.5 * g_mubar / g_kappa, 0.5 * g_epsbar / g_kappa, u_packed_inner,
-                &geom_inner, t_boundary, coeff_s, coeff_t, use_tbc, tbc_phases);
-      }
-    }
-
-    //
-    QPhiX::AbstractSolver<FT, V, S, compress, nf> *TwoFlavSolverQPhiX = nullptr;
-    QPhiX::AbstractSolver<FT_inner, V_inner, S_inner, compress_inner, nf> *InnerTwoFlavSolverQPhiX =
-        nullptr;
-    QPhiX::AbstractMultiSolver<FT, V, S, compress, nf> *TwoFlavMultiSolverQPhiX = nullptr;
-    if (solver_flag == DUMMYHERMTEST) {
-      QPhiX::masterPrintf("# QPHIX: Creating dummy solver for hermiticity test...\n");
-      TwoFlavSolverQPhiX = new QPhiX::InvDummyHermTest<
-          FT, V, S, compress, typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-          *TwoFlavFermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == CG) {
-      QPhiX::masterPrintf("# QPHIX: Creating CG solver...\n");
-      TwoFlavSolverQPhiX =
-          new QPhiX::InvCG<FT, V, S, compress,
-                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-              *TwoFlavFermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == BICGSTAB) {
-      QPhiX::masterPrintf("# QPHIX: Creating BiCGstab solver...\n");
-      TwoFlavSolverQPhiX =
-          new QPhiX::InvBiCGStab<FT, V, S, compress,
-                                 typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-              *TwoFlavFermionMatrixQPhiX, max_iter);
-    } else if (solver_flag == MIXEDCG) {
-      QPhiX::masterPrintf("# QPHIX: Creating mixed-precision CG solver...\n");
-      InnerTwoFlavSolverQPhiX =
-          new QPhiX::InvCG<FT_inner, V_inner, S_inner, compress_inner,
-                           typename QPhiX::TwoFlavEvenOddLinearOperator<FT_inner, V_inner, S_inner,
-                                                                        compress_inner> >(
-              *InnerTwoFlavFermionMatrixQPhiX, max_iter);
-      const bool MMdag = true;
-      TwoFlavSolverQPhiX = new QPhiX::InvRichardsonMultiPrec<
-          FT, V, S, compress, FT_inner, V_inner, S_inner, compress_inner, MMdag,
-          typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-          *TwoFlavFermionMatrixQPhiX, *InnerTwoFlavSolverQPhiX, solver_params.mcg_delta, max_iter);
-    } else if (solver_flag == CGMMSND) {
-      QPhiX::masterPrintf("# QPHIX: Creating multi-shift CG solver...\n");
-      TwoFlavMultiSolverQPhiX =
-          new QPhiX::MInvCG<FT, V, S, compress,
-                            typename QPhiX::TwoFlavEvenOddLinearOperator<FT, V, S, compress> >(
-              *TwoFlavFermionMatrixQPhiX, max_iter, num_shifts);
-    } else {
-      QPhiX::masterPrintf(" Solver not yet supported by QPhiX!\n");
-      QPhiX::masterPrintf(" Aborting...\n");
-      abort();
-    }
-    QPhiX::masterPrintf("# QPHIX: ...done.\n");
-
-    for (int fl : {0, 1}) {
-      //       reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const
-      //       *const>(tmlqcd_odd_in[0][fl]),
-      //                                  qphix_in[fl], cb_odd);
-      reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_in[0][fl], qphix_in[fl], cb_odd);
-    }
-
-    QPhiX::masterPrintf("# QPHIX: Calling the solver...\n");
-
-    // Set the right precision for the QPhiX solver
-    // we get target_precision externally and and is given such, that it's either
-    // already relative or absolute
-    // Most QPhiX solvers allow setting absolute or relative residual
-    // by passing an appropriate flag, but this is not true for the multi-shift solver.
-    // As a result, we follow that solver and call ALL solvers with
-    // QPhiX::RELATIVE, which gives results consistent with tmLQCD in all cases.
-    double rhs_norm2 = 1.0;
-    QPhiX::norm2Spinor<FT, V, S, compress, nf>(rhs_norm2, qphix_in, geom, n_blas_simt);
-    const double RsdTarget = sqrt(target_precision / rhs_norm2);
-
-    // Calling the solver
-    start_time = gettime();
-    if (solver_flag == DUMMYHERMTEST) {
-      for (int fl : {0, 1}) {
-        random_spinor_field_eo(tmlqcd_odd_out[0][fl], 0, RN_GAUSS);
-        reorder_eo_spinor_to_QPhiX(geom, tmlqcd_odd_out[0][fl], qphix_buffer[fl], cb_odd);
-      }
-      for (int isign : {-1, 1}) {
-        (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
-                              mv_apps, isign, verbose, cb_odd, QPhiX::RELATIVE);
-      }
-      QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-    } else if (solver_flag == CG || solver_flag == MIXEDCG) {
-      // USING CG:
-      // We are solving
-      //   M M^dagger qphix_buffer = qphix_in_prepared
-      // here, that is, isign = -1 for the QPhiX CG solver.
-      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
-                            mv_apps, -1, verbose, cb_odd, QPhiX::RELATIVE);
-      // After that. if required by the solution type, multiply with M^dagger:
-      //   qphix_out[1] = M^dagger M^dagger^-1 M^-1 qphix_in_prepared
-      if (solver_params.solution_type == TM_SOLUTION_M) {
-        (*TwoFlavFermionMatrixQPhiX)(qphix_out[0], qphix_buffer, /* conjugate */ -1);
-        mv_apps++;
-      } else {
-        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    } else if (solver_flag == BICGSTAB || solver_flag == MIXEDBICGSTAB) {
-      (*TwoFlavSolverQPhiX)(qphix_buffer, qphix_in, RsdTarget, niters, rsd_final, site_flops,
-                            mv_apps, 1, verbose, cb_odd, QPhiX::RELATIVE);
-      // for M^dagger^-1 M^-1 solution type, need to call BiCGstab twice
-      if (solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-        (*TwoFlavSolverQPhiX)(qphix_out[0], qphix_buffer, RsdTarget, niters2, rsd_final, site_flops,
-                              mv_apps2, -1, verbose, cb_odd, QPhiX::RELATIVE);
-      } else {
-        QPhiX::copySpinor<FT, V, S, compress, nf>(qphix_out[0], qphix_buffer, geom, n_blas_simt);
-      }
-    } else if (solver_flag == CGMMSND) {
-      // TODO: handle the residuals properly
-      if (g_debug_level > 2) QPhiX::masterPrintf("# QPHIX CGMMSND: shifts: \n");
-      // tmLQCD weights the operator with 1/maxev in the RHMC relative to the shifts
-      // we will do this externally on the inverse (in monomial_solve) and thus need to weight
-      // the shifts by maxev^2
-      const double maxev_sq = (1.0 / phmc_invmaxev) * (1.0 / phmc_invmaxev);
-      for (int shift = 0; shift < num_shifts; shift++) {
-        RsdTargetArr[shift] = RsdTarget;
-        RsdFinalArr[shift] = -1.0;
-        shifts[shift] = maxev_sq * solver_params.shifts[shift] * solver_params.shifts[shift] /
-                        (4 * g_kappa * g_kappa);
-        if (g_debug_level > 2) QPhiX::masterPrintf("# [%d] = %lf\n", shift, shifts[shift]);
-      }
-      if (g_debug_level > 2) QPhiX::masterPrintf("\n");
-      (*TwoFlavMultiSolverQPhiX)(qphix_out.data(), qphix_in, num_shifts, shifts.data(),
-                                 RsdTargetArr.data(), niters, RsdFinalArr.data(), site_flops,
-                                 mv_apps, -1, verbose);
-      rsd_final = RsdFinalArr[0];
-    }
-    end_time = gettime();
-
-    for (int shift = 0; shift < num_shifts; shift++) {
-      for (int fl : {0, 1}) {
-        reorder_eo_spinor_from_QPhiX(geom, tmlqcd_odd_out[shift][fl], qphix_out[shift][fl], cb_odd,
-                                     rescale);
-      }
-    }
-
-    delete TwoFlavFermionMatrixQPhiX;
-    delete InnerTwoFlavFermionMatrixQPhiX;
-    delete InnerTwoFlavSolverQPhiX;
-    delete TwoFlavMultiSolverQPhiX;
-    delete TwoFlavSolverQPhiX;
-    for (int shift = 0; shift < num_shifts; shift++) {
-      delete[] qphix_out[shift];
-    }
-
-    if (qphix_clover) geom.free(qphix_clover);
-    if (qphix_invclov_odiag) geom.free(qphix_invclov_odiag);
-    if (qphix_clover_inner) geom_inner.free(qphix_clover_inner);
-    if (qphix_invclov_odiag_inner) geom_inner.free(qphix_invclov_odiag_inner);
-    for (int fl : {0, 1}) {
-      if (qphix_inv_fullclover[fl]) geom.free(qphix_inv_fullclover[fl]);
-      if (qphix_inv_fullclover_inner[fl]) geom_inner.free(qphix_inv_fullclover_inner[fl]);
-    }
-
-  } else {  // if(num_flavour)
-    // complain, this number of flavours is not valid
-  }  // if(num_flavour)
-
-  for (int cb : {0, 1}) {
-    if (u_packed[cb]) geom.free(u_packed[cb]);
-    if (u_packed_inner[cb]) geom_inner.free(u_packed_inner[cb]);
-  }
-
-  // FIXME: This should be called properly somewhere else
-  _endQphix();
-
-  QPhiX::masterPrintf("# ...done.\n\n");
-
-  uint64_t num_cb_sites = lattSize[0] / 2 * lattSize[1] * lattSize[2] * lattSize[3];
-  // FIXME: this needs to be adjusted depending on the operator used
-  uint64_t op_flops_per_site = 1320;
-  uint64_t total_flops =
-      (site_flops + site_flops2 + (2 * num_flavour * op_flops_per_site) * (mv_apps + mv_apps2)) *
-      num_cb_sites;
-  QPhiX::masterPrintf("# QPHIX: Solver Time = %g sec\n", (end_time - start_time));
-  QPhiX::masterPrintf("# QPHIX: Performance in GFLOPS = %g\n\n",
-                      1.0e-9 * total_flops / (end_time - start_time));
-
-  if (solver_is_mixed(solver_flag)) {
-    // the mixed solver reports the outer iterations, we would like to get
-    // some better total
-    niters = mv_apps / 2;
-    if (solver_flag == MIXEDBICGSTAB && solver_params.solution_type == TM_SOLUTION_M_MDAG) {
-      niters2 = mv_apps2 / 2;
-    }
-  }
-  // solver did not converge in maximum number of iterations
-  // FIXME: non-convergence does not work correctly yet
-  if ((niters + niters2) > max_iter) {
-    niters = -1;
-    niters2 = 0;
-  }
-  return (niters + niters2);
-}
-
-// Due to github issue #404, the helper functions to apply the full QPhiX operator
-// are currently disabled because they conflict with the new interfaces in QPhiX
-// itself. If required, these should be rewritten to use these interfaces
-// rather than the base classes in qphix_base_classes.hpp
-
-// Template wrapper for the Dslash operator call-able from C code
-// void Mfull_qphix(spinor *Even_out, spinor *Odd_out, const spinor *Even_in, const spinor *Odd_in,
-//                 const op_type_t op_type) {
-//  tmlqcd::checkQphixInputParameters(qphix_input);
-//  // FIXME: two-row gauge compression and double precision hard-coded
-//  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
-//
-//  if (qphix_precision == QPHIX_DOUBLE_PREC) {
-//    if (QPHIX_SOALEN > VECLEN_DP) {
-//      QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
-//                          VECLEN_DP);
-//      abort();
-//    }
-//    QPhiX::masterPrintf("TESTING IN DOUBLE PRECISION \n");
-//    if (compress12) {
-//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                          op_type);
-//    } else {
-//      Mfull_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                           op_type);
-//    }
-//  } else if (qphix_precision == QPHIX_FLOAT_PREC) {
-//    if (QPHIX_SOALEN > VECLEN_SP) {
-//      QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
-//                          VECLEN_SP);
-//      abort();
-//    }
-//    QPhiX::masterPrintf("TESTING IN SINGLE PRECISION \n");
-//    if (compress12) {
-//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                         op_type);
-//    } else {
-//      Mfull_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                          op_type);
-//    }
-//  }
-// #if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
-//  else if (qphix_precision == QPHIX_HALF_PREC) {
-//    if (QPHIX_SOALEN > VECLEN_HP) {
-//      QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
-//                          VECLEN_HP);
-//      abort();
-//    }
-//    QPhiX::masterPrintf("TESTING IN HALF PRECISION \n");
-//    if (compress12) {
-//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(Even_out, Odd_out, Even_in, Odd_in,
-//                                                               op_type);
-//    } else {
-//      Mfull_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(Even_out, Odd_out, Even_in,
-//      Odd_in,
-//                                                                op_type);
-//    }
-//  }
-// #endif
-//}
-
-// we have a unified interface for n-flavour inversions, but we need to provide wrappers
-// which can be called by the tmLQCD solver drivers for one and two-flavour inversions
-int invert_eo_qphix_oneflavour(spinor *Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
-                               const double precision, const int solver_flag, const int rel_prec,
-                               const solver_params_t solver_params, const SloppyPrecision sloppy,
-                               const CompressionType compression) {
-  const int num_flavour = 1;
-  const int num_shifts = 1;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_out[0].resize(num_flavour);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_1f;
-  Odd_out[0][0] = Odd_out_1f;
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-int invert_eo_qphix_oneflavour_mshift(spinor **Odd_out_1f, spinor *Odd_in_1f, const int max_iter,
-                                      const double precision, const int solver_flag,
-                                      const int rel_prec, const solver_params_t solver_params,
-                                      const SloppyPrecision sloppy,
-                                      const CompressionType compression) {
-  // even though the default is set to 1, guard against zeroes
-  const int num_shifts = solver_params.no_shifts == 0 ? 1 : solver_params.no_shifts;
-  const int num_flavour = 1;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_1f;
-  for (int shift = 0; shift < num_shifts; shift++) {
-    Odd_out[shift].resize(num_flavour);
-    Odd_out[shift][0] = Odd_out_1f[shift];
-  }
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-// Template wrapper for QPhiX solvers callable from C code, return number of iterations
-int invert_eo_qphix_twoflavour(spinor *Odd_out_s, spinor *Odd_out_c, spinor *Odd_in_s,
-                               spinor *Odd_in_c, const int max_iter, const double precision,
-                               const int solver_flag, const int rel_prec,
-                               const solver_params_t solver_params, const SloppyPrecision sloppy,
-                               const CompressionType compression) {
-  const int num_flavour = 2;
-  const int num_shifts = 1;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_out[0].resize(num_flavour);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_s;
-  Odd_in[0][1] = Odd_in_c;
-
-  Odd_out[0][0] = Odd_out_s;
-  Odd_out[0][1] = Odd_out_c;
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-int invert_eo_qphix_twoflavour_mshift(spinor **Odd_out_s, spinor **Odd_out_c, spinor *Odd_in_s,
-                                      spinor *Odd_in_c, const int max_iter, const double precision,
-                                      const int solver_flag, const int rel_prec,
-                                      const solver_params_t solver_params,
-                                      const SloppyPrecision sloppy,
-                                      const CompressionType compression) {
-  // even though the default is set to 1, guard against zeroes
-  const int num_shifts = solver_params.no_shifts == 0 ? 1 : solver_params.no_shifts;
-  const int num_flavour = 2;
-  std::vector<std::vector<spinor *> > Odd_out;
-  std::vector<std::vector<spinor *> > Odd_in;
-
-  Odd_out.resize(num_shifts);
-  Odd_in.resize(1);
-  Odd_in[0].resize(num_flavour);
-
-  Odd_in[0][0] = Odd_in_s;
-  Odd_in[0][1] = Odd_in_c;
-
-  for (int shift = 0; shift < num_shifts; shift++) {
-    Odd_out[shift].resize(num_flavour);
-    Odd_out[shift][0] = Odd_out_s[shift];
-    Odd_out[shift][1] = Odd_out_c[shift];
-  }
-
-  return invert_eo_qphix_nflavour_mshift(Odd_out, Odd_in, precision, max_iter, solver_flag,
-                                         rel_prec, solver_params, sloppy, compression, num_flavour);
-}
-
-// Template wrapper for QPhiX solvers callable from C code, return number of iterations
-// the interface is prepared for multi-rhs solves, hence the double vector for the input
-int invert_eo_qphix_nflavour_mshift(std::vector<std::vector<spinor *> > &Odd_out,
-                                    std::vector<std::vector<spinor *> > &Odd_in,
-                                    const double precision, const int max_iter,
-                                    const int solver_flag, const int rel_prec,
-                                    solver_params_t solver_params, const SloppyPrecision sloppy,
-                                    const CompressionType compression, const int num_flavour) {
-  tmlqcd::checkQphixInputParameters(qphix_input);
-  double target_precision = precision;
-  double src_norm = 0.0;
-  for (int f = 0; f < num_flavour; ++f) {
-    src_norm += square_norm(Odd_in[0][f], VOLUME / 2, 1);
-  }
-  // we use "precision_lambda" to determine if a system can be solved in half or float
-  // precision (when a fixed-precision solver is used)
-  double precision_lambda = target_precision / src_norm;
-  if (rel_prec == 1) {
-    QPhiX::masterPrintf("# QPHIX: Using relative precision\n");
-    target_precision = precision * src_norm;
-    precision_lambda = precision;
-  }
-  QPhiX::masterPrintf("# QPHIX: precision_lambda: %g, target_precision: %g\n\n", precision_lambda,
-                      target_precision);
-
-  // mixed solvers require inner and outer precisions, which we specify explicitly here
-  if (solver_is_mixed(solver_flag)) {
-#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
-    if (sloppy == SLOPPY_HALF) {
-      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_HP) {
-        QPhiX::masterPrintf(
-            "SOALEN=%d is greater than the half prec VECLEN=%d or the double prec VECLEN=%d\n",
-            QPHIX_SOALEN, VECLEN_HP, VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE-HALF PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_HALF_PREC);
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, QPhiX::half, VECLEN_HP,
-                                      QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, QPhiX::half,
-                                      VECLEN_HP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else
-#else
-    if (sloppy == SLOPPY_HALF) {
-      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
-      abort();
-    } else
-#endif
-        if (sloppy == SLOPPY_SINGLE) {
-      if (QPHIX_SOALEN > VECLEN_DP || QPHIX_SOALEN > VECLEN_SP) {
-        QPhiX::masterPrintf(
-            "SOALEN=%d is greater than the single prec VECLEN=%d or the double prec VECLEN=%d\n",
-            QPHIX_SOALEN, VECLEN_SP, VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE-SINGLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_FLOAT_PREC);
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true, float, VECLEN_SP,
-                                      QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false, float, VECLEN_SP,
-                                      QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else {  // if(sloppy)
-      if (QPHIX_SOALEN > VECLEN_DP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX MIXED SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE-DOUBLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC, QPHIX_DOUBLE_PREC);
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    }  // if( sloppy )
-  } else {  // if( solver_is_mixed )
-#if (defined(QPHIX_MIC_SOURCE) || defined(QPHIX_AVX512_SOURCE))
-    if (sloppy == SLOPPY_HALF || precision_lambda >= rsdTarget<QPhiX::half>::value) {
-      if (QPHIX_SOALEN > VECLEN_HP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the half prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_HP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
-      QPhiX::masterPrintf("# USING HALF PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_HALF_PREC);
-
-      if (compress12) {
-        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<QPhiX::half, VECLEN_HP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else
-#else
-    if (sloppy == SLOPPY_HALF) {
-      QPhiX::masterPrintf("QPHIX interface: half precision not supported on this architecture!\n");
-      abort();
-    } else
-#endif
-        if (sloppy == SLOPPY_SINGLE || precision_lambda >= rsdTarget<float>::value) {
-      if (QPHIX_SOALEN > VECLEN_SP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the single prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_SP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
-      QPhiX::masterPrintf("# USING SINGLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_FLOAT_PREC);
-
-      if (compress12) {
-        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<float, VECLEN_SP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    } else {
-      if (QPHIX_SOALEN > VECLEN_DP) {
-        QPhiX::masterPrintf("SOALEN=%d is greater than the double prec VECLEN=%d\n", QPHIX_SOALEN,
-                            VECLEN_DP);
-        abort();
-      }
-      QPhiX::masterPrintf("# INITIALIZING QPHIX SOLVER\n");
-      QPhiX::masterPrintf("# USING DOUBLE PRECISION\n");
-      _initQphix(0, nullptr, qphix_input, compression, QPHIX_DOUBLE_PREC);
-
-      if (compress12) {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, true>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      } else {
-        return invert_eo_qphix_helper<double, VECLEN_DP, QPHIX_SOALEN, false>(
-            Odd_out, Odd_in, target_precision, max_iter, solver_flag, solver_params, num_flavour);
-      }
-    }  // if( sloppy || target_precision )
-  }  // if ( solver_flag == *MIXEDCG )
-  return -1;
-}
-
-void tmlqcd::checkQphixInputParameters(const tm_QPhiXParams_t &params) {
-  if (params.MinCt == 0) {
-    QPhiX::masterPrintf("QPHIX Error: MinCt cannot be 0! Minimal value: 1. Aborting.\n");
-    abort();
-  }
-  if (params.By == 0 || params.Bz == 0) {
-    QPhiX::masterPrintf("QPHIX Error: By and Bz may not be 0! Minimal value: 1. Aborting.\n");
-    abort();
-  }
-  if (params.NCores * params.Sy * params.Sz != omp_num_threads) {
-    QPhiX::masterPrintf("QPHIX Error: NCores * Sy * Sz != ompnumthreads ! Aborting.\n");
-    abort();
-  }
-}
-
-void tmlqcd::printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner,
-                                   int SOALEN_inner, bool compress_inner) {
-  QPhiX::masterPrintf("# QphiX: VECLEN=%d SOALEN=%d VECLEN_inner=%d, SOALEN_inner=%d\n", VECLEN,
-                      SOALEN, VECLEN_inner, SOALEN_inner);
-
-  QPhiX::masterPrintf("# QphiX: Declared QMP Topology (xyzt):");
-  for (int mu = 0; mu < 4; mu++) QPhiX::masterPrintf(" %d", qmp_geom[mu]);
-  QPhiX::masterPrintf("\n");
-
-  QPhiX::masterPrintf("# QphiX: Mapping of dimensions QMP -> tmLQCD (xyzt):");
-  for (int mu = 0; mu < 4; mu++) QPhiX::masterPrintf(" %d->%d", mu, qmp_tm_map[mu]);
-  QPhiX::masterPrintf("\n");
-
-  QPhiX::masterPrintf("# QphiX: Global Lattice Size (xyzt) = ");
-  for (int mu = 0; mu < 4; mu++) {
-    QPhiX::masterPrintf(" %d", lattSize[mu]);
-  }
-  QPhiX::masterPrintf("\n");
-  QPhiX::masterPrintf("# QphiX: Local Lattice Size (xyzt) = ");
-  for (int mu = 0; mu < 4; mu++) {
-    QPhiX::masterPrintf(" %d", subLattSize[mu]);
-  }
-  QPhiX::masterPrintf("\n");
-  QPhiX::masterPrintf("# QphiX: Block Sizes: By= %d Bz=%d\n", By, Bz);
-  QPhiX::masterPrintf("# QphiX: Cores = %d\n", NCores);
-  QPhiX::masterPrintf("# QphiX: SMT Grid: Sy=%d Sz=%d\n", Sy, Sz);
-  QPhiX::masterPrintf("# QphiX: Pad Factors: PadXY=%d PadXYZ=%d\n", PadXY, PadXYZ);
-  QPhiX::masterPrintf("# QphiX: Threads_per_core = %d\n", N_simt);
-  QPhiX::masterPrintf("# QphiX: MinCt = %d\n", MinCt);
-  if (compress) {
-    QPhiX::masterPrintf("# QphiX: Using two-row gauge compression (compress12)\n");
-  }
-  if (compress_inner) {
-    QPhiX::masterPrintf("# QphiX: Inner solver using two-row gauge compression (compress12)\n");
-  }
-}
-
-void testSpinorPackers(spinor *Even_out, spinor *Odd_out, const spinor *const Even_in,
-                       const spinor *const Odd_in) {
-  tmlqcd::checkQphixInputParameters(qphix_input);
-  // FIXME: two-row gauge compression and double precision hard-coded
-  _initQphix(0, nullptr, qphix_input, 12, QPHIX_DOUBLE_PREC);
-
-  QPhiX::Geometry<double, VECLEN_SP, QPHIX_SOALEN, true> geom(subLattSize, By, Bz, NCores, Sy, Sz,
-                                                              PadXY, PadXYZ, MinCt);
-
-  auto qphix_cb_even = QPhiX::makeFourSpinorHandle(geom);
-  auto qphix_cb_odd = QPhiX::makeFourSpinorHandle(geom);
-
-  spinor **tmp;
-  init_solver_field(&tmp, VOLUME / 2, 2);
-
-  //   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Even_in),
-  //                              qphix_cb_even.get(), cb_even);
-  //   reorder_eo_spinor_to_QPhiX(geom, reinterpret_cast<double const *const>(Odd_in),
-  //                              qphix_cb_odd.get(), cb_odd);
-  reorder_eo_spinor_to_QPhiX(geom, Even_in, qphix_cb_even.get(), cb_even);
-  reorder_eo_spinor_to_QPhiX(geom, Odd_in, qphix_cb_odd.get(), cb_odd);
-
-  reorder_eo_spinor_from_QPhiX(geom, Even_out, qphix_cb_even.get(), cb_even, 1.0);
-  reorder_eo_spinor_from_QPhiX(geom, Odd_out, qphix_cb_odd.get(), cb_odd, 1.0);
-
-  diff(tmp[0], Even_out, Even_in, VOLUME / 2);
-  diff(tmp[1], Odd_out, Odd_in, VOLUME / 2);
-  double l2norm = square_norm(tmp[0], VOLUME / 2, 1) + square_norm(tmp[1], VOLUME / 2, 1);
-  QPhiX::masterPrintf("QPHIX eo spinor packer back and forth difference L2 norm: %lf\n", l2norm);
-  finalize_solver(tmp, 2);
-}
diff --git a/qphix_interface.hpp b/qphix_interface.hpp
deleted file mode 100644
index b487eda66..000000000
--- a/qphix_interface.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2017 Bartosz Kostrzewa
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- ***********************************************************************/
-
-#pragma once
-
-#include "global.h"
-#include "qphix_types.h"
-
-#ifdef __cplusplus /* If this is a C++ compiler, use C linkage */
-extern "C" {
-#endif
-
-#include "misc_types.h"
-#include "operator_types.h"
-#include "solver/matrix_mult_typedef.h"
-#include "solver/solver_params.h"
-#include "su3.h"
-
-#ifdef __cplusplus
-}
-#endif
-
-#include <vector>
-
-int invert_eo_qphix_nflavour_mshift(std::vector< std::vector< spinor* > > &Odd_out, 
-                                    std::vector< std::vector< spinor* > > &Odd_in, 
-                                    const double precision,
-                                    const int max_iter,
-                                    const int solver_flag, 
-                                    const int rel_prec,
-                                    solver_params_t solver_params,
-                                    const SloppyPrecision sloppy, const CompressionType compression,
-                                    const int num_flavour);
\ No newline at end of file
diff --git a/qphix_interface_utils.hpp b/qphix_interface_utils.hpp
deleted file mode 100644
index 56d8afe56..000000000
--- a/qphix_interface_utils.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2015 Mario Schroeck
- *               2016 Peter Labus
- *               2017 Peter Labus, Martin Ueding, Bartosz Kostrzewa
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- ***********************************************************************/
-
-#pragma once
-
-#include "qphix_types.h"
-
-namespace tmlqcd {
-
-void checkQphixInputParameters(const tm_QPhiXParams_t &params);
-void printQphixDiagnostics(int VECLEN, int SOALEN, bool compress, int VECLEN_inner, int SOALEN_inner, bool compress_inner);
-
-}  // namespace tmlqcd
diff --git a/src/bin/LapH_ev.c b/src/bin/LapH_ev.c
index dd96133fb..08e810b36 100644
--- a/src/bin/LapH_ev.c
+++ b/src/bin/LapH_ev.c
@@ -63,20 +63,20 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-    printf("# the code was compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+    printf("# the code was compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
     printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
     printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
 #endif
@@ -98,8 +98,8 @@ int main(int argc, char *argv[]) {
   exit(0);
 #endif
 #endif
-#ifdef FIXEDVOLUME
-  printf(" Error: FIXEDVOLUME not allowed");
+#ifdef TM_FIXEDVOLUME
+  printf(" Error: TM_FIXEDVOLUME not allowed");
   exit(0);
 #endif
 
diff --git a/src/bin/benchmark.c b/src/bin/benchmark.c
index 3dd70a86b..72d8c8f4d 100644
--- a/src/bin/benchmark.c
+++ b/src/bin/benchmark.c
@@ -33,7 +33,7 @@
 #include <time.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <io/gauge.h>
 #include <io/params.h>
 #endif
@@ -61,19 +61,19 @@
 #include "test/check_geometry.h"
 #include "xchange/xchange.h"
 
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
 #define SLICE (LX * LY * LZ / 2)
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2))
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2))
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2) + (T * LX * LY / 2))
-#elif defined PARALLELX
+#elif defined TM_PARALLELX
 #define SLICE ((LY * LZ * T / 2))
-#elif defined PARALLELXY
+#elif defined TM_PARALLELXY
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2))
-#elif defined PARALLELXYZ
+#elif defined TM_PARALLELXYZ
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2) + (LX * LY * T / 2))
 #endif
 
@@ -81,7 +81,7 @@ int check_xchange();
 
 int main(int argc, char *argv[]) {
   int j, j_max, k, k_max = 1;
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   paramsXlfInfo *xlfInfo;
 #endif
   int status = 0;
@@ -123,20 +123,20 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-    printf("# The code was compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+    printf("# The code was compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
     printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
     printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
 #endif
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -186,7 +186,7 @@ int main(int argc, char *argv[]) {
   /* define the boundary conditions for the fermion fields */
   boundary(g_kappa);
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
@@ -200,7 +200,7 @@ int main(int argc, char *argv[]) {
       exit(0);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -210,7 +210,7 @@ int main(int argc, char *argv[]) {
     fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
     exit(1);
   }
-#if (defined TM_USE_MPI && !(defined _USE_SHMEM))
+#if (defined TM_USE_MPI && !(defined TM_USE_SHMEM))
   check_xchange();
 #endif
 
@@ -344,7 +344,7 @@ int main(int argc, char *argv[]) {
     sdt = sdt / ((double)(2 * SLICE));
     if (g_proc_id == 0) {
       printf("# The size of the package is %d bytes.\n", (SLICE) * 192);
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
       printf("# The bandwidth is %5.2f + %5.2f MB/sec\n", 192. / sdt / 1024 / 1024,
              192. / sdt / 1024. / 1024);
 #else
@@ -431,7 +431,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   if (g_proc_id == 0) {
     printf("# Performing parallel IO test ...\n");
   }
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index d3abb66ee..75595bc60 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -64,7 +64,7 @@
 #include "solver/solver.h"
 #include "test/check_geometry.h"
 #include "update_tm.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QUDA
@@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_DERIV_MG_TUNE);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -136,7 +136,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -217,7 +217,7 @@ int main(int argc, char *argv[]) {
     exit(-1);
   }
 
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -367,7 +367,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index 2db6f8c1b..0d95a3b3c 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -67,7 +67,7 @@
 #include "solver/solver.h"
 #include "test/check_geometry.h"
 #include "update_tm.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QUDA
@@ -113,7 +113,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_HMC_TM);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -168,7 +168,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
@@ -257,7 +257,7 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -270,7 +270,7 @@ int main(int argc, char *argv[]) {
     exit(-1);
   }
 
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -504,7 +504,7 @@ int main(int argc, char *argv[]) {
     }
 
     /* online measurements */
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     // When the configuration is rejected, we have to update it in the MG and redo the setup.
     int mg_update = accept ? 0 : 1;
 #endif
@@ -514,7 +514,7 @@ int main(int argc, char *argv[]) {
         if (g_proc_id == 0) {
           fprintf(stdout, "#\n# Beginning online measurement.\n");
         }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
         if (mg_update) {
           mg_update = 0;
           MG_reset();
@@ -591,7 +591,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/invert.c b/src/bin/invert.c
index 007e0ea41..c3111decb 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -84,7 +84,7 @@
 #ifdef TM_USE_QPHIX
 #include "qphix_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #include "expo.h"
@@ -114,7 +114,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_INVERT);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -165,7 +165,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
@@ -246,7 +246,7 @@ int main(int argc, char *argv[]) {
   init_measurements();
 
   /* this could be maybe moved to init_operators */
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -258,7 +258,7 @@ int main(int argc, char *argv[]) {
     fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
     exit(-1);
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) init_xchange_halffield();
 #endif
 #endif
@@ -362,7 +362,7 @@ int main(int argc, char *argv[]) {
     g_precWS = NULL;
     if (use_preconditioning == 1) {
       /* todo load fftw wisdom */
-#if (defined HAVE_FFTW) && !(defined TM_USE_MPI)
+#if (defined TM_USE_FFTW) && !(defined TM_USE_MPI)
       loadFFTWWisdom(g_spinor_field[0], g_spinor_field[1], T, LX);
 #else
       use_preconditioning = 0;
@@ -457,7 +457,7 @@ int main(int argc, char *argv[]) {
   MPI_Finalize();
 #endif
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index c2ae72f9c..72a828fb7 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -83,7 +83,7 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_OFFLINE_MEASUREMENT);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -210,7 +210,7 @@ int main(int argc, char *argv[]) {
   init_measurements();
 
   /* this could be maybe moved to init_operators */
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -223,7 +223,7 @@ int main(int argc, char *argv[]) {
       exit(-1);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) init_xchange_halffield();
 #endif
 #endif
@@ -307,7 +307,7 @@ int main(int argc, char *argv[]) {
 #endif
   return (0);
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/check_locallity.c b/src/bin/tests/check_locallity.c
similarity index 98%
rename from src/bin/check_locallity.c
rename to src/bin/tests/check_locallity.c
index 52ea21209..f03806f21 100644
--- a/src/bin/check_locallity.c
+++ b/src/bin/tests/check_locallity.c
@@ -77,13 +77,13 @@ int main(int argc, char *argv[]) {
   double *norm;
   struct stout_parameters params_smear;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   int kb = 0;
 #endif
 #ifdef TM_USE_MPI
   double atime = 0., etime = 0.;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -186,7 +186,7 @@ int main(int argc, char *argv[]) {
   /* define the boundary conditions for the fermion fields */
   boundary();
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -199,7 +199,7 @@ int main(int argc, char *argv[]) {
       exit(-1);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -312,7 +312,7 @@ int main(int argc, char *argv[]) {
   free_spinor_field();
   free_moment_field();
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/bin/hopping_test.c b/src/bin/tests/hopping_test.c
similarity index 94%
rename from src/bin/hopping_test.c
rename to src/bin/tests/hopping_test.c
index 04df878e5..da60c83ba 100644
--- a/src/bin/hopping_test.c
+++ b/src/bin/tests/hopping_test.c
@@ -34,7 +34,7 @@
 #include <time.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <io/gauge.h>
 #include <io/params.h>
 #endif
@@ -59,19 +59,19 @@
 #include "test/check_geometry.h"
 #include "xchange/xchange.h"
 
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
 #define SLICE (LX * LY * LZ / 2)
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2))
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2))
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
 #define SLICE ((LX * LY * LZ / 2) + (T * LY * LZ / 2) + (T * LX * LZ / 2) + (T * LX * LY / 2))
-#elif defined PARALLELX
+#elif defined TM_PARALLELX
 #define SLICE ((LY * LZ * T / 2))
-#elif defined PARALLELXY
+#elif defined TM_PARALLELXY
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2))
-#elif defined PARALLELXYZ
+#elif defined TM_PARALLELXYZ
 #define SLICE ((LY * LZ * T / 2) + (LX * LZ * T / 2) + (LX * LY * T / 2))
 #endif
 
@@ -102,20 +102,20 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-    printf("# the code was compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+    printf("# the code was compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
     printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
     printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
 #endif
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -165,7 +165,7 @@ int main(int argc, char *argv[]) {
   /* define the boundary conditions for the fermion fields */
   boundary(g_kappa);
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
@@ -179,7 +179,7 @@ int main(int argc, char *argv[]) {
       exit(0);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -190,7 +190,7 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
-#if (defined TM_USE_MPI && !(defined _USE_SHMEM))
+#if (defined TM_USE_MPI && !(defined TM_USE_SHMEM))
   check_xchange();
 #endif
 
diff --git a/src/bin/qphix_test_Dslash.c b/src/bin/tests/qphix_test_Dslash.c
similarity index 99%
rename from src/bin/qphix_test_Dslash.c
rename to src/bin/tests/qphix_test_Dslash.c
index 56250bc5a..b4218d3e6 100644
--- a/src/bin/qphix_test_Dslash.c
+++ b/src/bin/tests/qphix_test_Dslash.c
@@ -35,7 +35,7 @@
 #include <time.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <io/gauge.h>
 #include <io/params.h>
 #endif
@@ -86,7 +86,7 @@ double compare_spinors(spinor* s1, spinor* s2);
 
 int main(int argc, char* argv[]) {
   int j;
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   paramsXlfInfo* xlfInfo;
 #endif
   int status = 0;
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
   tmlqcd_mpi_init(argc, argv);
   g_dbw2rand = 0;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -135,7 +135,7 @@ int main(int argc, char* argv[]) {
   /* define the geometry */
   geometry();
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
@@ -146,7 +146,7 @@ int main(int argc, char* argv[]) {
     fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n");
     exit(0);
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   init_xchange_halffield();
 #endif
 #endif
@@ -180,7 +180,7 @@ int main(int argc, char* argv[]) {
 #endif
 
   g_update_gauge_copy = 1;
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   update_backward_gauge(g_gauge_field);
 #endif
 
diff --git a/src/bin/scalar_prod_r_test.c b/src/bin/tests/scalar_prod_r_test.c
similarity index 100%
rename from src/bin/scalar_prod_r_test.c
rename to src/bin/tests/scalar_prod_r_test.c
diff --git a/src/bin/test_eigenvalues.c b/src/bin/tests/test_eigenvalues.c
similarity index 98%
rename from src/bin/test_eigenvalues.c
rename to src/bin/tests/test_eigenvalues.c
index 053944698..759d8dd2f 100644
--- a/src/bin/test_eigenvalues.c
+++ b/src/bin/tests/test_eigenvalues.c
@@ -227,7 +227,7 @@ int main(int argc, char *argv[]) {
   g_eps_sq_acc = g_eps_sq_acc1;
   g_eps_sq_force = g_eps_sq_force1;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -274,11 +274,11 @@ int main(int argc, char *argv[]) {
 
     parameterfile = fopen(parameterfilename, "w");
     printf("# This is the hmc code for twisted Mass Wilson QCD\n\nVersion %s\n", Version);
-#ifdef _NEW_GEOMETRY
-    printf("# The code was compiled with -D_NEW_GEOMETRY\n");
+#ifdef TM_NEW_GEOMETRY
+    printf("# The code was compiled with -DTM_NEW_GEOMETRY\n");
 #endif
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
 #endif
     printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
            (int)(LY), (int)(LZ));
@@ -430,7 +430,7 @@ int main(int argc, char *argv[]) {
 #ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   update_backward_gauge();
 #endif
 
diff --git a/src/bin/test_lemon.c b/src/bin/tests/test_lemon.c
similarity index 99%
rename from src/bin/test_lemon.c
rename to src/bin/tests/test_lemon.c
index f2147ad3f..3cef7689c 100644
--- a/src/bin/test_lemon.c
+++ b/src/bin/tests/test_lemon.c
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
 
   tmlqcd_mpi_init(argc, argv);
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
   xlfInfo = construct_paramsXlfInfo(plaquette_energy, 0);
   write_lime_gauge_field("conf.lime", 64, xlfInfo);
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   if (g_proc_id == 0) {
     printf("Now we do write with lemon to conf.lemon...\n");
   }
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 4ace6c997..746b40c0d 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -369,11 +369,11 @@ list(
 
 list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c
      test/overlaptests.c)
-if(TMLQCD_USE_QPHIX)
+if(TM_USE_QPHIX)
   list(APPEND MAIN_SRC_C QphiX/qphix_interface.cpp)
 endif()
 
-if(TMLQCD_USE_QUDA)
+if(TM_USE_QUDA)
   list(APPEND MAIN_SRC_C quda_interface.c)
 endif()
 
@@ -392,7 +392,8 @@ list(
   ${INIT_SRC_C}
   ${SOLVER_SRC_C}
   ${TEST_SRC_C}
-  ${MEAS_SRC_C})
+  ${MEAS_SRC_C}
+  ${PROJECT_BINARY_DIR}/git_hash.c)
 
 include_directories(
   $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
@@ -424,13 +425,10 @@ set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 # define a library and add the dependencies
 target_link_libraries(
   hmc
-  PUBLIC $<$<BOOL:${HAVE_CLOCK_GETTIME_IN_RT}>:rt>
-         $<$<BOOL:${TM_USE_LEMON}>:tmlqcd::clime>
-         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
+  PUBLIC $<$<BOOL:${TM_CLOCK_GETTIME_IN_RT}>:rt>
+         $<$<BOOL:${TM_DDalphaAMG}>:tmlqcd::DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
-         $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C
-         MPI::MPI_CXX>
          $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
          $<$<BOOL:${TM_USE_CUDA}>:CUDA::cufft
          CUDA::cufftw
@@ -440,9 +438,13 @@ target_link_libraries(
          $<$<BOOL:${TM_USE_HIP}>:hip::hipfft
          roc::hipblas
          hip::host>
+         tmlqcd::clime
+         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
          ${LAPACK_LIBRARIES}
          ${BLAS_LIBRARIES}
-         $<$<BOOL:${TM_USE_OPENMP}>:OpenMP::OpenMP_C
+         $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C
+         MPI::MPI_CXX>
+         $<$<BOOL:${TM_USE_OMP}>:OpenMP::OpenMP_C
          OpenMP::OpenMP_CXX>
          m)
 
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index 029d2f76f..80bff4fcc 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -17,13 +17,13 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  *
- * Interface for DDalphaAMG
+ * Interface for DDalphaAMG
  *
  *******************************************************************************/
 
 #include "DDalphaAMG_interface.h"
 
-#ifndef DDalphaAMG
+#ifndef TM_USE_DDalphaAMG
 
 int mg_setup_iter;
 int mg_coarse_setup_iter;
@@ -43,47 +43,47 @@ double mg_dtau_update;
 double mg_rho_update;
 
 void MG_init(void) {
-  printf("ERROR: MG_init called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_init called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_update_gauge(double step) {
-  printf("ERROR: MG_update_gauge called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_update_gauge called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD) {
-  printf("ERROR: MG_update_mu called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_update_mu called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_reset(void) {
-  printf("ERROR: MG_reset called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_reset called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 void MG_finalize(void) {
-  printf("ERROR: MG_finalize called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_finalize called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver(spinor *const phi_new, spinor *const phi_old, const double precision,
               const int max_iter, const int rel_prec, const int N, su3 **gf, matrix_mult f) {
-  printf("ERROR: MG_solver called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver_eo(spinor *const Even_new, spinor *const Odd_new, spinor *const Even,
                  spinor *const Odd, const double precision, const int max_iter, const int rel_prec,
                  const int N, su3 **gf, matrix_mult_full f_full) {
-  printf("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
 int MG_solver_nd(spinor *const up_new, spinor *const dn_new, spinor *const up_old,
                  spinor *const dn_old, const double precision, const int max_iter,
                  const int rel_prec, const int N, su3 **gf, matrix_mult_nd f) {
-  printf("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n");
+  printf("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n");
   exit(1);
 }
 
@@ -207,7 +207,7 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -215,7 +215,7 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
            differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
@@ -257,7 +257,7 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -265,7 +265,7 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
            differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
@@ -304,7 +304,7 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
             "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
             "restart... \n");
         printf(
-            "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > "
+            "ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > "
             "%e \n",
             differ[0], differ[1], differ[0] / differ[1], precision[i]);
       }
@@ -313,7 +313,7 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
     }
 
     if (g_debug_level > 0 && g_proc_id == 0)
-      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
              differ[0], differ[1], differ[0] / differ[1]);
   }
 
@@ -343,7 +343,7 @@ static int MG_pre_solve(su3 **gf) {
   if (mg_initialized == 0) {
     MG_init();
     mg_initialized = 1;
-    if (g_proc_id == 0) printf("DDalphaAMG initialized\n");
+    if (g_proc_id == 0) printf("DDalphaAMG initialized\n");
     MPI_Barrier(MPI_COMM_WORLD);
   }
 
@@ -351,23 +351,23 @@ static int MG_pre_solve(su3 **gf) {
     DDalphaAMG_set_configuration((double *)&(gf[0][0]), &mg_status);
     mg_update_gauge = 0;
     if (mg_status.success && g_proc_id == 0)
-      printf("DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
+      printf("DDalphaAMG cnfg set, plaquette %e\n", mg_status.info);
     else if (g_proc_id == 0)
       printf("ERROR: configuration updating did not run correctly");
   }
 
   if (mg_do_setup == 1) {
     if (mg_setup_mu_set) {
-      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0);
     } else
       MG_update_mu(g_mu, 0);
-    if (g_proc_id == 0) printf("DDalphaAMG running setup\n");
+    if (g_proc_id == 0) printf("DDalphaAMG running setup\n");
     DDalphaAMG_setup(&mg_status);
     mg_do_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
              100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup procedure did not run correctly");
@@ -375,16 +375,16 @@ static int MG_pre_solve(su3 **gf) {
 
   if (mg_update_setup > 0) {
     if (mg_setup_mu_set) {
-      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+      if (g_proc_id == 0) printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0);
     } else
       MG_update_mu(g_mu, 0);
-    if (g_proc_id == 0) printf("DDalphaAMG updating setup\n");
+    if (g_proc_id == 0) printf("DDalphaAMG updating setup\n");
     DDalphaAMG_update_setup(mg_update_setup, &mg_status);
     mg_update_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
+      printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
              100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup updating did not run correctly");
@@ -395,7 +395,7 @@ static int MG_pre_solve(su3 **gf) {
 
 static int MG_solve(spinor *const phi_new, spinor *const phi_old, const double precision,
                     const int N, matrix_mult f) {
-  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m
   double mg_scale = 0.5 / g_kappa;
   double *old = (double *)phi_old;
@@ -529,7 +529,7 @@ static int MG_solve(spinor *const phi_new, spinor *const phi_old, const double p
 
 static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spinor *const dn_old,
                        const double precision, const int N, matrix_mult_nd f) {
-  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
   double mg_scale = 0.5 / g_kappa / phmc_invmaxev;
   double sqnorm;
@@ -803,7 +803,7 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
                                          // 0 and shift
              f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
              f == Qsw_pm_ndpsi_shift) {  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
-    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
     // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
     if (init_guess) {
       mul_gamma5(old1, VOLUME);
@@ -900,7 +900,7 @@ static int MG_solve_nd(spinor *up_new, spinor *dn_new, spinor *const up_old, spi
 static int MG_mms_solve_nd(spinor **const up_new, spinor **const dn_new, spinor *const up_old,
                            spinor *const dn_old, const double *shifts, const int no_shifts,
                            double *precision, const int N, matrix_mult_nd f) {
-  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} ->
   // rescale by 1/4+m moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
   double mg_scale = 0.5 / g_kappa / phmc_invmaxev;
   double *old1 = (double *)up_old;
@@ -1001,7 +1001,7 @@ static int MG_mms_solve_nd(spinor **const up_new, spinor **const dn_new, spinor
                                          // 0 and shift
              f == Qsw_pm_ndpsi_shift) {  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
     mg_scale *= mg_scale;
-    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
     // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
     DDalphaAMG_solve_ms_doublet_squared_odd(new2, old2, new1, old1, mg_even_shifts, mg_odd_shifts,
                                             no_shifts, precision, &mg_status);
@@ -1110,7 +1110,7 @@ void MG_init() {
   mg_params.conf_index_fct = conf_index_fct;
   mg_params.vector_index_fct = vector_index_fct;
 
-  /* in DDalphaAMG
+  /* in DDalphaAMG
    * Printing level:
    *  -1: silent (errors or warnings)
    *   0: minimal //default
diff --git a/src/lib/DDalphaAMG_interface.h b/src/lib/DDalphaAMG_interface.h
index 96f59c31e..cc7ae1678 100644
--- a/src/lib/DDalphaAMG_interface.h
+++ b/src/lib/DDalphaAMG_interface.h
@@ -17,7 +17,7 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  *
- * Interface for DDalphaAMG
+ * Interface for DDalphaAMG
  *
  *******************************************************************************/
 
diff --git a/src/lib/buffers/utils_generic_exchange.blocking.inc b/src/lib/buffers/utils_generic_exchange.blocking.inc
index e6e5f975c..71b44900c 100644
--- a/src/lib/buffers/utils_generic_exchange.blocking.inc
+++ b/src/lib/buffers/utils_generic_exchange.blocking.inc
@@ -26,7 +26,7 @@
 		 g_cart_grid, &status);
   }
   
-#  if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#  if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv(buffer[0],              1, slice_X_gath_type, g_nb_x_dn, 93,
@@ -108,10 +108,10 @@
 		 1, edge_XT_cont_type, g_nb_t_dn, 98,
 		 g_cart_grid, &status);
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #  endif
 
-#  if (defined PARALLELXYT || defined PARALLELXYZT)
+#  if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv(buffer[0],                            1, slice_Y_gath_type, g_nb_y_dn, 103,
@@ -247,9 +247,9 @@
 		 g_cart_grid, &status);
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #  endif
-#  if defined PARALLELXYZT
+#  if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -454,4 +454,4 @@
 
   }
 
-#endif /* PARALLELXYZT */
+#endif /* TM_PARALLELXYZT */
diff --git a/src/lib/buffers/utils_generic_exchange.c b/src/lib/buffers/utils_generic_exchange.c
index c1c3c844a..474c738ad 100644
--- a/src/lib/buffers/utils_generic_exchange.c
+++ b/src/lib/buffers/utils_generic_exchange.c
@@ -4,13 +4,13 @@
 void generic_exchange(void *field_in, int bytes_per_site) {}
 #else /* MPI */
 void generic_exchange(void *field_in, int bytes_per_site) {
-#if defined _NON_BLOCKING
+#if defined TM_NON_BLOCKING
   int cntr = 0;
   MPI_Request request[108];
   MPI_Status status[108];
-#else  /* _NON_BLOCKING */
+#else  /* TM_NON_BLOCKING */
   MPI_Status status;
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
   static int initialized = 0;
 
   /* We start by defining all the MPI datatypes required */
@@ -125,11 +125,11 @@ void generic_exchange(void *field_in, int bytes_per_site) {
   }
 
   /* Following are implementations using different compile time flags */
-#if defined _NON_BLOCKING
+#if defined TM_NON_BLOCKING
 #include "utils_generic_exchange.nonblocking.inc"
-#else  /* _NON_BLOCKING */
+#else  /* TM_NON_BLOCKING */
 #include "utils_generic_exchange.blocking.inc"
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
 }
 
 #endif /* MPI */
diff --git a/src/lib/buffers/utils_generic_exchange.nonblocking.inc b/src/lib/buffers/utils_generic_exchange.nonblocking.inc
index 0789a490f..71409008f 100644
--- a/src/lib/buffers/utils_generic_exchange.nonblocking.inc
+++ b/src/lib/buffers/utils_generic_exchange.nonblocking.inc
@@ -32,7 +32,7 @@
     cntr=cntr+2;
   }
   
-#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend(buffer[0],              1, slice_X_gath_type, g_nb_x_dn, 87,
@@ -71,7 +71,7 @@
 #    endif
   MPI_Waitall(cntr, request, status);
   cntr=0;
-#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* The edges */
 
   /* send the data to the neighbour on the left in t direction */
@@ -137,10 +137,10 @@
 	      g_cart_grid, &request[cntr+1]);
     cntr=cntr+2;
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #    endif
 
-#    if (defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend(buffer[0],                            1, slice_Y_gath_type, g_nb_y_dn, 106,
@@ -177,7 +177,7 @@
 #    endif
   MPI_Waitall(cntr, request, status);
   cntr=0;
-#    if (defined PARALLELXYT || defined PARALLELXYZT)
+#    if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* jetzt wirds richtig eklig ... */
 
@@ -312,9 +312,9 @@
     cntr=cntr+2;
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #    endif
-#    if defined PARALLELXYZT
+#    if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -359,7 +359,7 @@
   }
 #    endif
   MPI_Waitall(cntr, request, status);
-#    if defined PARALLELXYZT
+#    if defined TM_PARALLELXYZT
   cntr=0;
   /* edges */
 
diff --git a/src/lib/deriv_Sb.c b/src/lib/deriv_Sb.c
index 4303c80d5..7b55eb170 100644
--- a/src/lib/deriv_Sb.c
+++ b/src/lib/deriv_Sb.c
@@ -56,7 +56,7 @@
 void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field_t* const hf,
               const double factor) {
   tm_stopwatch_push(&g_timers, __func__, "");
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(hf->gaugefield);
   }
@@ -85,7 +85,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
 #undef static
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(derivSb)
 #endif
 
@@ -114,7 +114,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = &g_gauge_field_copy[icx][0];
 #else
     up = &hf->gaugefield[ix][0];
@@ -136,7 +136,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][0];
@@ -159,7 +159,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][1];
@@ -181,7 +181,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][1];
@@ -203,7 +203,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][2];
@@ -225,7 +225,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][2];
@@ -247,7 +247,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][3];
@@ -269,7 +269,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR)
+#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][3];
@@ -292,7 +292,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
   } /* OpenMP closing brace */
 #endif
   tm_stopwatch_pop(&g_timers, 0, 1, "");
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(derivSb)
 #endif
 }
diff --git a/src/lib/deriv_Sb_D_psi.c b/src/lib/deriv_Sb_D_psi.c
index 6ba15d490..61da4b9d2 100644
--- a/src/lib/deriv_Sb_D_psi.c
+++ b/src/lib/deriv_Sb_D_psi.c
@@ -63,7 +63,7 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 #undef static
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(derivSb)
 #endif
 
@@ -225,7 +225,7 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 
       /****************** end of loop ************************/
     }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(derivSb)
 #endif
 
diff --git a/fixed_volume.h.in b/src/lib/fixed_volume.h.in
similarity index 100%
rename from fixed_volume.h.in
rename to src/lib/fixed_volume.h.in
diff --git a/src/lib/geometry_eo.c b/src/lib/geometry_eo.c
index 8622131e9..ceb348e1a 100644
--- a/src/lib/geometry_eo.c
+++ b/src/lib/geometry_eo.c
@@ -45,7 +45,7 @@
 
 void Hopping_Matrix_Indices(void);
 
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ))
 
 /* This is the version of the function Index  introduced for Aurora-like parallelizations (mainly
  * xyz)  */
@@ -72,7 +72,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     ix = VOLUME + T * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
   }
 
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   /* y-Rand */
   if (x2 == LY) {
     ix = VOLUME + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
@@ -97,9 +97,9 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUME + RAND + 3 * T * LZ + y0 * LZ + y3;
     }
   }
-#endif /* endif of PARALLELXY  || PARALLELXYZ */
+#endif /* endif of TM_PARALLELXY  || TM_PARALLELXYZ */
 
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
   /* z-Rand */
   if (x3 == LZ) {
     ix = VOLUME + 2 * T * LY * LZ + 2 * T * LX * LZ + y0 * LX * LY + y1 * LY + y2;
@@ -142,7 +142,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXYZ */
+#endif /* endif of TM_PARALLELXYZ */
 
   /* The DBW2 stuff --> second boundary slice */
   /* This we put a the very end.              */
@@ -150,44 +150,44 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
   /* x2-rand+ */
   if (x1 == LX + 1) {
     ix = VOLUMEPLUSRAND + y0 * LY * LZ + y2 * LZ + y3;
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     /* x2y */
     if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 1 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXY || PARALLELXYZ  */
-#if defined PARALLELXYZ
+#endif /* endif of TM_PARALLELXY || TM_PARALLELXYZ  */
+#if defined TM_PARALLELXYZ
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 4 * T * LY + y0 * LY + y2;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 5 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
   }
   /* x2-rand- */
   if (x1 == -2) {
     ix = VOLUMEPLUSRAND + T * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     /* x2y */
     if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 2 * T * LZ + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 3 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXY || PARALLELXYZ  */
-#if defined PARALLELXYZ
+#endif /* endif of TM_PARALLELXY || TM_PARALLELXYZ  */
+#if defined TM_PARALLELXYZ
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 6 * T * LY + y0 * LY + y2;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 7 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of  PARALLELXYZ  */
+#endif /* endif of  TM_PARALLELXYZ  */
   }
-#if (defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   /* y2-rand+ */
   if (x2 == LY + 1) {
     ix = VOLUMEPLUSRAND + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
@@ -197,14 +197,14 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 6 * T * LZ + y0 * LZ + y3;
     }
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 4 * T * LX + y0 * LX + y1;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 5 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
   }
   /* y2-rand- */
   if (x2 == -2) {
@@ -215,17 +215,17 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 7 * T * LZ + y0 * LZ + y3;
     }
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 6 * T * LX + y0 * LX + y1;
     } else if (x3 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 7 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
   }
-#endif /* endif of PARALLELXY || PARALLELXYZ  */
-#if defined PARALLELXYZ
+#endif /* endif of TM_PARALLELXY || TM_PARALLELXYZ  */
+#if defined TM_PARALLELXYZ
   /* z2-rand+ */
   if (x3 == LZ + 1) {
     ix = VOLUMEPLUSRAND + 2 * T * LY * LZ + 2 * T * LX * LZ + y0 * LX * LY + y1 * LY + y2;
@@ -259,12 +259,12 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * T * LZ + 8 * T * LY + 3 * T * LX + y0 * LX + y1;
     }
   }
-#endif /* endif of PARALLELXYZ  */
+#endif /* endif of TM_PARALLELXYZ  */
 
   return (ix);
 }
 
-#else /* original version of Index(): used for no parallelization  or PARALLEL*T */
+#else /* original version of Index(): used for no parallelization  or TM_PARALLEL*T */
 
 int Index(const int x0, const int x1, const int x2, const int x3) {
   int y0, y1, y2, y3, ix;
@@ -274,7 +274,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
   y3 = (x3 + LZ) % LZ;
   ix = ((y0 * LX + y1) * LY + y2) * LZ + y3;
 
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
   if (x0 == T) {
     ix = VOLUME + y3 + LZ * y2 + LZ * LY * y1;
   }
@@ -283,7 +283,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     ix = VOLUME + LX * LY * LZ + y3 + LZ * y2 + LZ * LY * y1;
   }
 #endif
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
   if (x1 == LX) {
     ix = VOLUME + 2 * LX * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
   }
@@ -309,9 +309,9 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT */
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT */
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* y-Rand */
   if (x2 == LY) {
     ix = VOLUME + 2 * LX * LY * LZ + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
@@ -358,8 +358,8 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXYT  || PARALLELXYZT */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT  || TM_PARALLELXYZT */
+#if defined TM_PARALLELXYZT
   /* z-Rand */
   if (x3 == LZ) {
     ix =
@@ -429,30 +429,30 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     }
   }
 
-#endif /* endif of PARALLELXYZT */
+#endif /* endif of TM_PARALLELXYZT */
 
   /* The DBW2 stuff --> second boundary slice */
   /* This we put a the very end.              */
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
   if (x0 == T + 1) {
     ix = VOLUMEPLUSRAND + y3 + LZ * y2 + LZ * LY * y1;
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     /* t2x */
     if (x1 == LX) {
       ix = VOLUMEPLUSRAND + RAND + y2 * LZ + y3;
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 1 * LY * LZ + y2 * LZ + y3;
     }
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* t2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + y1 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 2 * LX * LZ + y1 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* t2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + y1 * LY + y2;
@@ -460,28 +460,28 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 2 * LX * LY + y1 * LY +
            y2;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
   /* the slice at time -2 is put behind the one at time T+1 */
   else if (x0 == -2) {
     ix = VOLUMEPLUSRAND + LX * LY * LZ + y3 + LZ * y2 + LZ * LY * y1;
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     /* t2x */
     if (x1 == LX) {
       ix = VOLUMEPLUSRAND + RAND + 2 * LY * LZ + y2 * LZ + y3;
     } else if (x1 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 3 * LY * LZ + y2 * LZ + y3;
     }
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* t2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + LX * LZ + y1 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 3 * LX * LZ + y1 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* t2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + LX * LY + y1 * LY + y2;
@@ -489,10 +489,10 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 3 * LX * LY + y1 * LY +
            y2;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
-#endif /* endif of PARALLELT || PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELT || TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || defined TM_PARALLELXYZT)
   if (x1 == LX + 1) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
     /* x2t */
@@ -501,15 +501,15 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 6 * LY * LZ + y2 * LZ + y3;
     }
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* x2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 1 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -518,7 +518,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            5 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
   if (x1 == -2) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + T * LY * LZ + y0 * LY * LZ + y2 * LZ + y3;
@@ -528,15 +528,15 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 7 * LY * LZ + y2 * LZ + y3;
     }
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* x2y */
     else if (x2 == LY) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 2 * T * LZ + y0 * LZ + y3;
     } else if (x2 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 3 * T * LZ + y0 * LZ + y3;
     }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
     /* x2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -545,10 +545,10 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            7 * T * LY + y0 * LY + y2;
     }
-#endif /* endif of  PARALLELXYZT  */
+#endif /* endif of  TM_PARALLELXYZT  */
   }
-#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT  */
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#endif /* endif of TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   if (x2 == LY + 1) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LY * LZ + y0 * LX * LZ + y1 * LZ + y3;
     /* y2x */
@@ -563,7 +563,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 5 * LX * LZ + y1 * LZ + y3;
     }
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -572,7 +572,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            8 * T * LY + 5 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
   if (x2 == -2) {
     ix = VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LY * LZ + T * LX * LZ + y0 * LX * LZ +
@@ -589,7 +589,7 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
     } else if (x0 == -1) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 7 * LX * LZ + y1 * LZ + y3;
     }
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     /* y2z */
     else if (x3 == LZ) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
@@ -598,10 +598,10 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
       ix = VOLUMEPLUSRAND + RAND + 8 * LY * LZ + 8 * T * LZ + 8 * LX * LZ + 8 * LX * LY +
            8 * T * LY + 7 * T * LX + y0 * LX + y1;
     }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   }
-#endif /* endif of PARALLELXYT || PARALLELXYZT  */
-#if defined PARALLELXYZT
+#endif /* endif of TM_PARALLELXYT || TM_PARALLELXYZT  */
+#if defined TM_PARALLELXYZT
   /* z2-Rand */
   if (x3 == LZ + 1) {
     if ((x0 < T) && (x0 > -1) && (x1 < LX) && (x1 > -1) && (x2 > -1) && (x2 < LY)) {
@@ -663,14 +663,14 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
            8 * T * LY + 3 * T * LX + y0 * LX + y1;
     }
   }
-#endif /* endif of PARALLELXYZT  */
+#endif /* endif of TM_PARALLELXYZT  */
   /*   if(ix == 372) { */
   /*     printf("## %d %d %d %d ix = %d, %d %d %d %d\n", x0, x1, x2, x3, ix, T, LX, LY, LZ); */
   /*   } */
   return (ix);
 }
 
-#endif /* PARALLEL???  */
+#endif /* TM_PARALLEL???  */
 
 void geometry() {
   int x0, x1, x2, x3, ix;
@@ -685,17 +685,17 @@ void geometry() {
 
   xeven = malloc(VOLUMEPLUSRAND * sizeof(int));
 
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   startvaluet = 1;
 #endif
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || \
-     defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
+     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   startvaluex = 1;
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   startvaluey = 1;
 #endif
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   startvaluez = 1;
 #endif
 
@@ -795,7 +795,7 @@ void geometry() {
     }
   }
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   ix = 0;
   for (x0 = 0; x0 < T; x0++) {
     for (x1 = 0; x1 < LX; x1++) {
@@ -852,7 +852,7 @@ void geometry() {
   }
 
 
-#endif /* PARALLELXYZ || PARALLELXYZT*/
+#endif /* TM_PARALLELXYZ || TM_PARALLELXYZT*/
 
   /* The rectangular gauge action part */
   /* Everything is stored behind VOLUMEPLUSRAND-1 !*/
@@ -861,7 +861,7 @@ void geometry() {
       printf("# Initialising rectangular gauge action stuff\n");
       fflush(stdout);
     }
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -910,8 +910,8 @@ void geometry() {
       }
     }
 #endif
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || \
-     defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
+     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -959,7 +959,7 @@ void geometry() {
       }
     }
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -1007,7 +1007,7 @@ void geometry() {
       }
     }
 #endif
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
         for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
diff --git a/src/lib/get_rectangle_staples.c b/src/lib/get_rectangle_staples.c
index eb2a7db9f..eab6b9d9e 100644
--- a/src/lib/get_rectangle_staples.c
+++ b/src/lib/get_rectangle_staples.c
@@ -34,7 +34,7 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
                                    const su3 *const *const gf) {
   su3 ALIGN tmp1, tmp2;
   const su3 *a, *b, *c, *d, *e;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(rectstaples)
 #endif
   _su3_zero((*v));
@@ -178,7 +178,7 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
       _su3_times_su3_acc((*v), tmp2, tmp1);
     }
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(rectstaples)
 #endif
 }
diff --git a/src/lib/get_staples.c b/src/lib/get_staples.c
index e80648382..b33010f2c 100644
--- a/src/lib/get_staples.c
+++ b/src/lib/get_staples.c
@@ -35,7 +35,7 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(staples)
 #endif
 
@@ -61,7 +61,7 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(staples)
 #endif
 }
@@ -72,7 +72,7 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(staples)
 #endif
 
@@ -98,7 +98,7 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(staples)
 #endif
 }
@@ -109,7 +109,7 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(staples)
 #endif
 
@@ -134,7 +134,7 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
     /* v = v + w1^d * st */
     _su3d_times_su3_acc(*staple, *w1, st);
   }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(staples)
 #endif
 }
diff --git a/src/lib/gettime.c b/src/lib/gettime.c
index 68c123ae4..adae6dcb3 100644
--- a/src/lib/gettime.c
+++ b/src/lib/gettime.c
@@ -21,7 +21,7 @@
 #ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
 #endif
-#ifdef HAVE_CLOCK_GETTIME
+#ifdef TM_CLOCK_GETTIME
 #ifndef _POSIX_C_SOURCE
 #define _POSIX_C_SOURCE 199309L
 #endif
@@ -45,7 +45,7 @@ double gettime(void) {
 
   t = MPI_Wtime();
 
-#elif (defined HAVE_CLOCK_GETTIME)
+#elif (defined TM_CLOCK_GETTIME)
 
   struct timespec ts;
 
diff --git a/src/lib/git_hash.h b/src/lib/git_hash.h
new file mode 100644
index 000000000..a3a22b48d
--- /dev/null
+++ b/src/lib/git_hash.h
@@ -0,0 +1,6 @@
+#ifndef TM_GIT_HASH_H
+#define TM_GIT_HASH_H
+
+extern const char git_hash[];
+
+#endif
diff --git a/src/lib/global.h b/src/lib/global.h
index 1fc644d3e..b0d3b1ac2 100644
--- a/src/lib/global.h
+++ b/src/lib/global.h
@@ -38,7 +38,7 @@
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef FIXEDVOLUME
+#ifdef TM_FIXEDVOLUME
 #include "fixed_volume.h"
 #endif
 #include "su3.h"
@@ -79,7 +79,7 @@ EXTERN tm_mpi_thread_level_t g_mpi_thread_level;
 EXTERN tm_timers_t g_timers;
 
 EXTERN int T_global;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
 EXTERN int T, L, LX, LY, LZ, VOLUME;
 EXTERN int N_PROC_T, N_PROC_X, N_PROC_Y, N_PROC_Z;
 EXTERN int RAND, EDGES, VOLUMEPLUSRAND;
@@ -130,7 +130,7 @@ EXTERN int g_running_phmc;
 
 EXTERN su3 **g_gauge_field;
 EXTERN su3_32 **g_gauge_field_32;
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
 EXTERN su3 ***g_gauge_field_copy;
 EXTERN su3_32 ***g_gauge_field_copy_32;
 #else
diff --git a/src/lib/init/init_dirac_halfspinor.c b/src/lib/init/init_dirac_halfspinor.c
index f5939d9cc..891a703e2 100644
--- a/src/lib/init/init_dirac_halfspinor.c
+++ b/src/lib/init/init_dirac_halfspinor.c
@@ -94,7 +94,7 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer[ieo][8 * i] = &sendBuffer[k];
@@ -104,8 +104,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 1] = &sendBuffer[k];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == 0) {
         k = (g_lexic2eosub[g_idn[j][1]] - VOLUME / 2);
         NBPointer[ieo][8 * i + 2] = &sendBuffer[k];
@@ -115,8 +115,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 3] = &sendBuffer[k];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == 0) {
         k = (g_lexic2eosub[g_idn[j][2]] - VOLUME / 2);
         NBPointer[ieo][8 * i + 4] = &sendBuffer[k];
@@ -126,7 +126,7 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 5] = &sendBuffer[k];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == 0) {
         k = (g_lexic2eosub[g_idn[j][3]] - VOLUME / 2);
         NBPointer[ieo][8 * i + 6] = &sendBuffer[k];
@@ -154,7 +154,7 @@ int init_dirac_halfspinor() {
       for (int mu = 0; mu < 8; mu++) {
         NBPointer[ieo][8 * i + mu] = &HalfSpinor[8 * i + mu];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer[ieo][8 * i] = &recvBuffer[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
@@ -162,8 +162,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 1] = &recvBuffer[(g_lexic2eosub[g_idn[j][0]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == LX - 1) {
         NBPointer[ieo][8 * i + 2] = &recvBuffer[(g_lexic2eosub[g_iup[j][1]] - VOLUME / 2)];
       }
@@ -171,8 +171,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 3] = &recvBuffer[(g_lexic2eosub[g_idn[j][1]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == LY - 1) {
         NBPointer[ieo][8 * i + 4] = &recvBuffer[(g_lexic2eosub[g_iup[j][2]] - VOLUME / 2)];
       }
@@ -180,7 +180,7 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 5] = &recvBuffer[(g_lexic2eosub[g_idn[j][2]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == LZ - 1) {
         NBPointer[ieo][8 * i + 6] = &recvBuffer[(g_lexic2eosub[g_iup[j][3]] - VOLUME / 2)];
       }
@@ -240,7 +240,7 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor32[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer32[ieo][8 * i] = &sendBuffer32[k];
@@ -250,8 +250,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 1] = &sendBuffer32[k];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == 0) {
         k = (g_lexic2eosub[g_idn[j][1]] - VOLUME / 2);
         NBPointer32[ieo][8 * i + 2] = &sendBuffer32[k];
@@ -261,8 +261,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 3] = &sendBuffer32[k];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == 0) {
         k = (g_lexic2eosub[g_idn[j][2]] - VOLUME / 2);
         NBPointer32[ieo][8 * i + 4] = &sendBuffer32[k];
@@ -272,7 +272,7 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 5] = &sendBuffer32[k];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == 0) {
         k = (g_lexic2eosub[g_idn[j][3]] - VOLUME / 2);
         NBPointer32[ieo][8 * i + 6] = &sendBuffer32[k];
@@ -300,7 +300,7 @@ int init_dirac_halfspinor32() {
       for (mu = 0; mu < 8; mu++) {
         NBPointer32[ieo][8 * i + mu] = &HalfSpinor32[8 * i + mu];
       }
-#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer32[ieo][8 * i] = &recvBuffer32[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
@@ -308,8 +308,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 1] = &recvBuffer32[(g_lexic2eosub[g_idn[j][0]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || \
-     (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELX) || (defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || \
+     (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
       if (x == LX - 1) {
         NBPointer32[ieo][8 * i + 2] = &recvBuffer32[(g_lexic2eosub[g_iup[j][1]] - VOLUME / 2)];
       }
@@ -317,8 +317,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 3] = &recvBuffer32[(g_lexic2eosub[g_idn[j][1]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || \
-     (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXY) || (defined TM_PARALLELXYZ) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (y == LY - 1) {
         NBPointer32[ieo][8 * i + 4] = &recvBuffer32[(g_lexic2eosub[g_iup[j][2]] - VOLUME / 2)];
       }
@@ -326,7 +326,7 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 5] = &recvBuffer32[(g_lexic2eosub[g_idn[j][2]] - VOLUME / 2)];
       }
 #endif
-#if ((defined PARALLELXYZ) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXYZ) || (defined TM_PARALLELXYZT))
       if (z == LZ - 1) {
         NBPointer32[ieo][8 * i + 6] = &recvBuffer32[(g_lexic2eosub[g_iup[j][3]] - VOLUME / 2)];
       }
diff --git a/src/lib/init/init_gauge_field.c b/src/lib/init/init_gauge_field.c
index e30e040bf..1ad4463a8 100644
--- a/src/lib/init/init_gauge_field.c
+++ b/src/lib/init/init_gauge_field.c
@@ -54,7 +54,7 @@ int init_gauge_field(const int V, const int back) {
     g_gauge_field[i] = g_gauge_field[i - 1] + 4;
   }
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
   if (back == 1) {
     /*
       g_gauge_field_copy[ieo][PM][sites/2][mu]
@@ -134,7 +134,7 @@ int init_gauge_field_32(const int V, const int back) {
     g_gauge_field_32[i] = g_gauge_field_32[i - 1] + 4;
   }
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
   if (back == 1) {
     /*
       g_gauge_field_copy[ieo][PM][sites/2][mu]
@@ -167,7 +167,7 @@ int init_gauge_field_32(const int V, const int back) {
       g_gauge_field_copy_32[1][i] = g_gauge_field_copy_32[1][i - 1] + 4;
     }
   }
-#else /* than _USE_HALFSPINOR  */
+#else /* not TM_USE_HALFSPINOR */
   if (back == 1) {
     if ((void*)(g_gauge_field_copy_32 = (su3_32**)calloc((VOLUME + RAND), sizeof(su3_32*))) ==
         NULL) {
@@ -217,7 +217,7 @@ void convert_32_gauge_field(su3_32** gf32, su3** gf, int V) {
       gf32[i][mu].c22 = (_Complex float)gf[i][mu].c22;
     }
   }
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 
 #endif
 }
diff --git a/src/lib/init/init_geometry_indices.c b/src/lib/init/init_geometry_indices.c
index ef54c45de..6b75fc83a 100644
--- a/src/lib/init/init_geometry_indices.c
+++ b/src/lib/init/init_geometry_indices.c
@@ -58,7 +58,7 @@ int init_geometry_indices(const int V) {
   g_eo2lexic = (int *)calloc(V, sizeof(int));
   if ((void *)g_eo2lexic == NULL) return (11);
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   g_field_z_ipt_even = (int *)calloc(T * LX * LY, sizeof(int));
   if ((void *)g_field_z_ipt_even == NULL) return (12);
   g_field_z_ipt_odd = (int *)calloc(T * LX * LY, sizeof(int));
@@ -136,7 +136,7 @@ void free_geometry_indices() {
   free(g_eo2lexic);
   free(g_lexic2eosub);
   free(g_lexic2eo);
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   free(g_field_z_ipt_odd);
   free(g_field_z_ipt_even);
 #endif
diff --git a/src/lib/init/init_parallel.h b/src/lib/init/init_parallel.h
index f88ebe1b4..553da6765 100644
--- a/src/lib/init/init_parallel.h
+++ b/src/lib/init/init_parallel.h
@@ -19,8 +19,8 @@
  *
  *******************************************************************************/
 
-#ifndef _INIT_PARALLEL_H
-#define _INIT_PARALLEL_H
+#ifndef _INIT_TM_PARALLEL_H
+#define _INIT_TM_PARALLEL_H
 
 void init_parallel_and_read_input(int argc, char *argv[], const char input_filename[]);
 
diff --git a/src/lib/init/init_spinor_field.c b/src/lib/init/init_spinor_field.c
index c70945634..6fea95cd8 100644
--- a/src/lib/init/init_spinor_field.c
+++ b/src/lib/init/init_spinor_field.c
@@ -23,7 +23,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 #include "global.h"
@@ -37,7 +37,7 @@ spinor *sp_tbuff = NULL;
 int init_spinor_field(const int V, const int nr) {
   int i = 0;
 
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void *)(sp = (spinor *)shmalloc((nr * V + 1) * sizeof(spinor))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
@@ -65,7 +65,7 @@ int init_spinor_field(const int V, const int nr) {
 }
 
 void free_spinor_field() {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   shfree(sp);
   shfree(sp_csg);
 #else
@@ -78,7 +78,7 @@ spinor32 *sp32 = NULL;
 int init_spinor_field_32(const int V, const int nr) {
   int i = 0;
 
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void *)(sp32 = (spinor32 *)shmalloc((nr * V + 1) * sizeof(spinor32))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
@@ -106,7 +106,7 @@ int init_spinor_field_32(const int V, const int nr) {
 }
 
 void free_spinor_field_32() {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   shfree(sp32);
 #else
   free(sp32);
@@ -119,7 +119,7 @@ void free_spinor_field_32() {
 int allocate_spinor_field_array(spinor ***spinors, spinor **sp, const int V, const int nr) {
   int i = 0;
 
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void *)((*sp) = (spinor *)shmalloc((nr * V + 1) * sizeof(spinor))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
@@ -147,7 +147,7 @@ int allocate_spinor_field_array(spinor ***spinors, spinor **sp, const int V, con
 }
 
 void free_spinor_field_array(spinor **sp) {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   shfree(*sp);
 #else
   free(*sp);
@@ -165,7 +165,7 @@ int init_csg_field(const int V) {
 
   /* if all histories are zero, we do not need initialisation */
   if (sum != 0) {
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
     sp_csg = (spinor *)shmalloc((sum * V + 1) * sizeof(spinor));
 #else
     sp_csg = (spinor *)calloc(sum * V + 1, sizeof(spinor));
diff --git a/src/lib/invert_clover_eo.c b/src/lib/invert_clover_eo.c
index e3b6cad31..63e512819 100644
--- a/src/lib/invert_clover_eo.c
+++ b/src/lib/invert_clover_eo.c
@@ -53,7 +53,7 @@
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
@@ -81,7 +81,7 @@ int invert_clover_eo(spinor* const Even_new, spinor* const Odd_new, spinor* cons
     }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if (solver_flag == MG) {
       return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME / 2,
                           gf[0], &Msw_full);
@@ -197,7 +197,7 @@ int invert_clover_eo(spinor* const Even_new, spinor* const Odd_new, spinor* cons
                     rel_prec, VOLUME, Qsq);
       Qm(g_spinor_field[DUM_DERI + 1], g_spinor_field[DUM_DERI]);
     }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     else if (solver_flag == MG) {
       return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME / 2,
                           gf[0], &Msw_full);
diff --git a/src/lib/invert_doublet_eo.c b/src/lib/invert_doublet_eo.c
index 5be48415e..8d5a7dd82 100644
--- a/src/lib/invert_doublet_eo.c
+++ b/src/lib/invert_doublet_eo.c
@@ -50,7 +50,7 @@
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
@@ -75,7 +75,7 @@ int invert_doublet_eo(spinor* const Even_new_s, spinor* const Odd_new_s, spinor*
   }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   if (solver_flag == MG) {
     return MG_solver_nd_eo(Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, Even_s, Odd_s, Even_c,
                            Odd_c, precision, max_iter, rel_prec, VOLUME / 2, g_gauge_field,
@@ -162,7 +162,7 @@ int invert_cloverdoublet_eo(spinor* const Even_new_s, spinor* const Odd_new_s,
   }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   if (solver_flag == MG) {
     return MG_solver_nd_eo(Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, Even_s, Odd_s, Even_c,
                            Odd_c, precision, max_iter, rel_prec, VOLUME / 2, g_gauge_field,
diff --git a/src/lib/invert_eo.c b/src/lib/invert_eo.c
index 997cab021..3b7625d48 100644
--- a/src/lib/invert_eo.c
+++ b/src/lib/invert_eo.c
@@ -61,7 +61,7 @@
 #ifdef TM_USE_QPHIX
 #include "qphix_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -84,7 +84,7 @@ int invert_eo(spinor *const Even_new, spinor *const Odd_new, spinor *const Even,
   }
 #endif
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   if (solver_flag == MG)
     return MG_solver_eo(Even_new, Odd_new, Even, Odd, precision, max_iter, rel_prec, VOLUME / 2,
                         g_gauge_field, &M_full);
diff --git a/src/lib/io/gauge_read.c b/src/lib/io/gauge_read.c
index b7be10928..de53d9c28 100644
--- a/src/lib/io/gauge_read.c
+++ b/src/lib/io/gauge_read.c
@@ -19,7 +19,7 @@
  ***********************************************************************/
 
 #include "gauge.ih"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -209,7 +209,7 @@ int read_gauge_field(char *filename, su3 **const gf) {
     // reading a new gauge configuration moves the gauge_id a long way
     // to guarantee that the change is propagated
     update_tm_gauge_id(&g_gauge_state, TM_GAUGE_PROPAGATE_THRESHOLD);
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     MG_reset();
 #endif
   }
diff --git a/src/lib/io/gauge_read_binary.c b/src/lib/io/gauge_read_binary.c
index b61284cab..473e4d9c7 100644
--- a/src/lib/io/gauge_read_binary.c
+++ b/src/lib/io/gauge_read_binary.c
@@ -22,7 +22,7 @@
 /* FIXME I will first fix this function by using referral.
          Probably should be done better in the future. AD. */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int read_binary_gauge_data(LemonReader* lemonreader, DML_Checksum* checksum,
                            paramsIldgFormat* input, su3** const gf) {
   int t, x, y, z, status = 0;
@@ -144,7 +144,7 @@ int read_binary_gauge_data(LemonReader* lemonreader, DML_Checksum* checksum,
   free(filebuffer);
   return (0);
 }
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int read_binary_gauge_data(LimeReader *limereader, DML_Checksum *checksum, paramsIldgFormat *input,
                            su3 **const gf) {
   int t, x, y, z, status = 0;
@@ -273,4 +273,4 @@ int read_binary_gauge_data(LimeReader *limereader, DML_Checksum *checksum, param
 #endif
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/gauge_write_binary.c b/src/lib/io/gauge_write_binary.c
index 668b53a17..ad3c7882e 100644
--- a/src/lib/io/gauge_write_binary.c
+++ b/src/lib/io/gauge_write_binary.c
@@ -22,7 +22,7 @@
 /* FIXME I will first fix this function by using referral.
          Probably should be done better in the future. AD. */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int write_binary_gauge_data(LemonWriter* lemonwriter, const int prec, DML_Checksum* checksum) {
   int x, xG, y, yG, z, zG, t, tG, status = 0;
   su3 tmp3[4];
@@ -133,7 +133,7 @@ int write_binary_gauge_data(LemonWriter* lemonwriter, const int prec, DML_Checks
   return 0;
 }
 
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 
 int write_binary_gauge_data(LimeWriter* limewriter, const int prec, DML_Checksum* checksum) {
   int x, X, y, Y, z, Z, tt, t0, tag = 0, id = 0, status = 0;
@@ -281,4 +281,4 @@ int write_binary_gauge_data(LimeWriter* limewriter, const int prec, DML_Checksum
 
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/selector.h b/src/lib/io/selector.h
index 806178bff..236be8d32 100644
--- a/src/lib/io/selector.h
+++ b/src/lib/io/selector.h
@@ -21,11 +21,11 @@
 #define _IO_SELECTOR_H
 
 #include <lime.h>
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #include <lemon.h>
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 #define LIME_FILE MPI_File
 #define WRITER LemonWriter
 #define READER LemonReader
@@ -42,7 +42,7 @@
 #define WriterCloseRecord lemonWriterCloseRecord
 #define DestroyReader lemonDestroyReader
 #define DestroyHeader lemonDestroyHeader
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 #define LIME_FILE FILE
 #define WRITER LimeWriter
 #define READER LimeReader
diff --git a/src/lib/io/spinor_read_binary.c b/src/lib/io/spinor_read_binary.c
index 6d459fd2c..81607a700 100644
--- a/src/lib/io/spinor_read_binary.c
+++ b/src/lib/io/spinor_read_binary.c
@@ -19,7 +19,7 @@
 
 #include "spinor.ih"
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int read_binary_spinor_data(spinor *const s, spinor *const r, LemonReader *lemonreader,
                             DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
@@ -126,7 +126,7 @@ int read_binary_spinor_data(spinor *const s, spinor *const r, LemonReader *lemon
   free(filebuffer);
   return 0;
 }
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int read_binary_spinor_data(spinor *const s, spinor *const r, LimeReader *limereader,
                             DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
@@ -212,9 +212,9 @@ int read_binary_spinor_data(spinor *const s, spinor *const r, LimeReader *limere
 #endif
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int read_binary_spinor_data_l(spinor *const s, LemonReader *lemonreader, DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
   int latticeSize[] = {T_global, g_nproc_x * LX, g_nproc_y * LY, g_nproc_z * LZ};
@@ -314,7 +314,7 @@ int read_binary_spinor_data_l(spinor *const s, LemonReader *lemonreader, DML_Che
   free(filebuffer);
   return 0;
 }
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int read_binary_spinor_data_l(spinor *const s, LimeReader *limereader, DML_Checksum *checksum) {
   int t, x, y, z, i = 0, status = 0;
   n_uint64_t bytes;
@@ -390,4 +390,4 @@ int read_binary_spinor_data_l(spinor *const s, LimeReader *limereader, DML_Check
 #endif
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/spinor_write_binary.c b/src/lib/io/spinor_write_binary.c
index a2bc0cd68..560b5ce65 100644
--- a/src/lib/io/spinor_write_binary.c
+++ b/src/lib/io/spinor_write_binary.c
@@ -19,7 +19,7 @@
 
 #include "spinor.ih"
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int write_binary_spinor_data(spinor *const s, spinor *const r, LemonWriter *lemonwriter,
                              DML_Checksum *checksum, int const prec) {
   int x, y, z, t, i = 0, xG, yG, zG, tG, status = 0;
@@ -124,7 +124,7 @@ int write_binary_spinor_data(spinor *const s, spinor *const r, LemonWriter *lemo
   return 0;
 }
 
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int write_binary_spinor_data(spinor *const s, spinor *const r, LimeWriter *limewriter,
                              DML_Checksum *checksum, const int prec) {
   int x, X, y, Y, z, Z, t, t0, tag = 0, id = 0, i = 0, status = 0;
@@ -272,9 +272,9 @@ int write_binary_spinor_data(spinor *const s, spinor *const r, LimeWriter *limew
   }
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
 int write_binary_spinor_data_l(spinor *const s, LemonWriter *lemonwriter, DML_Checksum *checksum,
                                int const prec) {
   int x, y, z, t, i = 0, xG, yG, zG, tG, status = 0;
@@ -374,7 +374,7 @@ int write_binary_spinor_data_l(spinor *const s, LemonWriter *lemonwriter, DML_Ch
   return 0;
 }
 
-#else /* HAVE_LIBLEMON */
+#else /* TM_USE_LEMON */
 int write_binary_spinor_data_l(spinor *const s, LimeWriter *limewriter, DML_Checksum *checksum,
                                const int prec) {
   int x, X, y, Y, z, Z, t, t0, tag = 0, id = 0, i = 0, status = 0;
@@ -514,4 +514,4 @@ int write_binary_spinor_data_l(spinor *const s, LimeWriter *limewriter, DML_Chec
   }
   return (0);
 }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
diff --git a/src/lib/io/spinor_write_propagator_type.c b/src/lib/io/spinor_write_propagator_type.c
index 67356b8f1..77eb17728 100644
--- a/src/lib/io/spinor_write_propagator_type.c
+++ b/src/lib/io/spinor_write_propagator_type.c
@@ -4,9 +4,9 @@ void write_propagator_type(WRITER *writer, const int type) {
   uint64_t bytes;
   char *message;
 
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
 
     message = (char *)malloc(128);
 
@@ -34,7 +34,7 @@ void write_propagator_type(WRITER *writer, const int type) {
 
     close_writer_record(writer);
     free(message);
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
 }
diff --git a/src/lib/io/spinor_write_source_format.c b/src/lib/io/spinor_write_source_format.c
index a501ae5d3..e6cf0e782 100644
--- a/src/lib/io/spinor_write_source_format.c
+++ b/src/lib/io/spinor_write_source_format.c
@@ -22,9 +22,9 @@
 void write_source_format(WRITER *writer, paramsSourceFormat const *format) {
   uint64_t bytes;
   char *buf = NULL;
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
     buf = (char *)malloc(512);
     sprintf(buf,
             "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
@@ -49,7 +49,7 @@ void write_source_format(WRITER *writer, paramsSourceFormat const *format) {
     close_writer_record(writer);
 
     free(buf);
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
 }
diff --git a/src/lib/io/utils_construct_reader.c b/src/lib/io/utils_construct_reader.c
index 2714455b2..832ede73d 100644
--- a/src/lib/io/utils_construct_reader.c
+++ b/src/lib/io/utils_construct_reader.c
@@ -7,22 +7,22 @@ void construct_reader(READER **reader, char *filename) {
   int status = 0;
 
   if (g_debug_level > 0 && g_cart_id == 0) {
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     printf("# Constructing LEMON reader for file %s ...\n", filename);
 #else
     printf("# Constructing LIME reader for file %s ...\n", filename);
 #endif
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   fh = (MPI_File *)malloc(sizeof(MPI_File));
   status = MPI_File_open(g_cart_grid, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, fh);
   status = (status == MPI_SUCCESS) ? 0 : 1;
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
   fh = fopen(filename, "r");
   status = (fh == NULL) ? 1 : 0;
   fflush(stderr);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
   if (status) {
     kill_with_error(fh, g_cart_id,
@@ -30,11 +30,11 @@ void construct_reader(READER **reader, char *filename) {
                     "rights.\nUnable to continue.\n");
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   *reader = lemonCreateReader(fh, g_cart_grid);
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
   *reader = limeCreateReader(fh);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
   if (*reader == (READER *)NULL) {
     kill_with_error(fh, g_cart_id, "\nCould not create reader, unable to continue.\n");
diff --git a/src/lib/io/utils_construct_writer.c b/src/lib/io/utils_construct_writer.c
index 4f13900fe..f2fe58bb7 100644
--- a/src/lib/io/utils_construct_writer.c
+++ b/src/lib/io/utils_construct_writer.c
@@ -4,14 +4,14 @@ void construct_writer(WRITER **writer, char *filename, const int append) {
   LIME_FILE *fh = NULL;
   int status = 0;
   if (g_debug_level > 0 && g_cart_id == 0) {
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     printf("# Constructing LEMON writer for file %s for append = %d\n", filename, append);
 #else
     printf("# Constructing LIME writer for file %s for append = %d\n", filename, append);
 #endif
   }
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   fh = (MPI_File *)malloc(sizeof(MPI_File));
   if (append) {
     status = MPI_File_open(g_cart_grid, filename,
@@ -24,7 +24,7 @@ void construct_writer(WRITER **writer, char *filename, const int append) {
   status = (status == MPI_SUCCESS) ? 0 : 1;
   *writer = lemonCreateWriter(fh, g_cart_grid);
   status = status || (writer == NULL);
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
   if (g_cart_id == 0) {
     if (append) {
       fh = fopen(filename, "a");
@@ -35,7 +35,7 @@ void construct_writer(WRITER **writer, char *filename, const int append) {
     *writer = limeCreateWriter(fh);
     status = status || (writer == NULL);
   }
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
   if (status) kill_with_error(fh, g_cart_id, "Failed to create writer. Aborting...\n");
 }
diff --git a/src/lib/io/utils_destruct_reader.c b/src/lib/io/utils_destruct_reader.c
index 4ee23d595..2ed391c47 100644
--- a/src/lib/io/utils_destruct_reader.c
+++ b/src/lib/io/utils_destruct_reader.c
@@ -5,10 +5,10 @@ void destruct_reader(READER *reader) {
 
   fh = reader->fp;
   DestroyReader(reader);
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   MPI_File_close(fh);
   free(fh); /* NB This assumes construct_writer was used to malloc memory! */
-#else       /* HAVE_LIBLEMON */
+#else       /* TM_USE_LEMON */
   fclose(fh);
-#endif      /* HAVE_LIBLEMON */
+#endif      /* TM_USE_LEMON */
 }
diff --git a/src/lib/io/utils_destruct_writer.c b/src/lib/io/utils_destruct_writer.c
index 840c06b4e..1f6216167 100644
--- a/src/lib/io/utils_destruct_writer.c
+++ b/src/lib/io/utils_destruct_writer.c
@@ -3,16 +3,16 @@
 void destruct_writer(WRITER *writer) {
   LIME_FILE *fh = NULL;
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   fh = writer->fp;
   lemonDestroyWriter(writer);
   MPI_File_close(fh);
   free(fh); /* NB This assumes construct_writer was used to malloc memory! */
-#else       /* HAVE_LIBLEMON */
+#else       /* TM_USE_LEMON */
   if (g_cart_id == 0) {
     fh = writer->fp;
     limeDestroyWriter(writer);
     fclose(fh);
   }
-#endif      /* HAVE_LIBLEMON */
+#endif      /* TM_USE_LEMON */
 }
diff --git a/src/lib/io/utils_kill_with_error.c b/src/lib/io/utils_kill_with_error.c
index bd697220d..322536bd7 100644
--- a/src/lib/io/utils_kill_with_error.c
+++ b/src/lib/io/utils_kill_with_error.c
@@ -7,11 +7,11 @@ void kill_with_error(LIME_FILE *fh, int const rank, char const *error) {
   }
 
   if (fh != NULL)
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     MPI_File_close(fh);
 #else
     fclose(fh);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
 
 #ifdef TM_USE_MPI
   MPI_Abort(MPI_COMM_WORLD, 1);
diff --git a/src/lib/io/utils_write_first_message.c b/src/lib/io/utils_write_first_message.c
index 983b92b0a..287d67c37 100644
--- a/src/lib/io/utils_write_first_message.c
+++ b/src/lib/io/utils_write_first_message.c
@@ -30,28 +30,28 @@ int write_first_messages(FILE* parameterfile, char const* const executable,
            TMLQCD_PACKAGE_VERSION, git_hash);
   printf("%s", message);
   fprintf(parameterfile, "%s", message);
-#ifdef _GAUGE_COPY
-  printf("# The code is compiled with -D_GAUGE_COPY\n");
-  fprintf(parameterfile, "# The code is compiled with -D_GAUGE_COPY\n");
+#ifdef TM_GAUGE_COPY
+  printf("# The code is compiled with -DTM_GAUGE_COPY\n");
+  fprintf(parameterfile, "# The code is compiled with -DTM_GAUGE_COPY\n");
 #endif
-#ifdef _USE_HALFSPINOR
-  printf("# The code is compiled with -D_USE_HALFSPINOR\n");
-  fprintf(parameterfile, "# The code is compiled with -D_USE_HALFSPINOR\n");
+#ifdef TM_USE_HALFSPINOR
+  printf("# The code is compiled with -DTM_USE_HALFSPINOR\n");
+  fprintf(parameterfile, "# The code is compiled with -DTM_USE_HALFSPINOR\n");
 #endif
-#ifdef _USE_SHMEM
-  printf("# the code is compiled with -D_USE_SHMEM\n");
-  fprintf(parameterfile, "# the code is compiled with -D_USE_SHMEM\n");
-#ifdef _PERSISTENT
+#ifdef TM_USE_SHMEM
+  printf("# the code is compiled with -DTM_USE_SHMEM\n");
+  fprintf(parameterfile, "# the code is compiled with -DTM_USE_SHMEM\n");
+#ifdef TM_PERSISTENT
   printf("# the code is compiled for persistent MPI calls (halfspinor only)\n");
   fprintf(parameterfile, "# the code is compiled for persistent MPI calls (halfspinor only)\n");
 #endif
 #endif
 #ifdef TM_USE_MPI
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
   printf("# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
   fprintf(parameterfile, "# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
 #endif
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   printf("# the code is compiled with MPI IO / Lemon\n");
   fprintf(parameterfile, "# the code is compiled with MPI IO / Lemon\n");
 #endif
diff --git a/src/lib/io/utils_write_header.c b/src/lib/io/utils_write_header.c
index 7f5f85c83..be8ae4ade 100644
--- a/src/lib/io/utils_write_header.c
+++ b/src/lib/io/utils_write_header.c
@@ -23,9 +23,9 @@ void write_header(WRITER *writer, int MB, int ME, char const *type, uint64_t byt
   int status;
   RECORD_HEADER *header;
 
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
     /* Nasty (but probably harmless) hack to get rid of const qualifier - the original c-lime was
      * sloppy here. */
     header = CreateHeader(MB, ME, (char *)type, bytes);
@@ -35,8 +35,8 @@ void write_header(WRITER *writer, int MB, int ME, char const *type, uint64_t byt
     if (status != LIME_SUCCESS) {
       kill_with_error(writer->fp, g_cart_id, "Header writing error. Aborting\n");
     }
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
   return;
 }
diff --git a/src/lib/io/utils_write_message.c b/src/lib/io/utils_write_message.c
index b71cdbbce..d346c9a9c 100644
--- a/src/lib/io/utils_write_message.c
+++ b/src/lib/io/utils_write_message.c
@@ -23,9 +23,9 @@ int write_message(WRITER *writer, char const *buffer, uint64_t bytes) {
   int status;
   n_uint64_t bytesWritten = bytes;
 
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   if (g_cart_id == 0) {
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
     if (buffer == (char *)NULL) return (0);
 
 #ifdef TM_USE_MPI
@@ -35,8 +35,8 @@ int write_message(WRITER *writer, char const *buffer, uint64_t bytes) {
 #endif
     if (status != LIME_SUCCESS || bytes != bytesWritten)
       kill_with_error(writer->fp, g_cart_id, "I/O error on writing message. Aborting...\n");
-#ifndef HAVE_LIBLEMON
+#ifndef TM_USE_LEMON
   }
-#endif /* ! HAVE_LIBLEMON */
+#endif /* ! TM_USE_LEMON */
   return (0);
 }
diff --git a/src/lib/linalg/blas.h b/src/lib/linalg/blas.h
index a972e5029..110afb01f 100644
--- a/src/lib/linalg/blas.h
+++ b/src/lib/linalg/blas.h
@@ -23,8 +23,8 @@
 #include <complex.h>
 #include "linalg/fortran.h"
 
-#if defined CRAY || defined HITACHI
-/* On the CRAY is all different, of course... */
+#if defined TM_CRAY || defined HITACHI
+/* On the CRAY everything is different, of course... */
 #include "fortran.h"
 #define zgemm ZGEMM
 #define zgemv ZGEMV
diff --git a/src/lib/linalg/lapack.h b/src/lib/linalg/lapack.h
index 1c7f4ce7a..a651b07ae 100644
--- a/src/lib/linalg/lapack.h
+++ b/src/lib/linalg/lapack.h
@@ -23,7 +23,7 @@
 #include <complex.h>
 #include "linalg/fortran.h"
 
-#if defined CRAY || defined HITACHI
+#if defined TM_CRAY || defined HITACHI
 #define zgels CGELS
 #define zgesv CGESV
 #define zgeevx CGEEVX
diff --git a/src/lib/little_D.c b/src/lib/little_D.c
index 370e7583a..2bee49824 100644
--- a/src/lib/little_D.c
+++ b/src/lib/little_D.c
@@ -276,11 +276,11 @@ extern int waitcount;
 void init_little_field_exchange(_Complex double *w) {
 #ifdef TM_USE_MPI
   int i = 0;
-#if (defined PARALLELT || defined PARALLELX)
+#if (defined TM_PARALLELT || defined TM_PARALLELX)
   int no_dirs = 2;
-#elif (defined PARALLELXT || defined PARALLELXY || defined PARALLELXYZ)
+#elif (defined TM_PARALLELXT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   int no_dirs = 4;
-#elif (defined PARALLELXYT || defined PARALLELXYZT)
+#elif (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   int no_dirs = 6;
 #endif
   if (waitcount != 0) {
@@ -304,7 +304,7 @@ void init_little_field_exchange(_Complex double *w) {
               g_nb_list[i], i + 1, g_cart_grid, &lrequests[2 * i + 3]);
     waitcount += 4;
   }
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   /* send to the right, receive from the left */
   i = 6;
   MPI_Isend((void *)(w + g_N_s), g_N_s, MPI_DOUBLE_COMPLEX, g_nb_list[i], i, g_cart_grid,
diff --git a/src/lib/meas/polyakov_loop.c b/src/lib/meas/polyakov_loop.c
index 9108bcb99..25deea402 100644
--- a/src/lib/meas/polyakov_loop.c
+++ b/src/lib/meas/polyakov_loop.c
@@ -446,7 +446,7 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
 
   /* (1) collect contributions from different time/z slices to nodes with rank=0
      in spatial volume/space-time slices */
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   if (dir == 0) {
 #endif
     tmp_ray = (su3 *)calloc(VOL3, sizeof(su3)); /* */
@@ -456,7 +456,7 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
     }
 
     MPI_Reduce(tmp_loc, tmp_ray, VOL3, mpi_su3, mpi_reduce_su3_ray, 0, ray);
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   }
 #endif
 
@@ -475,7 +475,7 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
     ks = 0.0;
 
 #ifdef TM_USE_MPI
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
     u = tmp_ray;
 #else
     if (dir == 0) {
@@ -502,11 +502,11 @@ int polyakov_loop_dir(const int nstore /* in  */, const int dir /* in  */) {
 #ifdef TM_USE_MPI
     MPI_Reduce(&pl_tmp, &pl, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, slice);
   }
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   if (dir == 0) {
 #endif
     free(tmp_ray);
-#ifndef PARALLELXYZT
+#ifndef TM_PARALLELXYZT
   }
 #endif
 
diff --git a/src/lib/measure_gauge_action.c b/src/lib/measure_gauge_action.c
index 6a558a51b..1f7cb6ad5 100644
--- a/src/lib/measure_gauge_action.c
+++ b/src/lib/measure_gauge_action.c
@@ -26,9 +26,7 @@
  *     Returns the value of the action
  ************************************************************************/
 
-#ifdef HAVE_CONFIG_H
 #include <tmlqcd_config.h>
-#endif
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/src/lib/mpi_init.c b/src/lib/mpi_init.c
index 2bbbde315..cc09fd4cd 100644
--- a/src/lib/mpi_init.c
+++ b/src/lib/mpi_init.c
@@ -25,7 +25,7 @@
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 #include "global.h"
@@ -134,7 +134,7 @@ MPI_Datatype halffield_y_slice_gath;
 
 MPI_Datatype halffield_z_slice_cont;
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
 MPI_Datatype field_z_slice_even_dn;
 MPI_Datatype field_z_slice_even_up;
 MPI_Datatype field_z_slice_odd_dn;
@@ -188,60 +188,60 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   }
 
 #ifdef TM_USE_MPI
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
   /* we need that the PE number in MPI_COMM_WORL  */
   /* exactly correspond to the one in g_cart_grid */
   reorder = 0;
 #endif
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 0; /* the other N_PROC_? are read from input, if not constraint below */
                 /* N_PROC_T will be set by MPI_Dims_create, if not constraint below */
 #endif
 
-#if defined PARALLELT
+#if defined TM_PARALLELT
   ndims = 1;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_X = 1;
   N_PROC_Y = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELX
+#if defined TM_PARALLELX
   ndims = 1;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 1;
   N_PROC_Y = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXT
+#if defined TM_PARALLELXT
   ndims = 2;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Y = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXY
+#if defined TM_PARALLELXY
   ndims = 2;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 1;
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXYT
+#if defined TM_PARALLELXYT
   ndims = 3;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Z = 1;
 #endif
 #endif
-#if defined PARALLELXYZ
+#if defined TM_PARALLELXYZ
   ndims = 3;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = 1;
 #endif
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   ndims = 4;
 #endif
   dims[0] = N_PROC_T;
@@ -278,7 +278,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
     exit(-1);
   }
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_T = g_nproc_t;
   N_PROC_X = g_nproc_x;
   N_PROC_Y = g_nproc_y;
@@ -289,42 +289,42 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   LZ = LZ / g_nproc_z;
   VOLUME = (T * LX * LY * LZ);
   SPACEVOLUME = VOLUME / T;
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   RAND = (2 * LX * LY * LZ);
   EDGES = 0;
-#elif defined PARALLELX
+#elif defined TM_PARALLELX
   RAND = (2 * T * LY * LZ);
   EDGES = 0;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   RAND = 2 * LZ * (LY * LX + T * LY);
   EDGES = 4 * LZ * LY;
-#elif defined PARALLELXY
+#elif defined TM_PARALLELXY
   RAND = 2 * LZ * T * (LX + LY);
   EDGES = 4 * LZ * T;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   RAND = 2 * LZ * (LY * LX + T * LY + T * LX);
   EDGES = 4 * LZ * (LY + T + LX);
-#elif defined PARALLELXYZ
+#elif defined TM_PARALLELXYZ
   RAND = 2 * T * (LY * LZ + LX * LZ + LX * LY);
   EDGES = 4 * T * (LX + LY + LZ);
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   RAND = 2 * LZ * LY * LX + 2 * LZ * T * LY + 2 * LZ * T * LX + 2 * T * LX * LY;
   EDGES = 4 * LZ * LY + 4 * LZ * T + 4 * LZ * LX + 4 * LY * T + 4 * LY * LX + 4 * T * LX;
-#else  /* ifdef PARALLELT */
+#else  /* ifdef TM_PARALLELT */
   RAND = 0;
   EDGES = 0;
-#endif /* ifdef PARALLELT */
+#endif /* ifdef TM_PARALLELT */
   /* Note that VOLUMEPLUSRAND is not always equal to VOLUME+RAND */
   /* VOLUMEPLUSRAND rather includes the edges */
   VOLUMEPLUSRAND = VOLUME + RAND + EDGES;
   SPACERAND = RAND / T;
-#endif /* ifndef FIXEDVOLUME */
+#endif /* ifndef TM_FIXEDVOLUME */
   g_dbw2rand = (RAND + 2 * EDGES);
 
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   field_buffer_z = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
   field_buffer_z2 = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
   field_buffer_z3 = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
   field_buffer_z4 = (spinor *)malloc(T * LX * LY / 2 * sizeof(spinor));
 #endif
@@ -347,23 +347,23 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   for (i = 0; i < 8; i++) {
     g_nb_list[i] = g_cart_id;
   }
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   MPI_Cart_shift(g_cart_grid, 0, 1, &g_nb_t_dn, &g_nb_t_up);
   g_nb_list[0] = g_nb_t_up;
   g_nb_list[1] = g_nb_t_dn;
 #endif
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || \
-     defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
+     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 1, 1, &g_nb_x_dn, &g_nb_x_up);
   g_nb_list[2] = g_nb_x_up;
   g_nb_list[3] = g_nb_x_dn;
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 2, 1, &g_nb_y_dn, &g_nb_y_up);
   g_nb_list[4] = g_nb_y_up;
   g_nb_list[5] = g_nb_y_dn;
 #endif
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 3, 1, &g_nb_z_dn, &g_nb_z_up);
   g_nb_list[6] = g_nb_z_up;
   g_nb_list[7] = g_nb_z_dn;
@@ -669,7 +669,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   g_mpi_ST_rank = 0;
   g_stdio_proc = 0;
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T = T_global;
   VOLUME = (T * LX * LY * LZ);
   SPACEVOLUME = VOLUME / T;
@@ -687,7 +687,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
 
   /* Here we perform some checks in order not to */
   /* run into trouble later                      */
-#if (defined PARALLELXYZT || defined PARALLELXYZ)
+#if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
   if ((T * LX * LY) % 2 != 0 && even_odd_flag == 1) {
     fprintf(stderr, "T*LX*LY must be even!\nAborting prgram...\n");
 #ifdef TM_USE_MPI
diff --git a/src/lib/mpi_init.h b/src/lib/mpi_init.h
index dce6dfad7..d9476e662 100644
--- a/src/lib/mpi_init.h
+++ b/src/lib/mpi_init.h
@@ -108,8 +108,8 @@ extern MPI_Datatype halffield_y_slice_gath;
 extern MPI_Datatype halffield_z_slice_cont;
 
 
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || \
-     defined PARALLELXYZ)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELXYZ)
 extern MPI_Datatype field_z_slice_even_dn;
 extern MPI_Datatype field_z_slice_even_up;
 extern MPI_Datatype field_z_slice_odd_dn;
diff --git a/src/lib/operator.c b/src/lib/operator.c
index 6b6a94df2..e15a97701 100644
--- a/src/lib/operator.c
+++ b/src/lib/operator.c
@@ -63,7 +63,7 @@
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
diff --git a/src/lib/operator/D_psi_body.c b/src/lib/operator/D_psi_body.c
index 06bde0fc5..b5acd1158 100644
--- a/src/lib/operator/D_psi_body.c
+++ b/src/lib/operator/D_psi_body.c
@@ -283,7 +283,7 @@ void _PSWITCH(D_psi)(_PTSWITCH(spinor) *const P, _PTSWITCH(spinor) *const Q) {
   _C_TYPE ALIGN32 phase_2l = (_C_TYPE)phase_2;
   _C_TYPE ALIGN32 phase_3l = (_C_TYPE)phase_3;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (_PSWITCH(g_update_gauge_copy)) {
     _PSWITCH(update_backward_gauge)(_PSWITCH(g_gauge_field));
   }
diff --git a/src/lib/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
index a8da9e810..8b106e10a 100644
--- a/src/lib/operator/Hopping_Matrix.c
+++ b/src/lib/operator/Hopping_Matrix.c
@@ -38,11 +38,11 @@
  *
  *  Structure of top level precompiler directives
  *
- * - defining _USE_HALFSPINOR implies that we also use
+ * - defining TM_USE_HALFSPINOR implies that we also use
  *   a "gauge copy"
  *
  * - such that we are checking for the _USE_GAUGECOPY feature seperatly in the
- *   ELSE branch of the "if defined _USE_HALFSPINOR" statement
+ *   ELSE branch of the "if defined TM_USE_HALFSPINOR" statement
  *
  ****************************************************************/
 
@@ -64,11 +64,11 @@
 #include "operator/Hopping_Matrix.h"
 #include "update_backward_gauge.h"
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 #include "operator/halfspinor_hopping.h"
 
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -88,10 +88,10 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   return;
 }
 
-#else /* thats _USE_HALFSPINOR */
+#else /* thats TM_USE_HALFSPINOR */
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -114,4 +114,4 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   return;
 }
 
-#endif /* thats _USE_HALFSPINOR */
+#endif /* thats TM_USE_HALFSPINOR */
diff --git a/src/lib/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
index d1fbe78c7..1198d52bb 100644
--- a/src/lib/operator/Hopping_Matrix_32.c
+++ b/src/lib/operator/Hopping_Matrix_32.c
@@ -39,11 +39,11 @@
  *
  *  Structure of top level precompiler directives
  *
- * - defining _USE_HALFSPINOR implies that we also use
+ * - defining TM_USE_HALFSPINOR implies that we also use
  *   a "gauge copy"
  *
  * - such that we are checking for the _USE_GAUGECOPY feature seperatly in the
- *   ELSE branch of the "if defined _USE_HALFSPINOR" statement
+ *   ELSE branch of the "if defined TM_USE_HALFSPINOR" statement
  *
  ****************************************************************/
 
@@ -66,13 +66,13 @@
 #include "update_backward_gauge.h"
 #include "operator/Hopping_Matrix_32.h"
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 #include "operator/halfspinor_hopping_32.h"
 #endif
 
 void Hopping_Matrix_32_orphaned(const int ieo, spinor32* const l, spinor32* const k) {
-#if defined _USE_HALFSPINOR
-#ifdef _GAUGE_COPY
+#if defined TM_USE_HALFSPINOR
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy_32) {
     update_backward_gauge_32_orphaned(g_gauge_field_32);
   }
diff --git a/src/lib/operator/Hopping_Matrix_nocom.c b/src/lib/operator/Hopping_Matrix_nocom.c
index dce8ad591..c7814bbb0 100644
--- a/src/lib/operator/Hopping_Matrix_nocom.c
+++ b/src/lib/operator/Hopping_Matrix_nocom.c
@@ -48,8 +48,8 @@
 
 #define Hopping_Matrix Hopping_Matrix_nocom
 #define _NO_COMM 1
-#ifdef _KOJAK_INST
-#undef _KOJAK_INST
+#ifdef TM_KOJAK_INST
+#undef TM_KOJAK_INST
 #endif
 
 #include "Hopping_Matrix.c"
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
index 542292b1d..a2c54c7e4 100644
--- a/src/lib/operator/halfspinor_body.c
+++ b/src/lib/operator/halfspinor_body.c
@@ -30,7 +30,7 @@ halfspinor* restrict* phi ALIGN;
 halfspinor32* restrict* phi32 ALIGN;
 _declare_hregs();
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(hoppingmatrix)
 #endif
 
@@ -320,6 +320,6 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
   }
 }
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(hoppingmatrix)
 #endif
diff --git a/src/lib/operator/hopping_bg_dbl.c b/src/lib/operator/hopping_bg_dbl.c
index 02e6b5c04..93af99e24 100644
--- a/src/lib/operator/hopping_bg_dbl.c
+++ b/src/lib/operator/hopping_bg_dbl.c
@@ -41,7 +41,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   __alignx(16, l);
   __alignx(16, k);
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -64,7 +64,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 
   sp = k + icy;
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = &g_gauge_field_copy[ioff][0];
 #else
   up = &g_gauge_field[ix][0];
@@ -76,7 +76,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     /*********************** direction +0 ************************/
     iy = g_idn[ix][0];
     icy = g_lexic2eosub[iy];
-#if (!defined _GAUGE_COPY)
+#if (!defined TM_GAUGE_COPY)
     um = &g_gauge_field[iy][0];
 #else
     um = up + 1;
@@ -90,7 +90,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[ix][1];
     icy = g_lexic2eosub[iy];
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -104,7 +104,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_idn[ix][1];
     icy = g_lexic2eosub[iy];
 
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][1];
 #else
     um = up + 1;
@@ -117,7 +117,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[ix][2];
     icy = g_lexic2eosub[iy];
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -131,7 +131,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_idn[ix][2];
     icy = g_lexic2eosub[iy];
 
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][2];
 #else
     um = up + 1;
@@ -145,7 +145,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[ix][3];
     icy = g_lexic2eosub[iy];
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -158,7 +158,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_idn[ix][3];
     icy = g_lexic2eosub[iy];
 
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][3];
 #else
     um = up + 1;
@@ -174,7 +174,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[iz][0];
     icy = g_lexic2eosub[iy];
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up = &g_gauge_field[iz][0];
diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
index 02df9c943..c3eefb74a 100644
--- a/src/lib/operator/hopping_body_dbl.c
+++ b/src/lib/operator/hopping_body_dbl.c
@@ -43,7 +43,7 @@ if (ieo == 0) {
 #ifndef TM_USE_OMP
 hi = &g_hi[16 * ioff];
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
 up = &g_gauge_field_copy[ioff][0];
 #else
 up = &g_gauge_field[(*hi)][0];
@@ -60,7 +60,7 @@ hi++;
 for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 #ifdef TM_USE_OMP
   hi = &g_hi[16 * icx];
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = &g_gauge_field_copy[icx][0];
 #else
   up = &g_gauge_field[(*hi)][0];
@@ -74,7 +74,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   pn = p + (icx - ioff);
 #endif
   /*********************** direction +t ************************/
-#if (!defined _GAUGE_COPY)
+#if (!defined TM_GAUGE_COPY)
   um = &g_gauge_field[(*hi)][0];
 #else
   um = up + 1;
@@ -86,7 +86,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_p();
 
   /*********************** direction -t ************************/
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -97,7 +97,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_m();
 
   /*********************** direction +1 ************************/
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
   um = &g_gauge_field[(*hi)][1];
 #else
   um = up + 1;
@@ -109,7 +109,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_p();
 
   /*********************** direction -1 ************************/
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -120,7 +120,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_m();
 
   /*********************** direction +2 ************************/
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
   um = &g_gauge_field[(*hi)][2];
 #else
   um = up + 1;
@@ -132,7 +132,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_p();
 
   /*********************** direction -2 ************************/
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -143,7 +143,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_m();
 
   /*********************** direction +3 ************************/
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
   um = &g_gauge_field[(*hi)][3];
 #else
   um = up + 1;
@@ -156,7 +156,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 
   /*********************** direction -3 ************************/
 #ifndef TM_USE_OMP
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
   up = um + 1;
 #else
   up = &g_gauge_field[(*hi)][0];
diff --git a/src/lib/operator/hopping_sgl.c b/src/lib/operator/hopping_sgl.c
index 5067ab13d..062507158 100644
--- a/src/lib/operator/hopping_sgl.c
+++ b/src/lib/operator/hopping_sgl.c
@@ -37,7 +37,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
   spinor32* restrict r, * restrict sp, * restrict sm;
   spinor32 temp;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge();
   }
@@ -72,7 +72,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = &g_gauge_field_copy[icx][0];
 #else
     up = &g_gauge_field[ix][0];
@@ -100,7 +100,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     um = up + 1;
 #else
     um = &g_gauge_field[iy][0];
@@ -129,7 +129,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
 
     sp = k + icy;
 
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -157,7 +157,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][1];
 #else
     um = up + 1;
@@ -185,7 +185,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -212,7 +212,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][2];
 #else
     um = up + 1;
@@ -240,7 +240,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined _GAUGE_COPY))
+#if ((defined TM_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -267,7 +267,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef _GAUGE_COPY
+#ifndef TM_GAUGE_COPY
     um = &g_gauge_field[iy][3];
 #else
     um = up + 1;
diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index fd2aef9db..857404088 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -51,12 +51,12 @@
 // where cfactor = a + i b
 //
 
-#if (defined _USE_HALFSPINOR)
+#if (defined TM_USE_HALFSPINOR)
 #include "operator/halfspinor_hopping.h"
 
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spinor* const k,
                            complex double const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -78,10 +78,10 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spin
   return;
 }
 
-#elif (!defined _NO_COMM && !defined _USE_HALFSPINOR)
+#elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* p, spinor* const k,
                            complex double const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index 3b336d2a9..6d1abddba 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -51,12 +51,12 @@
 // where cfactor = a + i b
 //
 
-#if (defined _USE_HALFSPINOR && !defined _NO_COMM)
+#if (defined TM_USE_HALFSPINOR && !defined _NO_COMM)
 #include "operator/halfspinor_hopping.h"
 
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              complex double const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -78,10 +78,10 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
   return;
 }
 
-#elif (!defined _NO_COMM && !defined _USE_HALFSPINOR)
+#elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              double complex const cfactor) {
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -103,4 +103,4 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #endif
   return;
 }
-#endif  //_USE_HALFSPINOR && !defined _NO_COMM
+#endif  //TM_USE_HALFSPINOR && !defined _NO_COMM
diff --git a/src/lib/overrelaxation.c b/src/lib/overrelaxation.c
index 91d95fa30..2a1329bba 100644
--- a/src/lib/overrelaxation.c
+++ b/src/lib/overrelaxation.c
@@ -153,7 +153,7 @@ void flip_subgroup(int ix, int mu, su3 vv, int i) {
   *z = w;
 }
 
-#if defined PARALLEL1
+#if defined TM_PARALLEL1
 void overrel_sweep() {
   int x0, x1, x2, x3;
   int mu, ix;
diff --git a/src/lib/parallel_io.h b/src/lib/parallel_io.h
index 50e03fd59..98df3fb8c 100644
--- a/src/lib/parallel_io.h
+++ b/src/lib/parallel_io.h
@@ -17,8 +17,8 @@
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
 
-#ifndef _PARALLEL_IO_H
-#define _PARALLEL_IO_H
+#ifndef _TM_PARALLEL_IO_H
+#define _TM_PARALLEL_IO_H
 
 #include <lemon.h>
 #include "dml.h"
diff --git a/src/lib/read_input.l b/src/lib/read_input.l
index 6af756c7e..59f002748 100644
--- a/src/lib/read_input.l
+++ b/src/lib/read_input.l
@@ -849,8 +849,8 @@ static inline double fltlist_next_token(int * const list_end){
 }
 
 <INITMULTIGRID>AMG{SPC}* {
-#ifdef DDalphaAMG
- if(myverbose) printf("Initialising DDalphaAMG line %d\n", line_of_file); 
+#ifdef TM_USE_DDalphaAMG
+ if(myverbose) printf("Initialising DDalphaAMG line %d\n", line_of_file);
  BEGIN(MULTIGRID);
 #else
  printf("ERROR line %d: DDalphaAMG library not included\n", line_of_file);
@@ -951,7 +951,7 @@ static inline double fltlist_next_token(int * const list_end){
     mg_no_shifts=0;
     if(myverbose) printf("  MG_MMS_Mass set to %.16f line %d operator %d\n", mg_mms_mass, line_of_file, current_operator);
   }
-  EndDDalphaAMG{SPC}* {
+  End_?DDalphaAMG{SPC}* { /* underscore optional: keep accepting the legacy "EndDDalphaAMG" keyword */
   if(myverbose) printf("DDalphaAMG parsed in line %d\n\n", line_of_file);
   BEGIN(0);
   }
@@ -1385,7 +1385,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
@@ -1490,7 +1490,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
@@ -1543,7 +1543,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
@@ -2834,7 +2834,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
@@ -2877,7 +2877,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(solver_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  HB Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->HB_solver = MG;
     BEGIN(solver_caller);
@@ -2902,7 +2902,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(solver_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
@@ -2935,7 +2935,7 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(solver_caller);
   }
   DDalphaAMG {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
@@ -3229,49 +3229,49 @@ static inline double fltlist_next_token(int * const list_end){
 }
 
 <TT>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T_global = atoi(yytext);
   if(myverbose!=0) printf("T =%s\n", yytext);
 #endif
 }
 <LL>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   L = atoi(yytext);
   if(myverbose!=0) printf("L =%s\n", yytext);
 #endif
 }
 <LLX>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   LX = atoi(yytext);
   if(myverbose!=0) printf("LX =%s\n", yytext);
 #endif
 }
 <LLY>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   LY = atoi(yytext);
   if(myverbose!=0) printf("LY =%s\n", yytext);
 #endif
 }
 <LLZ>{DIGIT}+                  {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   LZ = atoi(yytext);
   if(myverbose!=0) printf("LZ =%s\n", yytext);
 #endif
 }
 <NPROCX>{DIGIT}+              {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_X = atoi(yytext);
   if(myverbose!=0) printf("Nr of processors in x direction = %s\n", yytext);
 #endif
 }
 <NPROCY>{DIGIT}+              {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Y = atoi(yytext);
   if(myverbose!=0) printf("Nr of processors in y direction = %s\n", yytext);
 #endif
 }
 <NPROCZ>{DIGIT}+              {
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   N_PROC_Z = atoi(yytext);
   if(myverbose!=0) printf("Nr of processors in z direction = %s\n", yytext);
 #endif
@@ -3776,7 +3776,7 @@ int read_input(const char * conf_file){
    * Setting default values!
    ********************************************/
   reread = 0;
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T_global = _default_T_global;
   L = _default_L;
   LX = _default_LX;
@@ -3994,7 +3994,7 @@ int read_input(const char * conf_file){
   yyout = fopen("/dev/null", "w");
 
   parse_config();  
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   if(LX == 0) {
     LX = L;
   }
@@ -4029,7 +4029,7 @@ int read_input(const char * conf_file){
  */
 
 int reread_input(const char * conf_file){
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   int tt=T, ll=L, lx = LX, ly = LY, lz = LZ, 
       np=N_PROC_X, npy = N_PROC_Y;
 #endif
@@ -4054,7 +4054,7 @@ int reread_input(const char * conf_file){
 
   parse_config();  
 
-#ifndef FIXEDVOLUME
+#ifndef TM_FIXEDVOLUME
   T = tt;
   L = ll;
   LX = lx;
diff --git a/src/lib/solver/cg_her.c b/src/lib/solver/cg_her.c
index bf6981c4b..b556acb25 100644
--- a/src/lib/solver/cg_her.c
+++ b/src/lib/solver/cg_her.c
@@ -102,7 +102,7 @@ int cg_her(spinor* const P, spinor* const Q, const int max_iter, double eps_sq,
     if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       break;
     }
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
     if (((err * err <= eps_sq) && (rel_prec == 0)) ||
         ((err * err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       g_sloppy_precision = 1;
diff --git a/src/lib/solver/cg_her_nd.c b/src/lib/solver/cg_her_nd.c
index 03a85a713..746c21718 100644
--- a/src/lib/solver/cg_her_nd.c
+++ b/src/lib/solver/cg_her_nd.c
@@ -133,7 +133,7 @@ int cg_her_nd(spinor* const P_up, spinor* P_dn, spinor* const Q_up, spinor* cons
     if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       break;
     }
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
     if (((err * err <= eps_sq) && (rel_prec == 0)) ||
         ((err * err <= eps_sq * squarenorm) && (rel_prec == 1))) {
       g_sloppy_precision = 1;
diff --git a/src/lib/solver/cr.c b/src/lib/solver/cr.c
index 58022ac28..f6a1bd348 100644
--- a/src/lib/solver/cr.c
+++ b/src/lib/solver/cr.c
@@ -106,7 +106,7 @@ int cr(spinor* const P, spinor* const Q, const int m, const int max_restarts, co
       break;
     }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
     if (((err * err <= eps_sq) && (rel_prec == 0)) ||
         ((err * err <= eps_sq * norm_sq) && (rel_prec == 1))) {
       if (g_sloppy_precision_flag == 1) {
diff --git a/src/lib/solver/diagonalise_general_matrix.c b/src/lib/solver/diagonalise_general_matrix.c
index 0667da9aa..9fb989da6 100644
--- a/src/lib/solver/diagonalise_general_matrix.c
+++ b/src/lib/solver/diagonalise_general_matrix.c
@@ -70,7 +70,7 @@ void diagonalise_general_matrix(int n, _Complex double *A, int lda, _Complex dou
 
   /* Query call to get the optimal lwork */
   lwork = -1;
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
   _FT(zgeevx)("N", "N", "V", "N", &n, A, &lda, evalues, vl, &n, vr, &n, &ilo, &ihi, scale, &abnrm,
               rcone, rconv, &dummy, &lwork, rwork, &info, 1, 1, 1, 1);
   lwork = (int)(creal(dummy));
diff --git a/src/lib/solver/dirac_operator_eigenvectors.c b/src/lib/solver/dirac_operator_eigenvectors.c
index 42e85d198..845d5aedc 100644
--- a/src/lib/solver/dirac_operator_eigenvectors.c
+++ b/src/lib/solver/dirac_operator_eigenvectors.c
@@ -28,7 +28,7 @@
 #ifdef FFTW
 #include <fftw3.h>
 #endif
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 #include <stdlib.h>
@@ -330,7 +330,7 @@ _Complex double calcDDaggerDovEvalue(const int *praw, double kappa, double rho,
 }
 
 void spinor_fft(spinor *spinor_in, spinor *spinor_out, int tt, int ll, unsigned int forward) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   fftw_plan plan = spinor_fftw_plan(spinor_in, spinor_out, tt, ll, forward, FFTW_WISDOM_ONLY);
   fftw_execute(plan);
 #else
@@ -555,7 +555,7 @@ void spinorPrecWS_Free(spinorPrecWS *ws) {
  */
 
 void eigenvector_Dtm(spinor *spin, double mu, int epsilon, int k, int color, int rawp[4]) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   fftw_plan p1bw;
 #endif
   int i = 0;
@@ -630,7 +630,7 @@ void eigenvector_Dtm(spinor *spin, double mu, int epsilon, int k, int color, int
 
   _spinor_muleq_real(*phi, 1.0 / sqrt((double)(VOLUME)));
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   p1bw = spinor_fftw_plan(spin, spin, T, L, 0, FFTW_WISDOM_ONLY);
   fftw_execute(p1bw);
 #endif
@@ -638,7 +638,7 @@ void eigenvector_Dtm(spinor *spin, double mu, int epsilon, int k, int color, int
   /* spinor mulp half phase */
 }
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan(const spinor *spinor_in, spinor *spinor_out, int T, int ll,
                            unsigned int forward, int fftw_flags) {
   /*    int index_s = gsi(get_index(it, ix, iy, iz, tt, ll)); */
@@ -760,13 +760,13 @@ void spinorPrecondition(spinor *spinor_out, const spinor *spinor_in, spinorPrecW
   spinor phi_plus;
   double OOVOL = 1. / (double)(VOLUME);
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
   fftw_plan plan_fw;
   fftw_plan plan_bw;
 #endif
 
   if (autofft == 1) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
     /*     spinor_mulp_half_phase(spinor_out,spinor_in,ws->c_table, ws->s_table,1,1.); */
     plan_fw = spinor_fftw_plan(spinor_in, spinor_out, tt, ll, 1 /* = true */, FFTW_WISDOM_ONLY);
     fftw_execute(plan_fw);
@@ -889,7 +889,7 @@ void spinorPrecondition(spinor *spinor_out, const spinor *spinor_in, spinorPrecW
   }
 
   if (autofft == 1) {
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
     plan_bw = spinor_fftw_plan(spinor_out, spinor_out, tt, LX, 0, FFTW_WISDOM_ONLY);
     fftw_execute(plan_bw);
 #endif
@@ -1292,7 +1292,7 @@ void spinor_mulp_half_phase(spinor *spinor_out, const spinor *spinor_in, double
  * loading and storing of fftw wisdoms
  */
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 void loadFFTWWisdom(spinor *spinor_in, spinor *spinor_out, int tt, int ll) {
   /*   ostringstream filename_fftw_wisdom; */
   /*   filename_fftw_wisdom << "fftw_wisdom_" << setw(2) << setfill('0') << T << "x"<< setw(2) <<
@@ -2050,7 +2050,7 @@ void calculateDiagFalloffElements(const int op_id) {
   if (g_precWS == NULL) {
     /* we are going to need fft*/
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
     loadFFTWWisdom(g_spinor_field[0], g_spinor_field[1], T, LX);
 #endif
   }
diff --git a/src/lib/solver/dirac_operator_eigenvectors.h b/src/lib/solver/dirac_operator_eigenvectors.h
index 1ebe2ce71..b10a86312 100644
--- a/src/lib/solver/dirac_operator_eigenvectors.h
+++ b/src/lib/solver/dirac_operator_eigenvectors.h
@@ -24,7 +24,7 @@
 #ifdef HAVE_CONFIG_H
 #include "tmlqcd_config.h"
 #endif
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 #include <fftw3.h>
 #endif
 
@@ -68,7 +68,7 @@ extern tm_operator PRECWSOPERATORSELECT[14];
 /* */
 extern double g_prec_sequence_d_dagger_d[3];
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan(const spinor *spinor_in, spinor *spinor_out, int tt, int ll,
                            unsigned int forward, int fftw_flags);
 #endif
@@ -170,7 +170,7 @@ void spinor_mulp_half_phase(spinor *spinor_out, const spinor *spinor_in, double
  * read and write fftw wisdoms
  * this is supposed to speed up things
  */
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 void writeFFTWWisdom(int tt, int ll);
 void loadFFTWWisdom(spinor *spinor_in, spinor *spinor_out, int tt, int ll);
 #endif
diff --git a/src/lib/solver/eigenvalues.c b/src/lib/solver/eigenvalues.c
index 1725387d0..4d8d08887 100644
--- a/src/lib/solver/eigenvalues.c
+++ b/src/lib/solver/eigenvalues.c
@@ -68,7 +68,7 @@ double eigenvalues(int *nr_of_eigenvalues, const int max_iterations, const doubl
                    const int even_odd_flag) {
   double returnvalue;
   _Complex double norm2;
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
   static int allocated = 0;
   char filename[200];
   FILE *ofs;
diff --git a/src/lib/solver/fgmres.c b/src/lib/solver/fgmres.c
index 60d10fa72..154428124 100644
--- a/src/lib/solver/fgmres.c
+++ b/src/lib/solver/fgmres.c
@@ -85,7 +85,7 @@ int fgmres(spinor *const P, spinor *const Q, const int m, const int max_restarts
   atime = gettime();
   cumiter_lgcr = 0;
   if (N == VOLUME) {
-    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); /* #ifdef HAVE_LAPACK */
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); /* #ifdef TM_LAPACK */
   } else {
     init_solver_field(&solver_field, VOLUMEPLUSRAND / 2, nr_sf);
   }
diff --git a/src/lib/solver/fgmres4complex_body.c b/src/lib/solver/fgmres4complex_body.c
index b11528c58..1f6fa9c89 100644
--- a/src/lib/solver/fgmres4complex_body.c
+++ b/src/lib/solver/fgmres4complex_body.c
@@ -57,7 +57,7 @@ int _PSWITCH(fgmres4complex)(_Complex _F_TYPE *const P, _Complex _F_TYPE *const
   int fltcntr = 0;
   double alphasave = 0;
 
-  _PSWITCH(init_lsolver_field)(&solver_field, /*why not N?*/ lda, nr_sf); /* #ifdef HAVE_LAPACK */
+  _PSWITCH(init_lsolver_field)(&solver_field, /*why not N?*/ lda, nr_sf); /* #ifdef TM_LAPACK */
 
   eps = sqrt(eps_sq);
   _PSWITCH(init_lgmres)(m, lda);
diff --git a/src/lib/solver/gmres_dr.c b/src/lib/solver/gmres_dr.c
index 4b9f429e0..781b32d86 100644
--- a/src/lib/solver/gmres_dr.c
+++ b/src/lib/solver/gmres_dr.c
@@ -54,7 +54,7 @@
 #include "solver/solver_field.h"
 #include "su3.h"
 
-#ifndef HAVE_LAPACK
+#ifndef TM_LAPACK
 /* In case there is no lapack use normal gmres */
 int gmres_dr(spinor* const P, spinor* const Q, const int m, const int nr_ev, const int max_restarts,
              const double eps_sq, const int rel_prec, const int N, matrix_mult f) {
diff --git a/src/lib/solver/gram-schmidt.c b/src/lib/solver/gram-schmidt.c
index 1e8da1d24..ffd5d6b29 100644
--- a/src/lib/solver/gram-schmidt.c
+++ b/src/lib/solver/gram-schmidt.c
@@ -26,7 +26,7 @@
 #include "linalg/blas.h"
 #include "linalg_eo.h"
 #include "su3spinor.h"
-#ifdef CRAY
+#ifdef TM_CRAY
 #include <fortran.h>
 #endif
 #include "gram-schmidt.h"
@@ -62,7 +62,7 @@ void IteratedClassicalGS(_Complex double v[], double *vnrm, int n, int m, _Compl
       work1[j] = scalar_prod((spinor *)(A + j * lda), (spinor *)v,
                              n * sizeof(_Complex double) / sizeof(spinor), 1);
     }
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
     _FT(zgemv)(fupl_n, &n, &m, &CMONE, A, &lda, work1, &ONE, &CONE, v, &ONE, 1);
 #endif
     (*vnrm) = sqrt(square_norm((spinor *)v, n * sizeof(_Complex double) / sizeof(spinor), 1));
@@ -90,7 +90,7 @@ void ModifiedGS(_Complex double v[], int n, int m, _Complex double A[], int lda)
     s = scalar_prod((spinor *)(A + i * lda), (spinor *)v,
                     n * sizeof(_Complex double) / sizeof(spinor), 1);
     s = -s;
-#ifdef HAVE_LAPACK
+#ifdef TM_LAPACK
     _FT(zaxpy)(&n, &s, A + i * lda, &ONE, v, &ONE);
 #endif
   }
diff --git a/src/lib/solver/mcr.c b/src/lib/solver/mcr.c
index 707181cc2..184fa567f 100644
--- a/src/lib/solver/mcr.c
+++ b/src/lib/solver/mcr.c
@@ -127,7 +127,7 @@ int mcr(spinor* const P, spinor* const Q, const int m, const int max_restarts, c
         break;
       }
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
       if (((err * err <= eps_sq) && (rel_prec == 0)) ||
           ((err * err <= eps_sq * norm_sq) && (rel_prec == 1))) {
         if (g_sloppy_precision_flag == 1) {
diff --git a/src/lib/solver/monomial_solve.c b/src/lib/solver/monomial_solve.c
index 94873079f..0e73e9b0d 100644
--- a/src/lib/solver/monomial_solve.c
+++ b/src/lib/solver/monomial_solve.c
@@ -77,7 +77,7 @@
 #include "solver/solver_params.h"
 #include "solver/solver_types.h"
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
@@ -184,7 +184,7 @@ int solve_degenerate(spinor* const P, spinor* const Q, solver_params_t solver_pa
   } else if (solver_type == BICGSTAB) {
     iteration_count = bicgstab_complex(P, Q, max_iter, eps_sq, rel_prec, N, f);
   }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   else if (solver_type == MG)
     iteration_count = MG_solver(P, Q, eps_sq, max_iter, rel_prec, N, g_gauge_field, f);
 #endif
@@ -283,7 +283,7 @@ int solve_mms_tm(spinor** const P, spinor* const Q, solver_params_t* solver_para
       if (solver_params->type == CGMMS) {
     iteration_count = cg_mms_tm(P, Q, solver_params);
   }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   else if (solver_params->type == MG) {
     // if the mg_mms_mass is larger than the smallest shift we use MG
     if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) {
@@ -507,7 +507,7 @@ int solve_mms_nd(spinor** const Pup, spinor** const Pdn, spinor* const Qup, spin
     } else if (solver_params->type == CGMMSND) {
       iteration_count = cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
     }
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     else if (solver_params->type == MG) {
       // if the mg_mms_mass is larger than the smallest shift we use MG
       if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) {
@@ -691,7 +691,7 @@ int solve_mms_nd_plus(spinor** const Pup, spinor** const Pdn, spinor* const Qup,
 
   int iteration_count = 0;
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   // With MG we can solve directly the unsquared operator
   if (solver_params->type == MG) {
     matrix_mult_nd f = Qtm_tau1_ndpsi_add_Ishift;
diff --git a/src/lib/solver/solver_field.c b/src/lib/solver/solver_field.c
index 1cfd06515..5644a4cae 100644
--- a/src/lib/solver/solver_field.c
+++ b/src/lib/solver/solver_field.c
@@ -37,7 +37,7 @@ int init_solver_field(spinor*** const solver_field, const int V, const int nr) {
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] = (spinor*)shmalloc((nr * V + 1) * sizeof(spinor))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
     errno = 0;
@@ -74,7 +74,7 @@ int init_solver_field_32(spinor32*** const solver_field, const int V, const int
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] = (spinor32*)shmalloc((nr * V + 1) * sizeof(spinor32))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
     errno = 0;
@@ -143,7 +143,7 @@ int init_lsolver_field(_Complex double*** const solver_field, const int V, const
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] =
                   (_Complex double*)shmalloc((nr * V + 1) * sizeof(_Complex double))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
@@ -184,7 +184,7 @@ int init_lsolver_field_32(_Complex float*** const solver_field, const int V, con
   }
 
   /* allocate the full chunk of memory to solver_field[nr] */
-#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR))
+#if (defined TM_USE_SHMEM && !(defined TM_USE_HALFSPINOR))
   if ((void*)((*solver_field)[nr] =
                   (_Complex float*)shmalloc((nr * V + 1) * sizeof(_Complex float))) == NULL) {
     fprintf(stderr, "malloc errno in init_solver_field: %d\n", errno);
diff --git a/src/lib/spinor_fft.c b/src/lib/spinor_fft.c
index fb101d269..54ece4bda 100644
--- a/src/lib/spinor_fft.c
+++ b/src/lib/spinor_fft.c
@@ -22,7 +22,7 @@
 #include "mpi_init.h"
 #include "spinor_fft.h"
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 #include <fftw3.h>
 #endif
 
@@ -35,7 +35,7 @@ void check_mpi_comm_membership(MPI_Comm commself, MPI_Comm commcheck, const char
                                const char *name_b, FILE *logFile);
 #endif
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan2d(spinor *spinor_in, spinor *spinor_out, int dim0, int dim1, int howmany,
                              unsigned int forward, int fftw_flags);
 #endif
@@ -50,7 +50,7 @@ void spinor_fft_transpose_xp_t(spinor *fieldout, spinor *fieldin, int dim0, int
 void spinor_fft_reduce_2d(spinor *localSpinorField, int *collectionRank, spinor ***field_collection,
                           spinor **membuff) {
   /* this implementation is intended for four dimensional parallelisation */
-#if (defined PARALLELXYZT && defined TM_USE_MPI && defined HAVE_FFTW)
+#if (defined TM_PARALLELXYZT && defined TM_USE_MPI && defined TM_USE_FFTW)
 
   int sendRecvCoord[4];
   int i;
@@ -195,7 +195,7 @@ void spinor_fft_reduce_2d(spinor *localSpinorField, int *collectionRank, spinor
 void spinor_fft_redist_2d(spinor *localSpinorField, int collectionRank, spinor **field_collection,
                           spinor *membuff) {
   /* this implementation is intended for four dimensional parallelisation */
-#if (defined PARALLELXYZT && defined TM_USE_MPI && defined HAVE_FFTW)
+#if (defined TM_PARALLELXYZT && defined TM_USE_MPI && defined TM_USE_FFTW)
 
   int sendRecvCoord[4];
   int dims[] = {g_nproc_t, g_nproc_x, g_nproc_y, g_nproc_z};
@@ -326,7 +326,7 @@ void spinor_fft_redist_2d(spinor *localSpinorField, int collectionRank, spinor *
 #endif
 }
 
-#ifdef HAVE_FFTW
+#ifdef TM_USE_FFTW
 fftw_plan spinor_fftw_plan2d(spinor *spinor_in, spinor *spinor_out, int dim0, int dim1,
                              int howmany_wospin, unsigned int forward, int fftw_flags) {
   /*    int index_s = gsi(get_index(it, ix, iy, iz, T, L)); */
diff --git a/src/lib/test/Makefile b/src/lib/test/Makefile
deleted file mode 100644
index 8efc8b569..000000000
--- a/src/lib/test/Makefile
+++ /dev/null
@@ -1,88 +0,0 @@
-TARGETS = scalar_prod_r_test
-
-USESF = yes
-
-OS = -os3
-
-# gcc shouldn't see this options, that's why we don't use CGLAGS here
-NLCCFLAGS = -D_STD_C99_COMPLEX_CHECKED -D_STD_C99_COMPLEX -Dapenext
-INCLUDES = -I../
-# workaround to let nlcc not see the non-standard complex.h
-NLCCINCLUDES = -I${NROOT}/include/nlibc/ ${INCLUDES}
-
-NLCCOPTS = -gp ${NLCCFLAGS} ${NLCCINCLUDES}
-ifdef USESF
-  MPPOPTS = -sf -v
-  SHAKEROPTS = -n -z 
-else 
-  MPPOPTS = -v
-  SHAKEROPTS = +a -z 
-endif
-SOFANOPTS = --rr
-
-# needed due to a bug in nlcc
-NLCCOS = -OS3
-
-NLCC = nlcc-0.5.2
-MPP = mpp
-SOFAN = sofan
-SHAKER = shaker
-M4 = m4
-CCDEP = gcc
-DEPFLAGS = -MM -MQ $*.sasm ${CFLAGS} ${INCLUDES}
-
-DEPFILES = $(addsuffix .d, ${TARGETS})
-MEMFILES = $(addsuffix .mem, ${TARGETS}) $(addsuffix -sofan.mem, ${TARGETS}) \
-	   $(addsuffix .no, ${TARGETS}) $(addsuffix -sofan.no, ${TARGETS})
-ASMFILES = $(addsuffix .sasm, ${TARGETS}) $(addsuffix .masm, ${TARGETS}) $(addsuffix -sofan.masm, ${TARGETS})
-NCDFILES = $(addsuffix .ncd, ${TARGETS}) $(addsuffix -sofan.ncd, ${TARGETS})
-SFOUTFILES = $(addsuffix .svn-out, ${TARGETS}) $(addsuffix .svn-out%, ${TARGETS}) \
-             $(addsuffix .sf_log, ${TARGETS}) $(addsuffix .sf_log%, ${TARGETS}) \
-             $(addsuffix .sf_log0, ${TARGETS}) $(addsuffix .sf_log0%, ${TARGETS}) \
-             $(addsuffix .err-sf, ${TARGETS}) $(addsuffix .svn-out, ${TARGETS}) \
-             $(addsuffix .dmo, ${TARGETS}) \
-	     $(addsuffix -sofan.svn-out, ${TARGETS}) $(addsuffix -sofan.svn-out%, ${TARGETS}) \
-             $(addsuffix -sofan.sf_log, ${TARGETS}) $(addsuffix -sofan.sf_log%, ${TARGETS}) \
-             $(addsuffix -sofan.sf_log0, ${TARGETS}) $(addsuffix -sofan.sf_log0%, ${TARGETS}) \
-             $(addsuffix -sofan.err-sf, ${TARGETS}) $(addsuffix -sofan.svn-out, ${TARGETS}) \
-             $(addsuffix -sofan.dmo, ${TARGETS})
-GCCBINARIES = $(addsuffix .gccbin, ${TARGETS})
-
-all: $(addsuffix -sofan.mem, ${TARGETS})
-allgcc:  $(addsuffix .gccbin, ${TARGETS})
-
--include $(DEPFILES)
-
-%.mem: %.masm
-	${SHAKER} ${SHAKEROPTS} $<
-
-%.masm: %.sasm
-	${MPP} ${OS} ${MPPOPTS} $<
-
-%-sofan.masm: %.masm
-	${SOFAN} ${SOFANOPTS} $< $@
-
-%.sasm: %.c Makefile
-	${NLCC} ${NLCCOPTS} ${NLCCOS} -S $<
-
-%.ncd: %.mem
-	dispminit $< > $@
-
-%-sofan.perf: %-sofan.ncd
-	nperf -asm=$*.sasm -c -l -a $< > $@ || (rm -f $@; exit 1)
-
-# beware, this is not very general
-%.gccbin: %.c
-	gcc -I../ $< -o $@
-
-$(DEPFILES): %.d: %.c Makefile
-	$(CCDEP) ${DEPFLAGS} ${INCLUDES} $< > $@
-
-clean:
-	rm -f ${ASMFILES} ${MEMFILES} ${NCDFILES} ${GCCBINARIES}
-
-distclean: clean
-	rm -f ${DEPFILES} ${SFOUTFILES}
-
-.SECONDARY:
-.DELETE_ON_ERROR:
diff --git a/src/lib/test/check_geometry.c b/src/lib/test/check_geometry.c
index 74589a739..b9f14eb4d 100644
--- a/src/lib/test/check_geometry.c
+++ b/src/lib/test/check_geometry.c
@@ -90,7 +90,7 @@ int check_geometry() {
           ix = g_ipt[x0][x1][x2][x3];
 
           iy0 = g_iup[ix][0];
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x0 != T - 1) {
             iz0 = g_ipt[(x0 + 1) % T][x1][x2][x3];
           } else {
@@ -107,7 +107,7 @@ int check_geometry() {
 #endif
 
           iy1 = g_iup[ix][1];
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x1 != LX - 1) {
             iz1 = g_ipt[x0][(x1 + 1) % LX][x2][x3];
           } else {
@@ -125,7 +125,7 @@ int check_geometry() {
 #endif
 
           iy2 = g_iup[ix][2];
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x2 != LY - 1) {
             iz2 = g_ipt[x0][x1][(x2 + 1) % LY][x3];
           } else {
@@ -145,7 +145,7 @@ int check_geometry() {
 #endif
 
           iy3 = g_iup[ix][3];
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
           if (x3 != LZ - 1) {
             iz3 = g_ipt[x0][x1][x2][(x3 + 1) % LZ];
           } else {
@@ -176,7 +176,7 @@ int check_geometry() {
           }
 
           iy0 = g_idn[ix][0];
-#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x0 != 0) {
             iz0 = g_ipt[(x0 + T - 1) % T][x1][x2][x3];
           } else {
@@ -194,7 +194,7 @@ int check_geometry() {
 #endif
 
           iy1 = g_idn[ix][1];
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x1 != 0) {
             iz1 = g_ipt[x0][(x1 + LX - 1) % LX][x2][x3];
           } else {
@@ -212,7 +212,7 @@ int check_geometry() {
           iz1 = g_ipt[x0][(x1 + LX - 1) % LX][x2][x3];
 #endif
           iy2 = g_idn[ix][2];
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x2 != 0) {
             iz2 = g_ipt[x0][x1][(x2 + LY - 1) % LY][x3];
           } else {
@@ -231,7 +231,7 @@ int check_geometry() {
 #endif
 
           iy3 = g_idn[ix][3];
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
           if (x3 != 0) {
             iz3 = g_ipt[x0][x1][x2][(x3 + LZ - 1) % LZ];
           } else {
@@ -262,8 +262,8 @@ int check_geometry() {
           }
 
           /* The edges */
-          /* In case of PARALLELT there is actually no edge to take care of */
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+          /* In case of TM_PARALLELT there is actually no edge to take care of */
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
           if (x0 == 0) {
             iy0 = g_idn[g_idn[ix][1]][0];
             if (x1 != 0) {
@@ -318,7 +318,7 @@ int check_geometry() {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
           if (x0 == 0) {
             iy0 = g_idn[g_idn[ix][2]][0];
             if (x2 != 0) {
@@ -421,7 +421,7 @@ int check_geometry() {
             }
           }
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
           if (x0 == 0) {
             iy0 = g_idn[g_idn[ix][3]][0];
             if (x3 != 0) {
@@ -700,7 +700,7 @@ int check_geometry() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T + 2; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -827,7 +827,7 @@ int check_geometry() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     for (x0 = 0; x0 < T + 2; x0++) {
       for (x1 = 0; x1 < LX + 2; x1++) {
@@ -1027,7 +1027,7 @@ int check_geometry() {
       }
     }
 #endif
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
     for (x0 = 0; x0 < T + 2; x0++) {
       for (x1 = 0; x1 < LX + 2; x1++) {
         for (x2 = 0; x2 < LY + 2; x2++) {
diff --git a/src/lib/test/check_overlap.c b/src/lib/test/check_overlap.c
index 43742a21b..56763cff4 100644
--- a/src/lib/test/check_overlap.c
+++ b/src/lib/test/check_overlap.c
@@ -105,12 +105,12 @@ int main(int argc, char *argv[]) {
   char *gaugecksum = NULL;
   double plaquette_energy;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst init
 #pragma pomp inst begin(main)
 #endif
 
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
   MPI_File fh;
   LemonWriter *lemonWriter;
   paramsXlfInfo *xlfInfo;
@@ -188,7 +188,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -273,7 +273,7 @@ int main(int argc, char *argv[]) {
 
   phmc_invmaxev = 1.;
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
@@ -286,7 +286,7 @@ int main(int argc, char *argv[]) {
       exit(-1);
     }
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) {
     init_xchange_halffield();
   }
@@ -299,9 +299,9 @@ int main(int argc, char *argv[]) {
       printf("Reading Gauge field from file %s\n", conf_filename);
       fflush(stdout);
     }
-#ifdef HAVE_LIBLEMON
+#ifdef TM_USE_LEMON
     read_lemon_gauge_field_parallel(conf_filename, &gaugecksum, &xlfmessage, &gaugelfn);
-#else  /* HAVE_LIBLEMON */
+#else  /* TM_USE_LEMON */
     if (xlfmessage != (char *)NULL) free(xlfmessage);
     if (gaugelfn != (char *)NULL) free(gaugelfn);
     if (gaugecksum != (char *)NULL) free(gaugecksum);
@@ -310,7 +310,7 @@ int main(int argc, char *argv[]) {
     gaugelfn = read_message(conf_filename, "ildg-data-lfn");
     gaugecksum = read_message(conf_filename, "scidac-checksum");
     printf("%s \n", gaugecksum);
-#endif /* HAVE_LIBLEMON */
+#endif /* TM_USE_LEMON */
     if (g_proc_id == 0) {
       printf("done!\n");
       fflush(stdout);
@@ -389,7 +389,7 @@ int main(int argc, char *argv[]) {
     free_chi_dn_spinor_field();
   }
   return (0);
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(main)
 #endif
 }
diff --git a/src/lib/test/check_xchange.c b/src/lib/test/check_xchange.c
index db5d97cb3..a20f86df4 100644
--- a/src/lib/test/check_xchange.c
+++ b/src/lib/test/check_xchange.c
@@ -63,7 +63,7 @@ int check_xchange() {
       }
     }
 
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     for (x0 = 0; x0 < T; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -74,7 +74,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -113,7 +113,7 @@ int check_xchange() {
       }
     }
 
-#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT))
+#if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
     x = (double*)&g_spinor_field[0][(VOLUME + 2 * LX * LY * LZ) / 2];
     for (i = 0; i < T * LY * LZ / 2 * 24; i++, x++) {
       if ((int)(*x) != g_nb_x_up) {
@@ -139,7 +139,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     x = (double*)&g_spinor_field[0][(VOLUME + 2 * LX * LY * LZ) / 2 + 2 * T * LY * LZ / 2];
     for (i = 0; i < T * LX * LZ / 2 * 24; i++, x++) {
       if ((int)(*x) != g_nb_y_up) {
@@ -166,7 +166,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     set_spinor_field(0, -1.);
 
     for (x0 = 0; x0 < T; x0++) {
@@ -270,7 +270,7 @@ int check_xchange() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* Set the x boundary */
     for (x0 = 0; x0 < T; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
@@ -284,7 +284,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* Set the y boundary */
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
@@ -298,7 +298,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     /* Set the z boundary */
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
@@ -340,7 +340,7 @@ int check_xchange() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     x = (double*)&g_gauge_field[(T + 2) * LX * LY * LZ][0];
     for (i = 0; i < T * LY * LZ * 72; i++, x++) {
       if ((int)(*x) != g_nb_x_up) {
@@ -368,7 +368,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     x = (double*)&g_gauge_field[(T + 2) * LX * LY * LZ + 2 * T * LZ * LY][0];
     for (i = 0; i < T * LX * LZ * 72; i++, x++) {
       if ((int)(*x) != g_nb_y_up) {
@@ -396,7 +396,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     x = (double*)g_gauge_field[VOLUME + 2 * LX * LY * LZ + 2 * T * LZ * LY + 2 * T * LX * LZ];
     for (i = 0; i < T * LX * LY * 72; i++, x++) {
       if ((int)(*x) != g_nb_z_up) {
@@ -504,7 +504,7 @@ int check_xchange() {
     MPI_Barrier(MPI_COMM_WORLD);
 
     /* The edges */
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     fprintf(stdout, "# Rank: %d, (c0, c1, c2, c3) = (%d, %d, %d, %d)\n", g_proc_id,
             g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]);
     fflush(stdout);
@@ -577,7 +577,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
     di[2] = (g_proc_coords[2] - 1) % g_nproc_y;
     di[0] = g_proc_coords[0];
@@ -712,7 +712,7 @@ int check_xchange() {
       }
     }
 #endif
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
     di[3] = (g_proc_coords[3] - 1) % g_nproc_z;
     di[0] = g_proc_coords[0];
@@ -1001,7 +1001,7 @@ int check_xchange() {
         }
       }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND + 2 * LX * LY * LZ][0];
       for (i = 0; i < T * LY * LZ * 72; i++, x++) {
         if ((int)(*x) != g_nb_x_up) {
@@ -1029,7 +1029,7 @@ int check_xchange() {
       }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LZ * LY][0];
       for (i = 0; i < T * LX * LZ * 72; i++, x++) {
         if ((int)(*x) != g_nb_y_up) {
@@ -1058,7 +1058,7 @@ int check_xchange() {
       }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND + 2 * LX * LY * LZ + 2 * T * LZ * LY +
                                   2 * T * LX * LZ][0];
       for (i = 0; i < T * LX * LY * 72; i++, x++) {
@@ -1088,7 +1088,7 @@ int check_xchange() {
       }
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
       set_gauge_field(-1.);
 
@@ -1279,7 +1279,7 @@ int check_xchange() {
           }
         }
       }
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
       /* Set the tz boundary */
       for (x1 = 0; x1 < LX; x1++) {
         for (x2 = 0; x2 < LY; x2++) {
@@ -1332,7 +1332,7 @@ int check_xchange() {
       xchange_gauge(g_gauge_field);
       MPI_Barrier(MPI_COMM_WORLD);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
       di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
       di[2] = g_proc_coords[2];
@@ -1453,7 +1453,7 @@ int check_xchange() {
       }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
       di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
       di[2] = (g_proc_coords[2] - 1) % g_nproc_y;
@@ -1693,7 +1693,7 @@ int check_xchange() {
         }
       }
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
       di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
       di[3] = (g_proc_coords[3] - 1) % g_nproc_z;
@@ -2123,7 +2123,7 @@ int check_xchange() {
         }
       }
     }
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2145,7 +2145,7 @@ int check_xchange() {
       }
     }
 #endif
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2167,7 +2167,7 @@ int check_xchange() {
       }
     }
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     for (x0 = 0; x0 < T; x0++) {
       for (x1 = 0; x1 < LX; x1++) {
         for (x2 = 0; x2 < LY; x2++) {
@@ -2194,7 +2194,7 @@ int check_xchange() {
     xchange_deri(df0);
     MPI_Barrier(MPI_COMM_WORLD);
 
-#if defined PARALLELT
+#if defined TM_PARALLELT
     for (x1 = 0; x1 < LX; x1++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2228,7 +2228,7 @@ int check_xchange() {
       }
     }
 #endif
-#if defined PARALLELXT
+#if defined TM_PARALLELXT
     for (x1 = 1; x1 < LX - 1; x1++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2351,7 +2351,7 @@ int check_xchange() {
       }
     }
 #endif
-#if defined PARALLELXYT
+#if defined TM_PARALLELXYT
     for (x1 = 1; x1 < LX - 1; x1++) {
       for (x2 = 1; x2 < LY - 1; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2748,7 +2748,7 @@ int check_xchange() {
 
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     for (x1 = 1; x1 < LX - 1; x1++) {
       for (x2 = 1; x2 < LY - 1; x2++) {
         for (x3 = 1; x3 < LZ - 1; x3++) {
@@ -3026,7 +3026,7 @@ int check_xchange() {
       }
     }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     // xt edge
     for (x2 = 0; x2 < LY; x2++) {
@@ -3063,7 +3063,7 @@ int check_xchange() {
     }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     // ty edge
     for (x1 = 0; x1 < LX; x1++) {
@@ -3139,7 +3139,7 @@ int check_xchange() {
     xchange_deri(df0);
     MPI_Barrier(MPI_COMM_WORLD);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
@@ -3156,7 +3156,7 @@ int check_xchange() {
     di[1] = (g_proc_coords[1] + 1) % g_nproc_x;
     MPI_Cart_rank(g_cart_grid, di, &pp);
 
-#ifdef PARALLELXT
+#ifdef TM_PARALLELXT
     for (x2 = 0; x2 < LY; x2++) {
       for (x3 = 0; x3 < LZ; x3++) {
 #else
@@ -3224,7 +3224,7 @@ int check_xchange() {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
     // xy-edge
     di[1] = (g_proc_coords[1] - 1) % g_nproc_x;
diff --git a/src/lib/test/measure_rectangles.debug.c b/src/lib/test/measure_rectangles.debug.c
index 75a71d2b2..422f681b2 100644
--- a/src/lib/test/measure_rectangles.debug.c
+++ b/src/lib/test/measure_rectangles.debug.c
@@ -61,10 +61,10 @@ double measure_rectangles() {
   char filename[100];
 
   sprintf(filename, "debug_mr.s");
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   sprintf(filename, "debug_mr.pt.%d", g_proc_id);
 #endif
-#ifdef PARALLELXT
+#ifdef TM_PARALLELXT
   sprintf(filename, "debug_mr.pxt.%d", g_proc_id);
 #endif
   debugfile = fopen(filename, "w");
diff --git a/src/lib/update_backward_gauge.c b/src/lib/update_backward_gauge.c
index a041e577c..b28ab6acf 100644
--- a/src/lib/update_backward_gauge.c
+++ b/src/lib/update_backward_gauge.c
@@ -25,7 +25,7 @@
 #include "su3.h"
 #include "update_backward_gauge.h"
 
-#if defined _USE_HALFSPINOR
+#if defined TM_USE_HALFSPINOR
 void update_backward_gauge(su3** const gf) {
 #ifdef TM_USE_OMP
 #pragma omp parallel
diff --git a/src/lib/update_gauge.c b/src/lib/update_gauge.c
index dde4cbf31..af4730e01 100644
--- a/src/lib/update_gauge.c
+++ b/src/lib/update_gauge.c
@@ -39,7 +39,7 @@
 #include "su3spinor.h"
 #include "update_gauge.h"
 #include "xchange/xchange.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 /*******************************************************
@@ -51,7 +51,7 @@
 void update_gauge(const double step, hamiltonian_field_t *const hf) {
   tm_stopwatch_push(&g_timers, __func__, "");
   update_tm_gauge_id(&g_gauge_state, step);
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(step);
 #endif
 
@@ -65,7 +65,7 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
     su3 *z;
     static su3adj deriv;
     su3adj *xm;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(updategauge)
 #endif
 
@@ -115,7 +115,7 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
 
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(updategauge)
 #endif
 }
diff --git a/src/lib/update_momenta_fg.c b/src/lib/update_momenta_fg.c
index 0aab582cd..cf1e9e4fb 100644
--- a/src/lib/update_momenta_fg.c
+++ b/src/lib/update_momenta_fg.c
@@ -44,7 +44,7 @@
 #include "su3adj.h"
 #include "su3spinor.h"
 #include "xchange/xchange.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -123,7 +123,7 @@ void fg_update_momenta_reset_gaugefield(const double step, hamiltonian_field_t *
  *******************************************************/
 void update_momenta_fg(int *mnllist, double step, const int no, hamiltonian_field_t *const hf,
                        double step0) {
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(0.0);
 #endif
   if (g_exposu3_no_c == 0) init_exposu3();
@@ -156,7 +156,7 @@ void update_momenta_fg(int *mnllist, double step, const int no, hamiltonian_fiel
   /* for parallelization */
   xchange_gauge(hf->gaugefield);
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(0.0);
 #endif
 
@@ -201,7 +201,7 @@ void update_momenta_fg(int *mnllist, double step, const int no, hamiltonian_fiel
   /* for parallelization */
   xchange_gauge(hf->gaugefield);
 #endif
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_update_gauge(0.0);
 #endif
 
diff --git a/src/lib/update_tm.c b/src/lib/update_tm.c
index 72a6194e7..3f1cdc5d5 100644
--- a/src/lib/update_tm.c
+++ b/src/lib/update_tm.c
@@ -64,7 +64,7 @@
 #include "su3.h"
 #include "update_tm.h"
 #include "xchange/xchange.h"
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 
@@ -120,7 +120,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
     }
   }
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
   MG_reset();
 #endif
 
@@ -211,7 +211,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
       free(xlfInfo);
     }
 
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     MG_reset();
 #endif
 
@@ -354,7 +354,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
     // will result in the updated gauge field to be propagated
     update_tm_gauge_id(&g_gauge_state, TM_GAUGE_PROPAGATE_THRESHOLD);
     update_tm_gauge_id(&g_gauge_state_32, TM_GAUGE_PROPAGATE_THRESHOLD);
-#ifdef DDalphaAMG
+#ifdef TM_USE_DDalphaAMG
     MG_reset();
 #endif
   }
diff --git a/src/lib/util/io.c b/src/lib/util/io.c
index 6df42d288..4f6267c78 100644
--- a/src/lib/util/io.c
+++ b/src/lib/util/io.c
@@ -36,7 +36,8 @@
  *
  */
 
-#define _FILE_OFFSET_BITS 64
+/* libc feature-test macro enabling 64-bit off_t; it is read by the system headers, so it must keep this exact name (no TM_ prefix) and stay above every #include */
+#define _FILE_OFFSET_BITS 64
 
 #include "io.h"
 #include <stdio.h>
diff --git a/src/lib/util/laguer/Makefile b/src/lib/util/laguer/Makefile
deleted file mode 100644
index f9bce70e3..000000000
--- a/src/lib/util/laguer/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-CXX=g++
-CXXFLAGS=-g -O2
-CLNDIR=${HOME}/daten/workdir/cln/
-
-chebyRoot: chebyRoot.C Makefile chebyRoot.H
-	${CXX} $< -g -o $@ -I${CLNDIR}/include/ -L${CLNDIR}/lib -lcln -lm
-
-clean:
-	rm -f *.o chebyRoot *.dat *.log *~
diff --git a/src/lib/util/oox/Makefile b/src/lib/util/oox/Makefile
deleted file mode 100644
index 88de5bdd5..000000000
--- a/src/lib/util/oox/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-CC=gcc
-CXX=g++
-CFLAGS=-O2 -fexpensive-optimizations -fomit-frame-pointer # -mfpmath=sse -msse2 
-LIBS=-lm
-OBJECTS_OOX=oox.o
-INCLUDE=-I./
-
-
-# variables for oox_ga executable
-# if you want to compile with ga lib support
-# please adjust the GALIBPATH variable
-# to the toplevel dir of galib
-# it is assumed that you compiled the library
-# such that a libga.a file is present in the 
-# ./ga subdir of galib
-GALIBPATH=/usr1/scratch/annube/galib247
-LIBS_GA=${LIBS} -L${GALIBPATH}/ga -lga
-CFLAGS_GA=${CFLAGS} -DWITHGALIB
-INCLUDE_GA=${INCLUDE} -I${GALIBPATH}
-OBJECTS_OOX_GA=oox_ga.o oox_gawrapper.o
-
-
-all: oox oox_ga
-
-oox: ${OBJECTS_OOX} Makefile
-	${CXX} ${OBJECTS_OOX} -o $@ ${CFLAGS} ${LIBS}
-
-oox_ga: ${OBJECTS_OOX_GA} Makefile
-	${CXX} ${OBJECTS_OOX_GA} -o $@ ${CFLAGS_GA} ${LIBS_GA}
-
-oox_gawrapper.o: oox_gawrapper.cxx
-	${CXX} ${CFLAGS_GA} -o $@ -c $< ${INCLUDE_GA}
-
-oox_ga.o: oox.c
-	${CC} ${CFLAGS_GA} -o $@ -c $< ${INCLUDE_GA}
-
-clean:
-	rm oox oox_ga *.o
-
-.SUFFIXES:
-
-%.o: %.c
-	${CC} ${CFLAGS}	-o $@ -c $< ${INCLUDE}
-
-%.o: %.cxx
-	${CXX} ${CFLAGS} -o $@ -c $< ${INCLUDE}
diff --git a/src/lib/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
index 6c95a27d5..9f083adc5 100644
--- a/src/lib/wrapper/lib_wrapper.c
+++ b/src/lib/wrapper/lib_wrapper.c
@@ -121,7 +121,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   for (int j = 0; j < no_operators; j++)
     if (!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
-#ifdef _GAUGE_COPY
+#ifdef TM_GAUGE_COPY
   int j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
@@ -161,7 +161,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   // initialise the operators
   init_operators();
 
-#ifdef _USE_HALFSPINOR
+#ifdef TM_USE_HALFSPINOR
   j = init_dirac_halfspinor();
   if (j != 0) {
     fprintf(stderr, "tmLQCD_init_invert: Not enough memory for halffield! Aborting...\n");
@@ -172,7 +172,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
     fprintf(stderr, "tmLQCD_init_invert: Not enough memory for 32-bit halffield! Aborting...\n");
     return (-1);
   }
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
   if (even_odd_flag) init_xchange_halffield();
 #endif
 #endif
diff --git a/src/lib/xchange/xchange_2fields.c b/src/lib/xchange/xchange_2fields.c
index c5dfa86a8..c311bf908 100644
--- a/src/lib/xchange/xchange_2fields.c
+++ b/src/lib/xchange/xchange_2fields.c
@@ -41,18 +41,18 @@
 #include "su3.h"
 #include "xchange_2fields.h"
 
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 
 /* this version uses non-blocking MPI calls */
 void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   MPI_Request requests[32];
   MPI_Status status[32];
   int reqcount = 0;
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   int ix = 0;
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange2fields)
 #endif
 
@@ -88,7 +88,7 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
             g_cart_grid, &requests[reqcount + 1]);
   reqcount = reqcount + 2;
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[reqcount]);
@@ -120,7 +120,7 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   reqcount = reqcount + 2;
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[reqcount]);
@@ -153,7 +153,7 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* fill buffer ! */
   /* This is now depending on whether the field is */
   /* even or odd */
@@ -237,8 +237,8 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   MPI_Waitall(reqcount, requests, status);
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange2fields)
 #endif
 }
-#endif /*  _NON_BLOCKING */
+#endif /*  TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_2fields.h b/src/lib/xchange/xchange_2fields.h
index 35dc7f6c5..6a83085f0 100644
--- a/src/lib/xchange/xchange_2fields.h
+++ b/src/lib/xchange/xchange_2fields.h
@@ -31,7 +31,7 @@
 #define EVEN 1
 #define ODD 0
 
-#ifdef _NON_BLOCKING
+#ifdef TM_NON_BLOCKING
 void xchange_2fields(spinor* const k, spinor* const l, const int ieo);
 #else
 #define xchange_2fields(k, l, ieo) \
diff --git a/src/lib/xchange/xchange_deri.c b/src/lib/xchange/xchange_deri.c
index a260ed8b6..7defa1e7c 100644
--- a/src/lib/xchange/xchange_deri.c
+++ b/src/lib/xchange/xchange_deri.c
@@ -55,7 +55,7 @@ void xchange_deri(su3adj** const df) {
 #ifdef TM_USE_MPI
   int ix, iy, t, y, z, x;
   MPI_Status status;
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* The edges need to come first */
 
   /* send the data to the neighbour on the left in t direction */
@@ -96,9 +96,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* edges */
 
   /* send the data to the neighbour on the left in x direction */
@@ -178,9 +178,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
 
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
@@ -305,7 +305,7 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* PARALLELXYZT */
+#endif /* TM_PARALLELXYZT */
 
   // now the normal boundaries
 
@@ -341,7 +341,7 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)df[(T + 2) * LX * LY * LZ + T * LY * LZ], 1, deri_x_slice_cont, g_nb_x_dn, 42,
@@ -372,9 +372,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
@@ -406,9 +406,9 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* (defined PARALLELXYT || defined PARALLELXYZT) */
+#endif /* (defined TM_PARALLELXYT || defined TM_PARALLELXYZT) */
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv(
@@ -441,7 +441,7 @@ void xchange_deri(su3adj** const df) {
     }
   }
 
-#endif /* PARALLELXYZT */
+#endif /* TM_PARALLELXYZT */
 #endif /* MPI */
   return;
 }
diff --git a/src/lib/xchange/xchange_field.c b/src/lib/xchange/xchange_field.c
index 576574789..417aa8981 100644
--- a/src/lib/xchange/xchange_field.c
+++ b/src/lib/xchange/xchange_field.c
@@ -35,7 +35,7 @@
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
-#ifdef _USE_SHMEM
+#ifdef TM_USE_SHMEM
 #include <mpp/shmem.h>
 #endif
 
@@ -44,30 +44,30 @@
 #include "su3.h"
 #include "xchange_field.h"
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
 #pragma disjoint(*field_buffer_z2, *field_buffer_z)
 #endif
 
 /* this version uses non-blocking MPI calls */
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 
 void xchange_field(spinor* const l, const int ieo) {
 #ifdef TM_USE_MPI
   MPI_Request requests[16];
   MPI_Status status[16];
 #endif
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int ix = 0;
   int reqcount = 16;
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangefield)
 #endif
 
@@ -84,7 +84,7 @@ void xchange_field(spinor* const l, const int ieo) {
     MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[0]);
     MPI_Irecv((void*)(l + T * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_up, 81,
               g_cart_grid, &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in x direction */
     /* recieve the data from the neighbour on the right in x direction */
     MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[4]);
@@ -92,7 +92,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_cart_grid, &requests[5]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in y direction */
     /* recieve the data from the neighbour on the right in y direction */
     MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[8]);
@@ -100,7 +100,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_y_up, 101, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     /* fill buffer ! */
     /* This is now depending on whether the field is */
     /* even or odd */
@@ -129,7 +129,7 @@ void xchange_field(spinor* const l, const int ieo) {
     MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_dn, 82,
               g_cart_grid, &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in x direction */
     /* recieve the data from the neighbour on the left in x direction */
     MPI_Isend((void*)(l + (LX - 1) * LY * LZ / 2), 1, field_x_slice_gath, g_nb_x_up, 92,
@@ -138,7 +138,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_x_dn, 92, g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in y direction */
     /* recieve the data from the neighbour on the left in y direction */
     MPI_Isend((void*)(l + (LY - 1) * LZ / 2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid,
@@ -147,7 +147,7 @@ void xchange_field(spinor* const l, const int ieo) {
               field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     if (ieo == 1) {
       for (ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
         field_buffer_z2[ix - T * LX * LY / 2] = l[g_field_z_ipt_even[ix]];
@@ -174,7 +174,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_cart_grid, &requests[0]);
     MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_dn, 82,
               g_cart_grid, &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in x direction */
     /* recieve the data from the neighbour on the left in x direction */
     MPI_Isend((void*)(l + (LX - 1) * LY * LZ / 2), 1, field_x_slice_gath, g_nb_x_up, 92,
@@ -183,7 +183,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_x_dn, 92, g_cart_grid, &requests[5]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the right in y direction */
     /* recieve the data from the neighbour on the left in y direction */
     MPI_Isend((void*)(l + (LY - 1) * LZ / 2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid,
@@ -192,7 +192,7 @@ void xchange_field(spinor* const l, const int ieo) {
               field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
     /* fill buffer ! */
     /* This is now depending on whether the field is */
     /* even or odd */
@@ -218,7 +218,7 @@ void xchange_field(spinor* const l, const int ieo) {
     MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[2]);
     MPI_Irecv((void*)(l + T * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_up, 81,
               g_cart_grid, &requests[3]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in x direction */
     /* recieve the data from the neighbour on the right in x direction */
     MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[6]);
@@ -226,7 +226,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
     /* send the data to the neighbour on the left in y direction */
     /* recieve the data from the neighbour on the right in y direction */
     MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[10]);
@@ -234,7 +234,7 @@ void xchange_field(spinor* const l, const int ieo) {
               g_nb_y_up, 101, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
     if (ieo == 1) {
       for (ix = T * LX * LY / 2; ix < T * LX * LY; ix++) {
         field_buffer_z2[ix - T * LX * LY / 2] = l[g_field_z_ipt_even[ix]];
@@ -259,12 +259,12 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangefield)
 #endif
 }
 
-#elif (defined _USE_SHMEM) /* _NON_BLOCKING */
+#elif (defined TM_USE_SHMEM) /* TM_NON_BLOCKING */
 
 /* Here comes the version with shared memory */
 /* exchanges the field  l */
@@ -273,7 +273,7 @@ void xchange_field(spinor* const l, const int ieo) {
 #ifdef TM_USE_MPI
   int i, ix, mu, x0, x1, x2, x3, k;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangefield)
 #endif
 
@@ -283,7 +283,7 @@ void xchange_field(spinor* const l, const int ieo) {
   shmem_double_put((double*)(l + (T + 1) * LX * LY * LZ / 2),
                    (double*)(l + (T - 1) * LX * LY * LZ / 2), (LX * LY * LZ * 12), g_nb_t_up);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   k = (T + 2) * LX * LY * LZ / 2;
   for (x0 = 0; x0 < T; x0++) {
     shmem_double_put((double*)(l + k), (double*)(l + g_lexic2eo[g_ipt[x0][0][0][0]]), 12 * LZ * LY,
@@ -298,7 +298,7 @@ void xchange_field(spinor* const l, const int ieo) {
   }
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   k = ((T + 2) * LX * LY * LZ + 2 * T * LY * LZ) / 2;
   for (x0 = 0; x0 < T; x0++) {
     for (x1 = 0; x1 < LX; x1++) {
@@ -317,7 +317,7 @@ void xchange_field(spinor* const l, const int ieo) {
   }
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   x0 = (VOLUME / 2 + LX * LY * LZ + T * LY * LZ + T * LX * LZ);
   if (ieo == 1) {
     for (k = 0; k < T * LX * LY / 2; k++) {
@@ -347,21 +347,21 @@ void xchange_field(spinor* const l, const int ieo) {
   shmem_barrier_all();
 #endif  // MPI
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangefield)
 #endif
 }
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* _NON_BLOCKING _USE_SHMEM */
+#else /* TM_NON_BLOCKING TM_USE_SHMEM */
 /* exchanges the field  l */
 void xchange_field(spinor* const l, const int ieo) {
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangefield)
 #endif
 
@@ -379,7 +379,7 @@ void xchange_field(spinor* const l, const int ieo) {
                (void*)(l + (T + 1) * LX * LY * LZ / 2), 1, field_time_slice_cont, g_nb_t_dn, 82,
                g_cart_grid, &status);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91,
@@ -394,7 +394,7 @@ void xchange_field(spinor* const l, const int ieo) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101,
@@ -409,7 +409,7 @@ void xchange_field(spinor* const l, const int ieo) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* fill buffer ! */
   /* This is now depending on whether the field is */
   /* even or odd */
@@ -448,9 +448,9 @@ void xchange_field(spinor* const l, const int ieo) {
 #endif
 #endif  // MPI
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangefield)
 #endif
 }
 
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_gauge.c b/src/lib/xchange/xchange_gauge.c
index 3465d970f..6177a3dbb 100644
--- a/src/lib/xchange/xchange_gauge.c
+++ b/src/lib/xchange/xchange_gauge.c
@@ -38,7 +38,7 @@
 #include "su3adj.h"
 #include "xchange_gauge.h"
 
-#if defined _NON_BLOCKING
+#if defined TM_NON_BLOCKING
 void xchange_gauge(su3** const gf) {
   int cntr = 0;
 #ifdef TM_USE_MPI
@@ -80,7 +80,7 @@ void xchange_gauge(su3** const gf) {
     cntr = cntr + 2;
   }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend(gf[0], 1, gauge_x_slice_gath, g_nb_x_dn, 87, g_cart_grid, &request[cntr]);
@@ -117,7 +117,7 @@ void xchange_gauge(su3** const gf) {
 #endif
   MPI_Waitall(cntr, request, status);
   cntr = 0;
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* The edges */
 
   /* send the data to the neighbour on the left in t direction */
@@ -175,10 +175,10 @@ void xchange_gauge(su3** const gf) {
               g_cart_grid, &request[cntr + 1]);
     cntr = cntr + 2;
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend(gf[0], 1, gauge_y_slice_gath, g_nb_y_dn, 106, g_cart_grid, &request[cntr]);
@@ -212,7 +212,7 @@ void xchange_gauge(su3** const gf) {
 #endif
   MPI_Waitall(cntr, request, status);
   cntr = 0;
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* jetzt wirds richtig eklig ... */
 
@@ -326,9 +326,9 @@ void xchange_gauge(su3** const gf) {
     cntr = cntr + 2;
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -361,7 +361,7 @@ void xchange_gauge(su3** const gf) {
   }
 #endif
   MPI_Waitall(cntr, request, status);
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   cntr = 0;
   /* edges */
 
@@ -538,13 +538,13 @@ void xchange_gauge(su3** const gf) {
   }
   MPI_Waitall(cntr, request, status);
 
-  /* end of if defined PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYZT */
 #endif
 #endif
   return;
 }
 
-#else /* _NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 void xchange_gauge(su3** const gf) {
 
 #ifdef TM_USE_MPI
@@ -576,7 +576,7 @@ void xchange_gauge(su3** const gf) {
                  g_cart_grid, &status);
   }
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv(gf[0], 1, gauge_x_slice_gath, g_nb_x_dn, 93, gf[(T + 2) * LX * LY * LZ], 1,
@@ -648,10 +648,10 @@ void xchange_gauge(su3** const gf) {
                  g_nb_t_up, 98, gf[VOLUMEPLUSRAND + RAND + 6 * LY * LZ], 1, gauge_xt_edge_cont,
                  g_nb_t_dn, 98, g_cart_grid, &status);
   }
-  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+  /* end of if defined TM_PARALLELXT || TM_PARALLELXYT || TM_PARALLELXYZT*/
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv(gf[0], 1, gauge_y_slice_gath, g_nb_y_dn, 103,
@@ -770,9 +770,9 @@ void xchange_gauge(su3** const gf) {
                  gauge_ty_edge_cont, g_nb_y_dn, 298, g_cart_grid, &status);
   }
 
-  /* end of if defined PARALLELXYT || PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYT || TM_PARALLELXYZT */
 #endif
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
   /* z-Rand */
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -954,11 +954,11 @@ void xchange_gauge(su3** const gf) {
                  1, gauge_zy_edge_cont, g_nb_y_dn, 510, g_cart_grid, &status);
   }
 
-  /* end of if defined PARALLELXYZT */
+  /* end of if defined TM_PARALLELXYZT */
 #endif
 #endif
   return;
 }
 
 
-#endif /* _NON_BLOCKING */
+#endif /* TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index d1eae8a04..3948aa1ca 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -41,9 +41,9 @@
 #include "su3.h"
 #include "xchange_halffield.h"
 
-#if (defined _USE_HALFSPINOR)
+#if (defined TM_USE_HALFSPINOR)
 
-#if (defined _PERSISTENT)
+#if (defined TM_PERSISTENT)
 
 MPI_Request prequests[16];
 
@@ -51,13 +51,13 @@ MPI_Request prequests[16];
 void init_xchange_halffield() {
 #ifdef TM_USE_MPI
 
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
   int reqcount = 16;
 #endif
@@ -78,7 +78,7 @@ void init_xchange_halffield() {
   MPI_Recv_init((void*)(recvBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 82, g_cart_grid,
                 &prequests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
@@ -97,7 +97,7 @@ void init_xchange_halffield() {
                 g_cart_grid, &prequests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Send_init((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ), T * LX * LZ * 12 / 2, MPI_DOUBLE,
@@ -115,7 +115,7 @@ void init_xchange_halffield() {
                 g_nb_y_up, 102, g_cart_grid, &prequests[11]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in z direction */
   /* recieve the data from the neighbour on the left in z direction */
   MPI_Send_init((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ + T * LX * LZ),
@@ -141,13 +141,13 @@ void xchange_halffield() {
 #ifdef TM_USE_MPI
 
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
   int reqcount = 16;
 #endif
@@ -158,7 +158,7 @@ void xchange_halffield() {
   return;
 }
 
-#else /* def (_USE_SHMEM || _PERSISTENT) */
+#else /* def (TM_USE_SHMEM || TM_PERSISTENT) */
 /* 4. */
 void xchange_halffield() {
 
@@ -166,17 +166,17 @@ void xchange_halffield() {
 
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangehalf)
 #endif
   /* send the data to the neighbour on the right in t direction */
@@ -193,7 +193,7 @@ void xchange_halffield() {
   MPI_Irecv((void*)(recvBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 82, g_cart_grid,
             &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
@@ -210,7 +210,7 @@ void xchange_halffield() {
             g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ), T * LX * LZ * 12 / 2, MPI_DOUBLE,
@@ -226,7 +226,7 @@ void xchange_halffield() {
             g_nb_y_up, 102, g_cart_grid, &requests[11]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in z direction */
   /* recieve the data from the neighbour on the left in z direction */
   MPI_Isend((void*)(sendBuffer + LX * LY * LZ + T * LY * LZ + T * LX * LZ), T * LX * LY * 12 / 2,
@@ -246,27 +246,27 @@ void xchange_halffield() {
 #endif /* MPI */
   return;
 
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangehalf)
 #endif
 }
 
-#endif /* def (_USE_SHMEM || _PERSISTENT) */
+#endif /* def (TM_USE_SHMEM || TM_PERSISTENT) */
 void xchange_halffield32() {
 #ifdef TM_USE_MPI
 
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchangehalf32)
 #endif
 
@@ -284,7 +284,7 @@ void xchange_halffield32() {
   MPI_Irecv((void*)(recvBuffer32), LX * LY * LZ * 12 / 2, MPI_FLOAT, g_nb_t_up, 82, g_cart_grid,
             &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
@@ -301,7 +301,7 @@ void xchange_halffield32() {
             g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(sendBuffer32 + LX * LY * LZ + T * LY * LZ), T * LX * LZ * 12 / 2, MPI_FLOAT,
@@ -317,7 +317,7 @@ void xchange_halffield32() {
             g_nb_y_up, 102, g_cart_grid, &requests[11]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in z direction */
   /* recieve the data from the neighbour on the left in z direction */
   MPI_Isend((void*)(sendBuffer32 + LX * LY * LZ + T * LY * LZ + T * LX * LZ), T * LX * LY * 12 / 2,
@@ -336,8 +336,8 @@ void xchange_halffield32() {
   MPI_Waitall(reqcount, requests, status);
 #endif /* MPI */
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchangehalf32)
 #endif
 }
-#endif /* defined _USE_HALFSPINOR */
+#endif /* defined TM_USE_HALFSPINOR */
diff --git a/src/lib/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
index 9def17fc6..56cc4315c 100644
--- a/src/lib/xchange/xchange_lexicfield.c
+++ b/src/lib/xchange/xchange_lexicfield.c
@@ -43,7 +43,7 @@
 #include "xchange_lexicfield.h"
 
 /* this version uses non-blocking MPI calls */
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 
 /* this is the version independent of the content of the function Index (only available with
  * non-blocking)) */
@@ -51,16 +51,16 @@
 void xchange_lexicfield(spinor* const l) {
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield)
 #endif
 
@@ -71,7 +71,7 @@ void xchange_lexicfield(spinor* const l) {
   MPI_Isend((void*)l, 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
   MPI_Irecv((void*)(l + VOLUME), 1, lfield_time_slice_cont, g_nb_t_up, 5081, g_cart_grid,
             &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend((void*)l, 1, lfield_x_slice_gath, g_nb_x_dn, 5091, g_cart_grid, &requests[4]);
@@ -80,7 +80,7 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend((void*)l, 1, lfield_y_slice_gath, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
@@ -88,7 +88,7 @@ void xchange_lexicfield(spinor* const l) {
             5101, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -103,7 +103,7 @@ void xchange_lexicfield(spinor* const l) {
   MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont, g_nb_t_dn, 5082,
             g_cart_grid, &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
   MPI_Isend((void*)(l + (LX - 1) * LY * LZ), 1, lfield_x_slice_gath, g_nb_x_up, 5092, g_cart_grid,
@@ -112,7 +112,7 @@ void xchange_lexicfield(spinor* const l) {
             5092, g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(l + (LY - 1) * LZ), 1, lfield_y_slice_gath, g_nb_y_up, 5102, g_cart_grid,
@@ -121,7 +121,7 @@ void xchange_lexicfield(spinor* const l) {
             g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
@@ -135,21 +135,21 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield)
 #endif
 }
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* _NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 /* exchanges the field  l */
 void xchange_lexicfield(spinor* const l) {
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield)
 #endif
 
@@ -167,7 +167,7 @@ void xchange_lexicfield(spinor* const l) {
                (void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont, g_nb_t_dn, 5082,
                g_cart_grid, &status);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)l, 1, lfield_x_slice_gath, g_nb_x_dn, 5091,
@@ -182,7 +182,7 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv((void*)l, 1, lfield_y_slice_gath, g_nb_y_dn, 5101,
@@ -197,7 +197,7 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
   MPI_Sendrecv((void*)l, 1, lfield_z_slice_gath, g_nb_z_dn, 5503,
@@ -214,7 +214,7 @@ void xchange_lexicfield(spinor* const l) {
 #endif
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield)
 #endif
 }
@@ -226,20 +226,20 @@ void xchange_lexicfield(spinor* const l) {
  ***********************************************************************/
 
 /* this version uses non-blocking MPI calls */
-#if (defined _NON_BLOCKING)
+#if (defined TM_NON_BLOCKING)
 void xchange_lexicfield32(spinor32* const l) {
   MPI_Request requests[16];
   MPI_Status status[16];
-#ifdef PARALLELT
+#ifdef TM_PARALLELT
   int reqcount = 4;
-#elif defined PARALLELXT
+#elif defined TM_PARALLELXT
   int reqcount = 8;
-#elif defined PARALLELXYT
+#elif defined TM_PARALLELXYT
   int reqcount = 12;
-#elif defined PARALLELXYZT
+#elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield32)
 #endif
 
@@ -250,7 +250,7 @@ void xchange_lexicfield32(spinor32* const l) {
   MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
   MPI_Irecv((void*)(l + VOLUME), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid,
             &requests[1]);
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Isend((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid, &requests[4]);
@@ -259,7 +259,7 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Isend((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
@@ -267,7 +267,7 @@ void xchange_lexicfield32(spinor32* const l) {
             5101, g_cart_grid, &requests[9]);
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
 
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
@@ -282,7 +282,7 @@ void xchange_lexicfield32(spinor32* const l) {
   MPI_Irecv((void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082,
             g_cart_grid, &requests[3]);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in x direction */
   /* recieve the data from the neighbour on the left in x direction */
   MPI_Isend((void*)(l + (LX - 1) * LY * LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid,
@@ -291,7 +291,7 @@ void xchange_lexicfield32(spinor32* const l) {
             g_nb_x_dn, 5092, g_cart_grid, &requests[7]);
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
   MPI_Isend((void*)(l + (LY - 1) * LZ), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid,
@@ -300,7 +300,7 @@ void xchange_lexicfield32(spinor32* const l) {
             lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
 #endif
 
-#if defined PARALLELXYZT
+#if defined TM_PARALLELXYZT
 
   /* send the data to the neighbour on the right in y direction */
   /* recieve the data from the neighbour on the left in y direction */
@@ -314,21 +314,21 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield32)
 #endif
 }
 
 /* Here comes the naive version */
 /* Using MPI_Sendrecv */
-#else /* _NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 /* exchanges the field  l */
 void xchange_lexicfield32(spinor32* const l) {
 
-#ifdef PARALLELXYZT
+#ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst begin(xchange_lexicfield32)
 #endif
 
@@ -347,7 +347,7 @@ void xchange_lexicfield32(spinor32* const l) {
                (void*)(l + (T + 1) * LX * LY * LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082,
                g_cart_grid, &status);
 
-#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in x direction */
   /* recieve the data from the neighbour on the right in x direction */
   MPI_Sendrecv((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091,
@@ -362,7 +362,7 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
 
-#if (defined PARALLELXYT || defined PARALLELXYZT)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in y direction */
   /* recieve the data from the neighbour on the right in y direction */
   MPI_Sendrecv((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101,
@@ -377,7 +377,7 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
 
-#if (defined PARALLELXYZT)
+#if (defined TM_PARALLELXYZT)
   /* send the data to the neighbour on the left in z direction */
   /* recieve the data from the neighbour on the right in z direction */
   MPI_Sendrecv((void*)l, 1, lfield_z_slice_gath32, g_nb_z_dn, 5503,
@@ -394,7 +394,7 @@ void xchange_lexicfield32(spinor32* const l) {
 #endif
 #endif
   return;
-#ifdef _KOJAK_INST
+#ifdef TM_KOJAK_INST
 #pragma pomp inst end(xchange_lexicfield32)
 #endif
 }

From 8f39dd2600baf62461c5657120f956ab3e21eb99 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 10 Feb 2026 17:12:26 +0100
Subject: [PATCH 03/19] [cmake] More work

- Add fftw
- Add option to compile the tests (OFF by default)
---
 CMakeLists.txt                    |  3 ++-
 cmake/tmlqcd_config_internal.h.in |  3 ---
 src/bin/CMakeLists.txt            | 25 +++++++++++++++++++++++++
 src/lib/CMakeLists.txt            |  7 +++----
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39adba1c5..2cacfcc39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.24)
+cmake_minimum_required(VERSION 3.30)
 
 project(
   tmlqcd
@@ -103,6 +103,7 @@ option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_USE_GPROF "Enable gprof profiler" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
+option(TM_ENABLE_TESTS "Enable tests" OFF)
 
 # MPI dependent options
 cmake_dependent_option(
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 2765a2b7c..89bc753df 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -93,9 +93,6 @@
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
 #cmakedefine TM_LARGEFILE_SOURCE
 
-/* Define for large files, on AIX-style hosts. */
-#cmakedefine TM_LARGE_FILES 
-
 /* Use even/odd geometry in the gauge fields */
 #cmakedefine TM_NEW_GEOMETRY
 
diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
index 29c9c1d8a..2f135ddae 100644
--- a/src/bin/CMakeLists.txt
+++ b/src/bin/CMakeLists.txt
@@ -17,3 +17,28 @@ foreach(_prog ${tmlqcd_prog})
                POSITION_INDEPENDENT_CODE ON
                LINKER_LANGUAGE "CXX")
 endforeach()
+
+# Optional test executables; each one is built from src/bin/tests/<name>.c
+if(TM_ENABLE_TESTS)
+  # Entries carry no extension: they are used as target names and the loop
+  # below appends ".c" to locate the source file.
+  list(APPEND tmlqcd_test_prog check_locallity hopping_test scalar_prod_r_test
+       test_eigenvalues)
+  if(TM_USE_LEMON)
+    list(APPEND tmlqcd_test_prog test_lemon)
+  endif()
+  if(TM_USE_QPHIX)
+    list(APPEND tmlqcd_test_prog qphix_test_Dslash)
+  endif()
+
+  foreach(_prog ${tmlqcd_test_prog})
+    add_executable(${_prog} "${CMAKE_SOURCE_DIR}/src/bin/tests/${_prog}.c")
+
+    target_link_libraries(${_prog} PUBLIC hmc)
+    set_target_properties(
+      ${_prog}
+      PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+                 POSITION_INDEPENDENT_CODE ON
+                 LINKER_LANGUAGE "CXX")
+  endforeach()
+endif()
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 746b40c0d..ea2f7e41d 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -332,7 +332,7 @@ list(
   get_rectangle_staples.c
   rnd_gauge_trafo.c
   measure_rectangles.c
-  #invert.c
+  # invert.c
   deriv_Sb_D_psi.c
   mpi_init.c
   update_momenta_fg.c
@@ -414,7 +414,7 @@ endif()
 # create a target library with namespacing because cmake does not know name
 # space at all
 
-if (BUILD_SHARED_LIBS)
+if(BUILD_SHARED_LIBS)
   add_library(hmc SHARED "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
 else()
   add_library(hmc STATIC "${ALL_SRC};${FLEX_tmlqcd_input_read_OUTPUTS}")
@@ -449,8 +449,7 @@ target_link_libraries(
          m)
 
 target_compile_definitions(
-  hmc PUBLIC HAVE_CONFIG_H
-             $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
+  hmc PUBLIC HAVE_CONFIG_H $<$<BOOL:${TM_USE_HIP}>:${TM_GPU_PLATFORM_DFLAGS}>)
 
 target_include_directories(
   hmc

From 7315eaeb41769177604c44cc9bb8e892d1b7d344 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Thu, 12 Feb 2026 11:48:58 +0100
Subject: [PATCH 04/19] Removed FindLemon.cmake

---
 cmake/FindLemon.cmake | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 cmake/FindLemon.cmake

diff --git a/cmake/FindLemon.cmake b/cmake/FindLemon.cmake
deleted file mode 100644
index cdeca5e42..000000000
--- a/cmake/FindLemon.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-include(FindPackageHandleStandardArgs)
-
-find_library(
-  TMLQCD_LEMON_LIBRARIES
-  NAMES lemon
-  PATH_SUFFIXES "lib" "lib64")
-
-find_path(
-  TMLQCD_LEMON_INCLUDE_DIRS
-  NAMES lemon.h
-  PATH_SUFFIXES "include" "include/${_pacakge_name}" "${_package_name}")
-
-find_package_handle_standard_args(Lemon DEFAULT_MSG TMLQCD_LEMON_LIBRARIES
-                                  TMLQCD_LEMON_INCLUDE_DIRS)
-
-if(NOT TARGET tmlqcd::lemon)
-  add_library(tmlqcd::lemon INTERFACE IMPORTED)
-  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                 "${TMLQCD_LEMON_LIBRARIES}")
-  set_target_properties(tmlqcd::lemon PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                 "${TMLQCD_LEMON_INCLUDE_DIRS}")
-endif()
-
-set(TMLQCD_LEMON_FOUND ON)
-mark_as_advanced(TMLQCD_LEMON_LIBRARIES TMLQCD_LEMON_INCLUDE_DIRS)

From 252d9684ffcacce57d1b5c4bbff73c11ff41b4c6 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Mon, 16 Feb 2026 10:05:47 +0100
Subject: [PATCH 05/19] [cmake] More work

---
 .../repo/packages/lemonio/package.py          |  22 ++-
 .github/workflows/basic-build.yaml            |  38 ++--
 .github/workflows/ddalphaamg-build.yaml       |  47 ++---
 .github/workflows/qphix-build.yaml            |  54 ++----
 CMakeLists.txt                                |  81 ++++----
 cmake/FindDDAlphaAMG.cmake                    |  29 ---
 cmake/FindDDalphaAMG.cmake                    |  28 +++
 cmake/tmlqcd_config_internal.h.in             |  12 +-
 profiling/hmc_mk2/logs/example_log.out        |   2 +-
 src/bin/LapH_ev.c                             | 180 ------------------
 src/bin/benchmark.c                           |   6 +-
 src/bin/deriv_mg_tune.c                       |   2 +-
 src/bin/hmc_tm.c                              |   2 +-
 src/bin/invert.c                              |   2 +-
 src/bin/offline_measurement.c                 |   2 +-
 src/bin/tests/check_locallity.c               |   6 +-
 src/bin/tests/hopping_test.c                  |   6 +-
 src/bin/tests/qphix_test_Dslash.c             |   4 +-
 src/bin/tests/test_eigenvalues.c              |   8 +-
 src/bin/tests/test_lemon.c                    |   2 +-
 src/lib/CMakeLists.txt                        |  15 +-
 src/lib/DDalphaAMG_interface.c                |  32 ++--
 src/lib/buffers/utils_generic_exchange.c      |   2 +-
 src/lib/deriv_Sb.c                            |  18 +-
 src/lib/geometry_eo.c                         |  27 +--
 src/lib/global.h                              |   1 -
 src/lib/init/init.h                           |   2 +-
 src/lib/init/init_dirac_halfspinor.c          |  18 +-
 src/lib/init/init_geometry_indices.c          |   1 -
 src/lib/io/utils_write_first_message.c        |   6 +-
 src/lib/linalg/assign.c                       |   1 -
 src/lib/linalg/assign_add_mul_r_32.c          |   2 +-
 src/lib/linalg/scalar_prod_r.c                |   1 -
 src/lib/matrix_utils.c                        |   5 +-
 src/lib/measure_gauge_action.c                |   2 +-
 src/lib/misc_types.h                          |   2 +-
 src/lib/mpi_init.c                            |  11 +-
 src/lib/mpi_init.h                            |   5 +-
 src/lib/operator/D_psi_body.c                 |   2 +-
 src/lib/operator/Hopping_Matrix.c             |   4 +-
 src/lib/operator/Hopping_Matrix_32.c          |   4 +-
 src/lib/operator/halfspinor_body.c            |   4 +-
 src/lib/operator/hopping_bg_dbl.c             |  20 +-
 src/lib/operator/hopping_body_dbl.c           |  20 +-
 src/lib/operator/hopping_sgl.c                |  18 +-
 src/lib/operator/tm_sub_Hopping_Matrix.c      |   4 +-
 src/lib/operator/tm_times_Hopping_Matrix.c    |   6 +-
 src/lib/read_input.l                          |   2 +-
 src/lib/smearing/utils_reunitarize_MILC.c     |   4 +-
 src/lib/solver/gram-schmidt.c                 |   1 -
 src/lib/test/check_geometry.c                 |   7 +-
 src/lib/test/check_overlap.c                  |   2 +-
 src/lib/wrapper/lib_wrapper.c                 |   6 +-
 src/lib/xchange/xchange_gauge.c               |   1 -
 54 files changed, 281 insertions(+), 508 deletions(-)
 delete mode 100644 cmake/FindDDAlphaAMG.cmake
 create mode 100644 cmake/FindDDalphaAMG.cmake
 delete mode 100644 src/bin/LapH_ev.c

diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
index d70cac492..7508b4b79 100755
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -2,12 +2,13 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
-from spack_repo.builtin.build_systems.autotools import AutotoolsPackage
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
 
 
 from spack.package import *
 
-class Lemonio(AutotoolsPackage):
+class Lemonio(CMakePackage):
     """LEMON: Lightweight Parallel I/O library for Lattice QCD."""
 
     homepage = "https://github.com/etmc/lemon"
@@ -16,13 +17,18 @@ class Lemonio(AutotoolsPackage):
 
     version('master', branch='master')
 
-    depends_on("autoconf", type="build", when="@master build_system=autotools")
-    depends_on("automake", type="build", when="@master build_system=autotools")
-    depends_on("libtool", type="build", when="@master build_system=autotools")
+    variant("shared", default=False, description="Build shared libraries")
+    depends_on("libtool", type="build", when="@master build_system=cmake")
+    depends_on("cmake", type="build", when="@master build_system=cmake")
 
     depends_on('mpi')
 
-    def configure_args(self):
-        args = []
-        args.append('CC={0}'.format(self.spec['mpi'].mpicc))
+    generator("ninja")
+
+class CMakeBuilder(cmake.CMakeBuilder):
+    def cmake_args(self):
+        args = [
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+        ]
         return args
+
diff --git a/.github/workflows/basic-build.yaml b/.github/workflows/basic-build.yaml
index afe18e145..d46b67830 100644
--- a/.github/workflows/basic-build.yaml
+++ b/.github/workflows/basic-build.yaml
@@ -35,16 +35,16 @@ jobs:
           repository: usqcd-software/c-lime
           path: lime
 
-      - name: autogen_lime
+      - name: create_builddir_lime
         working-directory: ${{github.workspace}}/lime
-        run: ./autogen.sh && mkdir build
+        run: mkdir build
 
       - name: build_lime
         working-directory: ${{github.workspace}}/lime/build
         run: |
           CC=gcc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir .. >> config.log
           make -j
           make install
 
@@ -61,10 +61,9 @@ jobs:
           repository: etmc/lemon
           path: lemon
 
-      - name: autogen_lemon
+      - name: create_builddir_lemon
         working-directory: ${{github.workspace}}/lemon
         run: |
-          autoreconf -i -f
           mkdir build
 
       - name: build_lemon
@@ -72,9 +71,9 @@ jobs:
         run: |
           CC=mpicc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir ..
           make -j
-          make install
+          make install > config.log
       
       - name: Archive lemon config.log
         if: ${{ always() }}
@@ -92,28 +91,19 @@ jobs:
         shell: bash
         run: mkdir ${{github.workspace}}/main/build
 
-      - name: autogen_tmlqcd
-        working-directory: ${{github.workspace}}/main
-        run: autoconf
-
       - name: configure_and_build
         shell: bash
         working-directory: ${{github.workspace}}/main/build
         run: |
-          CC=mpicc CXX=mpicxx \
-            LDFLAGS="-fopenmp" \
             CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
             CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            ../configure \
-            --enable-mpi \
-            --with-mpidimension=4 \
-            --enable-omp \
-            --disable-sse2 \
-            --disable-sse3 \
-            --with-limedir=${{github.workspace}}/lime/build/install_dir \
-            --with-lemondir=${{github.workspace}}/lemon/build/install_dir \
-            --with-lapack="-lblas -llapack" || cat config.log
-          make -j
+            cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir" \
+            -DTM_USE_MPI=ON \
+            -DTM_USE_OMP=ON \
+            -DTM_USE_LEMON=ON \
+            .. > config.log
+            cat config.log
+            make -j
 
       - name: Archive tmLQCD config.log
         if: ${{ always() }}
@@ -125,7 +115,7 @@ jobs:
       - name: nf2_rgmixedcg_hmc_tmcloverdetratio
         working-directory: ${{github.workspace}}/main/build
         run: |
-          mpirun -np 2 ./hmc_tm \
+          mpirun -np 2 src/bin/hmc_tm \
             -f ../doc/sample-input/sample-hmc-rgmixedcg-tmcloverdetratio.input
       
       - name: Archive nf2_rgmixedcg_hmc_tmcloverdetratio output
diff --git a/.github/workflows/ddalphaamg-build.yaml b/.github/workflows/ddalphaamg-build.yaml
index f50ffcae9..509fb28b6 100644
--- a/.github/workflows/ddalphaamg-build.yaml
+++ b/.github/workflows/ddalphaamg-build.yaml
@@ -40,19 +40,16 @@ jobs:
           repository: usqcd-software/c-lime
           path: lime
 
-      - name: autogen_lime
+      - name: create_builddir_lime
         working-directory: ${{github.workspace}}/lime
-        run: ./autogen.sh
-      
-      - name: create_lime_builddir
-        run: mkdir ${{github.workspace}}/lime/build
+        run: mkdir build
 
       - name: build_lime
         working-directory: ${{github.workspace}}/lime/build
         run: |
           CC=gcc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir .. >> config.log
           make -j
           make install
 
@@ -69,23 +66,20 @@ jobs:
           repository: etmc/lemon
           path: lemon
 
-      - name: create_lemon_builddir
-        run: mkdir ${{github.workspace}}/lemon/build
-
-      - name: autogen_lemon
+      - name: create_builddir_lemon
         working-directory: ${{github.workspace}}/lemon
-        run: autoreconf -i -f
+        run: |
+          mkdir build
 
       - name: build_lemon
         working-directory: ${{github.workspace}}/lemon/build
         run: |
           CC=mpicc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure \
-            --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir ..
           make -j
-          make install
-
+          make install > config.log
+      
       - name: Archive lemon config.log
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
@@ -111,10 +105,6 @@ jobs:
         shell: bash
         run: mkdir ${{github.workspace}}/main/build
 
-      - name: autogen_tmlqcd
-        working-directory: ${{github.workspace}}/main
-        run: autoconf
-
       - name: configure_and_build
         shell: bash
         working-directory: ${{github.workspace}}/main/build
@@ -123,22 +113,19 @@ jobs:
             LDFLAGS="-fopenmp" \
             CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
             CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            ../configure \
-            --enable-mpi \
-            --with-mpidimension=4 \
-            --enable-omp \
-            --disable-sse2 \
-            --disable-sse3 \
-            --with-limedir=${{github.workspace}}/lime/build/install_dir \
-            --with-lemondir=${{github.workspace}}/lemon/build/install_dir \
-            --with-DDalphaAMG=${{github.workspace}}/ddalphaamg \
-            --with-lapack="-lblas -llapack" || cat config.log
+            cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir;${{github.workspace}}/ddalphaamg" \
+            -DTM_USE_MPI=ON \
+            -DTM_USE_OMP=ON \
+            -DTM_USE_LEMON=ON \
+            -DTM_USE_DDalphaAMG=ON \
+            .. > config.log
+            cat config.log
           make -j
 
       - name: nf2_ddalphaamg_hmc_tmcloverdetratio
         working-directory: ${{github.workspace}}/main/build
         run: |
-          mpirun -np 2 ./hmc_tm \
+          mpirun -np 2 src/bin/hmc_tm \
             -f ../doc/sample-input/sample-hmc-ddalphaamg-tmcloverdetratio.input
 
       - name: Archive nf2_ddalphaamg_hmc_tmcloverdetratio output
diff --git a/.github/workflows/qphix-build.yaml b/.github/workflows/qphix-build.yaml
index 1b39cdf34..eef1b5055 100644
--- a/.github/workflows/qphix-build.yaml
+++ b/.github/workflows/qphix-build.yaml
@@ -35,16 +35,16 @@ jobs:
           repository: usqcd-software/c-lime
           path: lime
 
-      - name: autogen_lime
+      - name: create_builddir_lime
         working-directory: ${{github.workspace}}/lime
-        run: ./autogen.sh && mkdir build
+        run: mkdir build
 
       - name: build_lime
         working-directory: ${{github.workspace}}/lime/build
         run: |
           CC=gcc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir .. >> config.log
           make -j
           make install
 
@@ -61,10 +61,9 @@ jobs:
           repository: etmc/lemon
           path: lemon
 
-      - name: autogen_lemon
+      - name: create_builddir_lemon
         working-directory: ${{github.workspace}}/lemon
         run: |
-          autoreconf -i -f
           mkdir build
 
       - name: build_lemon
@@ -72,11 +71,10 @@ jobs:
         run: |
           CC=mpicc \
             CFLAGS="-march=haswell -mtune=haswell -O2" \
-            ../configure \
-            --prefix=$(pwd)/install_dir
+            cmake -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir ..
           make -j
-          make install
-
+          make install > config.log
+      
       - name: Archive lemon config.log
         if: ${{ always() }}
         uses: actions/upload-artifact@v4
@@ -84,7 +82,6 @@ jobs:
           name: lemon_config_output
           path: ${{github.workspace}}/lemon/build/config.log 
 
-
       - name: get_qmp
         uses: actions/checkout@v4
         with:
@@ -151,9 +148,10 @@ jobs:
             -DCMAKE_C_COMPILER=mpicc \
             -DCMAKE_C_FLAGS="-std=c99 -O2 -mavx2 -mfma -mtune=haswell -march=haswell -fopenmp" \
             -DCMAKE_INSTALL_PREFIX=$(pwd)/install_dir \
-            ..
+            .. >> config.log
           VERBOSE=1 make -j $(( ${nb_cores} + 3 ))
-          make install
+          make install >> config.log
+          cat config.log
 
       - name: get_tmlqcd
         uses: actions/checkout@v4
@@ -164,31 +162,21 @@ jobs:
         shell: bash
         run: mkdir ${{github.workspace}}/main/build
 
-      - name: autogen_tmlqcd
-        working-directory: ${{github.workspace}}/main
-        run: autoconf
-
       - name: configure_and_build
         shell: bash
         working-directory: ${{github.workspace}}/main/build
         run: |
           CC=mpicc CXX=mpicxx \
-            LDFLAGS="-fopenmp" \
-            CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-            ../configure \
-            --enable-mpi \
-            --with-mpidimension=4 \
-            --enable-omp \
-            --disable-sse2 \
-            --disable-sse3 \
-            --with-limedir=${{github.workspace}}/lime/build/install_dir \
-            --with-lemondir=${{github.workspace}}/lemon/build/install_dir \
-            --with-lapack="-lblas -llapack" \
-            --with-qmpdir=${{github.workspace}}/qmp/build/install_dir \
-            --with-qphixdir=${{github.workspace}}/qphix/build/install_dir \
-            --enable-qphix-soalen=4 || cat config.log
-          make -j
+          cmake -DCMAKE_PREFIX_PATH="${{github.workspace}}/lime/build/install_dir;${{github.workspace}}/lemon/build/install_dir;${{github.workspace}}/qmp/build/install_dir;${{github.workspace}}/qphix/build/install_dir" \
+           -DTM_USE_MPI=ON \
+           -DTM_USE_OMP=ON \
+           -DTM_USE_LEMON=ON \
+           -DTM_USE_QPHIX=ON \
+           -DCMAKE_CXX_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DCMAKE_C_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DQPHIX_DIR="${{github.workspace}}/qphix/build/install_dir" \
+            ..
+          make -j > config.log
 
       - name: Archive tmLQCD config.log
         if: ${{ always() }}
@@ -200,7 +188,7 @@ jobs:
       - name: nf2_qphix_hmc_tmcloverdetratio
         working-directory: ${{github.workspace}}/main/build
         run: |
-          mpirun -np 2 ./hmc_tm \
+          mpirun -np 2 src/bin/hmc_tm \
             -f ../doc/sample-input/sample-hmc-qphix-tmcloverdetratio.input
 
       - name: Archive nf2_qphix_hmc_tmcloverdetratio output
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cacfcc39..a375ad14b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,9 +23,6 @@ endif()
 # =================================================================================================
 # PROJECT AND VERSION
 include(CMakeDependentOption)
-include(CheckSymbolExists)
-include(CheckLibraryExists)
-include(CheckFunctionExists)
 include(GNUInstallDirs)
 
 cmake_policy(SET CMP0048 NEW)
@@ -56,16 +53,18 @@ endif()
 
 find_package(PkgConfig)
 
-# ##############################################################################
-# Define the paths for static libraries and executables
-# ##############################################################################
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY
-    ${cp2k_BINARY_DIR}/lib
-    CACHE PATH "Single output directory for building all libraries.")
-
 # Search for rocm in common locations
-foreach(__var ROCM_ROOT CRAY_ROCM_ROOT ORNL_ROCM_ROOT CRAY_ROCM_PREFIX
-              ROCM_PREFIX CRAY_ROCM_DIR)
+foreach(
+  __var
+  ROCM_ROOT
+  HIP_ROOT
+  HIP_PATH
+  CRAY_ROCM_ROOT
+  ORNL_ROCM_ROOT
+  CRAY_ROCM_PREFIX
+  ROCM_PREFIX
+  CRAY_ROCM_DIR
+  ROCM_PATH)
   if($ENV{${__var}})
     list(APPEND CMAKE_PREFIX_PATH $ENV{__var})
     set(ROCM_PATH
@@ -75,7 +74,7 @@ foreach(__var ROCM_ROOT CRAY_ROCM_ROOT ORNL_ROCM_ROOT CRAY_ROCM_PREFIX
 endforeach()
 
 option(CMAKE_POSITION_INDEPENDENT_CODE "Enable position independent code" ON)
-option(BUILD_SHARED_LIBS "Enable shared library" ON)
+option(BUILD_SHARED_LIBS "Enable shared library" OFF)
 option(TM_USE_FFTW "Enable fftw support" OFF)
 option(TM_USE_MPI "Enable MPI support" OFF)
 option(TM_USE_CUDA "Enable QUDA support" OFF)
@@ -93,15 +92,12 @@ set(TM_ENABLE_ALIGNMENT
 set_property(CACHE TM_ENABLE_ALIGNMENT PROPERTY STRINGS "auto" "none" "16" "32"
                                                 "64")
 
-option(TM_BGL_DRAM "use BGL dram window (BGL only!)" ON)
 option(TM_USE_OPTIMIZATION "enable optimisation" ON)
-option(TM_USE_GAUGECOPY "Enable use of a copy of the gauge field" ON)
+option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
 option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
-option(TM_USE_TSPLITPAR "Enable timeslice-splitted communications" ON)
 option(TM_USE_QPHIX "enable QPhiX" OFF)
 option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
-option(TM_USE_GPROF "Enable gprof profiler" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 option(TM_ENABLE_TESTS "Enable tests" OFF)
 
@@ -203,7 +199,7 @@ if(TM_USE_HDF5)
 endif()
 
 if(TM_USE_LEMON)
-  find_package(Clemon REQUIRED)
+  find_package(lemon REQUIRED)
 endif()
 
 find_package(CLime REQUIRED)
@@ -231,6 +227,8 @@ endif()
 
 if(TM_USE_CUDA OR QUDA_TARGET_CUDA)
   enable_language(CUDA)
+
+  # placeholder for nvhpc for future use
   if(TM_USE_NVHPC)
     find_package(NVHPC REQUIRED COMPONENTS CUDA MATH HOSTUTILS NCCL)
   else()
@@ -238,11 +236,11 @@ if(TM_USE_CUDA OR QUDA_TARGET_CUDA)
   endif()
 endif()
 
+# We may want to use hip-cuda for development or debugging purposes especially
+# if AMD GPU access is not possible. So allow it
+
 if(TM_USE_HIP OR QUDA_TARGET_HIP)
   enable_language(hip)
-
-  # we may want to use hip-cuda for development or debugging purposes especially
-  # if AMD GPU access is not possible. So allow it
   if(TM_USE_CUDA_HIP)
     find_package(CUDA)
   endif()
@@ -254,14 +252,15 @@ if(TM_USE_HIP OR QUDA_TARGET_HIP)
   endif()
 endif()
 
-if(TM_USE_QPIHX)
-  find_package(QPhiX REQUIRED)
+if(TM_USE_QPHIX)
+  find_package(QPhiX REQUIRED CONFIG)
+  message(STATUS "QPhiX libraries: ${QPhiX_LIBRARIES}")
   if(NOT TARGET tmlqcd::qphix)
     add_library(tmlqcd::qphix INTERFACE IMPORTED)
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                   "${QPHIX_LIBRARIES}")
+                                                   "${QPhiX_LIBRARIES}")
     set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                   "${QPHIX_INCLUDE_DIRS}")
+                                                   "${QPhiX_INCLUDE_DIRS}")
   endif()
 endif()
 
@@ -274,17 +273,7 @@ if(TM_USE_FFTW)
 endif()
 
 if(TM_USE_DDalphaAMG)
-  find_package(DDAlphaAMG REQUIRED)
-endif()
-
-# gprofiler
-
-if(TM_USE_GPROF)
-  set(PROFILE_FLAGS "-pg;-g")
-  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "powerpc|powerpc64")
-    list(APPEND PROFILE_FLAGS "-qfullpath")
-  endif()
-  add_compile_options($<BOOL:$<COMPILE_LANGUAGE:C>:$PROFILE_FLAGS>)
+  find_package(DDalphaAMG REQUIRED)
 endif()
 
 if(TM_ENABLE_WARNINGS)
@@ -292,31 +281,26 @@ if(TM_ENABLE_WARNINGS)
                       $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall>)
 endif()
 
-# check for the presence of clock_gettime in libc or librt
-check_symbol_exists(clock_gettime "time.h" TM_CLOCK_GETTIME)
-check_library_exists(rt clock_gettime "" TM_CLOCK_GETTIME_IN_RT)
-check_function_exists(fseeko TM_FSEEKO)
-
 # set the parallelization
 
 if(TM_USE_MPI)
-  if(TM_MPI_DIMENSION EQUAL "1")
+  if(TM_MPI_DIMENSION STREQUAL "1")
     # T parallelisation
     set(TM_PARALLELT ON)
-  elseif(TM_MPI_DIMENSION EQUAL "2")
+  elseif(TM_MPI_DIMENSION STREQUAL "2")
     # XT parallelisation
     set(TM_PARALLELXT ON)
-  elseif(TM_MPI_DIMENSION EQUAL "3")
+  elseif(TM_MPI_DIMENSION STREQUAL "3")
     set(TM_PARALLELXYT ON)
     # XYZ parallelisation
-  elseif(TM_MPI_DIMENSION EQUAL "4")
+  elseif(TM_MPI_DIMENSION STREQUAL "4")
     # timeslice-splitted communications
     set(TM_PARALLELXYZT ON)
-  elseif(TM_MPI_DIMENSION EQUAL "X")
+  elseif(TM_MPI_DIMENSION STREQUAL "X")
     set(TM_PARALLELX ON)
-  elseif(TM_MPI_DIMENSION EQUAL "XY")
+  elseif(TM_MPI_DIMENSION STREQUAL "XY")
     set(TM_PARALLELXY ON)
-  elseif(TM_MPI_DIMENSION EQUAL "XYZ")
+  elseif(TM_MPI_DIMENSION STREQUAL "XYZ")
     set(TM_PARALLELXYZ ON)
   else()
     set(TM_PARALLELXYZT ON)
@@ -346,7 +330,6 @@ if(DEFINED GIT_EXE AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
   message(STATUS "git hash ${TM_SHA}")
 else()
-  # set(TM_GIT_BRANCH "release v${SIRIUS_VERSION}")
   set(TM_SHA
       "https://github.com/etmc/tmLQCD/releases/tag/rel-${TMLQCD_VERSION_MAJOR}-${TMLQCD_VERSION_MINOR}"
   )
diff --git a/cmake/FindDDAlphaAMG.cmake b/cmake/FindDDAlphaAMG.cmake
deleted file mode 100644
index f42c943cc..000000000
--- a/cmake/FindDDAlphaAMG.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-include(FindPackageHandleStandardArgs)
-
-find_library(
-  TM_DDALPHAAMG_LIBRARIES
-  NAMES DDalphaAMG DDalphaAMG_devel
-  PATH_SUFFIXES "lib" "lib64")
-
-find_path(
-  TM_DDALPHAAMG_INCLUDE_DIRS
-  NAMES DDalphaAMG.h
-  PATH_SUFFIXES "include" "include/${_pacakge_name}" "${_package_name}")
-
-find_package_handle_standard_args(
-  DDAlphaAMG DEFAULT_MSG TMLQCD_DDALPHAAMG_LIBRARIES
-  TMLQCD_DDALPHAAMG_INCLUDE_DIRS)
-
-if(NOT TARGET tmlqcd::DDalphaAMG)
-  add_library(tmlqcd::DDalphaAMG INTERFACE IMPORTED)
-  set_target_properties(
-    tmlqcd::DDalphaAMG PROPERTIES INTERFACE_LINK_LIBRARIES
-                                  "${TMLQCD_DDALPHAAMG_LIBRARIES}")
-  set_target_properties(
-    tmlqcd::DDalphaAMG PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                  "${TMLQCD_DDALPHAAMG_INCLUDE_DIRS}")
-endif()
-
-set(TMLQCD_DDALPHAAMG_FOUND ON)
-mark_as_advanced(TMLQCD_DDALPHAAMG_FOUND TMLQCD_DDALPHAAMG_LIBRARIES
-                 TMLQCD_DDALPHAAMG_INCLUDE_DIRS)
diff --git a/cmake/FindDDalphaAMG.cmake b/cmake/FindDDalphaAMG.cmake
new file mode 100644
index 000000000..5f0d2450f
--- /dev/null
+++ b/cmake/FindDDalphaAMG.cmake
@@ -0,0 +1,28 @@
+include(FindPackageHandleStandardArgs)
+
+find_library(
+  TM_DDALPHAAMG_LIBRARIES
+  NAMES DDalphaAMG DDalphaAMG_devel
+  PATH_SUFFIXES "lib" "lib64")
+
+find_path(
+  TM_DDALPHAAMG_INCLUDE_DIRS
+  NAMES DDalphaAMG.h
+  PATH_SUFFIXES "include")
+
+find_package_handle_standard_args(
+  DDalphaAMG DEFAULT_MSG TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
+
+if(TM_DDALPHAAMG_LIBRARIES
+   AND TM_DDALPHAAMG_INCLUDE_DIRS
+   AND NOT TARGET tmlqcd::DDalphaAMG)
+  message(STATUS "DDalphaAMG include dirs: ${TM_DDALPHAAMG_INCLUDE_DIRS}")
+  add_library(tmlqcd::DDalphaAMG INTERFACE IMPORTED)
+  set_property(TARGET tmlqcd::DDalphaAMG PROPERTY INTERFACE_LINK_LIBRARIES
+                                                  "${TM_DDALPHAAMG_LIBRARIES}")
+  set_property(
+    TARGET tmlqcd::DDalphaAMG PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                       "${TM_DDALPHAAMG_INCLUDE_DIRS}")
+endif()
+
+mark_as_advanced(TM_DDALPHAAMG_LIBRARIES TM_DDALPHAAMG_INCLUDE_DIRS)
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 89bc753df..fb8d7d818 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -12,9 +12,6 @@
 /* Define to 1 if you have the `lemon' library (-llemon). */
 #cmakedefine TM_USE_LEMON 
 
-/* 1 if clock_gettime is available for use in benchmark */
-#cmakedefine TM_CLOCK_GETTIME 
-
 /* Compile with MPI support */
 #cmakedefine TM_USE_MPI
 
@@ -31,9 +28,9 @@
 #define PACKAGE_BUGREPORT "@PACKAGE_BUGREPORT@"
 
 /* Define to the full name of this package. */
-#define PACKAGE_NAME "@PROJECT_DESCRIPTION@"
+#define PACKAGE_NAME "@tmlqcd_DESCRIPTION@"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "@PROJECT_VERSION@"
+#define PACKAGE_STRING "@tmlqcd_VERSION@"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "@PACKAGE_TARNAME@"
@@ -65,9 +62,6 @@
 /* Fixed volume at compiletime */
 #cmakedefine TM_FIXEDVOLUME
 
-/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
-#cmakedefine TM_FSEEKO
-
 /* Alignment for arrays -- necessary for SSE and automated vectorization */
 #define ALIGN_BASE @ALIGN_BASE@
 
@@ -88,7 +82,7 @@
 #cmakedefine TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
-#cmakedefine TM_USE_GAUGECOPY
+#cmakedefine TM_USE_GAUGE_COPY
 
 /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */
 #cmakedefine TM_LARGEFILE_SOURCE
diff --git a/profiling/hmc_mk2/logs/example_log.out b/profiling/hmc_mk2/logs/example_log.out
index 22ec86ec9..642963b16 100644
--- a/profiling/hmc_mk2/logs/example_log.out
+++ b/profiling/hmc_mk2/logs/example_log.out
@@ -270,7 +270,7 @@ operator 0 parsed line 229
 This is the hmc code for twisted mass Wilson QCD
 
 Version 5.2.0, commit 51cf008a89944ecdd9345cdb62aaf0a203a7f306
-# The code is compiled with -DTM_GAUGE_COPY
+# The code is compiled with -DTM_USE_GAUGE_COPY
 # The code is compiled with -DTM_USE_HALFSPINOR
 # the code is compiled for non-blocking MPI calls (spinor and gauge)
 # the code is compiled with openMP support
diff --git a/src/bin/LapH_ev.c b/src/bin/LapH_ev.c
deleted file mode 100644
index 08e810b36..000000000
--- a/src/bin/LapH_ev.c
+++ /dev/null
@@ -1,180 +0,0 @@
-/***********************************************************************
- * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- ***********************************************************************/
-/*
- *  Program for computing the eigensystem of the Laplacian operator
- * Authors Luigi Scorzato, Marco Cristoforetti
- *
- *
- *******************************************************************************/
-
-#ifdef HAVE_CONFIG_H
-#include "tmlqcd_config.h"
-#else
-#error "no tmlqcd_config.h"
-#endif
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#ifdef TM_USE_MPI
-#include <mpi.h>
-#endif
-#include <io/gauge.h>
-#include <io/params.h>
-#include "geometry_eo.h"
-#include "global.h"
-#include "init/init.h"
-#include "mpi_init.h"
-#include "ranlxd.h"
-#include "read_input.h"
-#include "solver/eigenvalues_Jacobi.h"
-#include "start.h"
-#include "su3.h"
-#include "xchange/xchange.h"
-
-int main(int argc, char *argv[]) {
-  int tslice, j, k;
-  char conf_filename[50];
-
-#ifdef TM_USE_MPI
-  MPI_Init(&argc, &argv);
-#endif
-
-  /* Read the input file */
-  read_input("LapH.input");
-
-  tmlqcd_mpi_init(argc, argv);
-
-  if (g_proc_id == 0) {
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
-#endif
-#ifdef TM_USE_HALFSPINOR
-    printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
-#endif
-#ifdef TM_USE_SHMEM
-    printf("# the code was compiled with -DTM_USE_SHMEM\n");
-#ifdef TM_PERSISTENT
-    printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
-#endif
-#endif
-#ifdef TM_USE_MPI
-#ifdef TM_NON_BLOCKING
-    printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
-#endif
-#endif
-    printf("\n");
-    fflush(stdout);
-  }
-
-#ifndef WITHLAPH
-  printf(" Error: WITHLAPH not defined");
-  exit(0);
-#endif
-#ifdef TM_USE_MPI
-#ifndef _INDEX_INDEP_GEOM
-  printf(" Error: _INDEX_INDEP_GEOM not defined");
-  exit(0);
-#endif
-#ifndef _USE_TSPLITPAR
-  printf(" Error: _USE_TSPLITPAR not defined");
-  exit(0);
-#endif
-#endif
-#ifdef TM_FIXEDVOLUME
-  printf(" Error: TM_FIXEDVOLUME not allowed");
-  exit(0);
-#endif
-
-  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
-  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
-
-  if (g_proc_id == 0) {
-    fprintf(stdout, "The number of processes is %d \n", g_nproc);
-    printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
-           (int)(LY * g_nproc_y), (int)(g_nproc_z * LZ));
-    printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),
-           (int)LZ);
-    printf("# Computing LapH eigensystem \n");
-
-    fflush(stdout);
-  }
-
-  /* define the geometry */
-  geometry();
-
-  start_ranlux(1, 123456);
-
-  /* Read Gauge field */
-  sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
-  if (g_cart_id == 0) {
-    printf("#\n# Trying to read gauge field from file %s in %s precision.\n", conf_filename,
-           (gauge_precision_read_flag == 32 ? "single" : "double"));
-    fflush(stdout);
-  }
-  if ((j = read_gauge_field(conf_filename, g_gauge_field)) != 0) {
-    fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", j, conf_filename);
-    exit(-2);
-  }
-
-  if (g_cart_id == 0) {
-    printf("# Finished reading gauge field.\n");
-    fflush(stdout);
-  }
-
-#ifdef TM_USE_MPI
-  /*For parallelization: exchange the gaugefield */
-  xchange_gauge(g_gauge_field);
-#endif
-
-  /* Init Jacobi field */
-  init_jacobi_field(SPACEVOLUME + SPACERAND, 3);
-
-#ifdef TM_USE_MPI
-  {
-    /* for debugging in parallel set i_gdb = 0 */
-    volatile int i_gdb = 8;
-    char hostname[256];
-    gethostname(hostname, sizeof(hostname));
-    printf("PID %d on %s ready for attach\n", getpid(), hostname);
-    fflush(stdout);
-    if (g_cart_id == 0) {
-      while (0 == i_gdb) {
-        sleep(5);
-      }
-    }
-  }
-
-  MPI_Barrier(MPI_COMM_WORLD);
-#endif
-
-  for (k = 0; k < 3; k++) random_jacobi_field(g_jacobi_field[k], SPACEVOLUME);
-
-  /* Compute LapH Eigensystem */
-
-  for (tslice = 0; tslice < T; tslice++) {
-    eigenvalues_Jacobi(&no_eigenvalues, 5000, eigenvalue_precision, 0, tslice, nstore);
-  }
-
-#ifdef TM_USE_MPI
-  MPI_Finalize();
-#endif
-  return (0);
-}
diff --git a/src/bin/benchmark.c b/src/bin/benchmark.c
index 72d8c8f4d..b2f4ee68c 100644
--- a/src/bin/benchmark.c
+++ b/src/bin/benchmark.c
@@ -123,8 +123,8 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+    printf("# The code was compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
 #ifdef TM_USE_HALFSPINOR
     printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index 75595bc60..f65b22c48 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -136,7 +136,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index 0d95a3b3c..399362d0b 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -168,7 +168,7 @@ int main(int argc, char *argv[]) {
 
   g_mu = g_mu1;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
   status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
diff --git a/src/bin/invert.c b/src/bin/invert.c
index c3111decb..bb6f15c10 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -165,7 +165,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index 72a828fb7..c1422858f 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/bin/tests/check_locallity.c b/src/bin/tests/check_locallity.c
index f03806f21..01d12826b 100644
--- a/src/bin/tests/check_locallity.c
+++ b/src/bin/tests/check_locallity.c
@@ -18,13 +18,13 @@
  ***********************************************************************/
 
 #include <lime.h>
-#include <tmlqcd_config.h>
 #include <math.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <tmlqcd_config.h>
 #ifdef TM_USE_MPI
 #include <mpi.h>
 #endif
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]) {
   double *norm;
   struct stout_parameters params_smear;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   int kb = 0;
 #endif
 #ifdef TM_USE_MPI
@@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
diff --git a/src/bin/tests/hopping_test.c b/src/bin/tests/hopping_test.c
index da60c83ba..0e5ff03e7 100644
--- a/src/bin/tests/hopping_test.c
+++ b/src/bin/tests/hopping_test.c
@@ -102,8 +102,8 @@ int main(int argc, char *argv[]) {
   tmlqcd_mpi_init(argc, argv);
 
   if (g_proc_id == 0) {
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+    printf("# The code was compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
 #ifdef TM_USE_HALFSPINOR
     printf("# The code was compiled with -DTM_USE_HALFSPINOR\n");
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
     fflush(stdout);
   }
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/bin/tests/qphix_test_Dslash.c b/src/bin/tests/qphix_test_Dslash.c
index b4218d3e6..41e2602a4 100644
--- a/src/bin/tests/qphix_test_Dslash.c
+++ b/src/bin/tests/qphix_test_Dslash.c
@@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
   tmlqcd_mpi_init(argc, argv);
   g_dbw2rand = 0;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND, 0);
@@ -180,7 +180,7 @@ int main(int argc, char* argv[]) {
 #endif
 
   g_update_gauge_copy = 1;
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   update_backward_gauge(g_gauge_field);
 #endif
 
diff --git a/src/bin/tests/test_eigenvalues.c b/src/bin/tests/test_eigenvalues.c
index 759d8dd2f..c52d29cf8 100644
--- a/src/bin/tests/test_eigenvalues.c
+++ b/src/bin/tests/test_eigenvalues.c
@@ -227,7 +227,7 @@ int main(int argc, char *argv[]) {
   g_eps_sq_acc = g_eps_sq_acc1;
   g_eps_sq_force = g_eps_sq_force1;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
@@ -277,8 +277,8 @@ int main(int argc, char *argv[]) {
 #ifdef TM_NEW_GEOMETRY
     printf("# The code was compiled with -DTM_NEW_GEOMETRY\n");
 #endif
-#ifdef TM_GAUGE_COPY
-    printf("# The code was compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+    printf("# The code was compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
     printf("# The lattice size is %d x %d x %d x %d\n", (int)(T * g_nproc_t), (int)(LX * g_nproc_x),
            (int)(LY), (int)(LZ));
@@ -430,7 +430,7 @@ int main(int argc, char *argv[]) {
 #ifdef TM_USE_MPI
   xchange_gauge(g_gauge_field);
 #endif
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   update_backward_gauge();
 #endif
 
diff --git a/src/bin/tests/test_lemon.c b/src/bin/tests/test_lemon.c
index 3cef7689c..9ef46be7b 100644
--- a/src/bin/tests/test_lemon.c
+++ b/src/bin/tests/test_lemon.c
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
 
   tmlqcd_mpi_init(argc, argv);
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index ea2f7e41d..ebed35308 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -370,7 +370,7 @@ list(
 list(APPEND TEST_SRC_C test/check_xchange.c test/check_geometry.c
      test/overlaptests.c)
 if(TM_USE_QPHIX)
-  list(APPEND MAIN_SRC_C QphiX/qphix_interface.cpp)
+  list(APPEND MAIN_SRC_C qphix/qphix_interface.cpp)
 endif()
 
 if(TM_USE_QUDA)
@@ -404,11 +404,11 @@ include_directories(
 
 # cmake 4.0 uses a different syntax for the option
 if(CMAKE_MAJOR_VERSION LESS 4)
-  flex_target(tmlqcd_input_read read_input.l read_input.c
-              COMPILE_FLAGS "-Ca -Ptmlqcd")
+  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c
+              COMPILE_FLAGS "-Ca -Ptmlqcd -i")
 else()
-  flex_target(tmlqcd_input_read read_input.l read_input.c OPTIONS
-              "-Ca -Ptmlqcd")
+  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c OPTIONS
+              "-Ca -Ptmlqcd -i")
 endif()
 
 # create a target library with namespacing because cmake does not know name
@@ -425,8 +425,7 @@ set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 # define a library and add the dependencies
 target_link_libraries(
   hmc
-  PUBLIC $<$<BOOL:${TM_CLOCK_GETTIME_IN_RT}>:rt>
-         $<$<BOOL:${TM_DDalphaAMG}>:tmlqcd::DDalphaAMG>
+  PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:tmlqcd::DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
          $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>
@@ -439,7 +438,7 @@ target_link_libraries(
          roc::hipblas
          hip::host>
          tmlqcd::clime
-         $<$<BOOL:${TM_USE_LEMON}>:clemon::lemon>
+         $<$<BOOL:${TM_USE_LEMON}>:lemon::lemon>
          ${LAPACK_LIBRARIES}
          ${BLAS_LIBRARIES}
          $<$<BOOL:${TM_USE_MPI}>:MPI::MPI_C
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index 80bff4fcc..bf2da4bef 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -207,7 +207,8 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+          "%e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -215,8 +216,9 @@ static inline int MG_check(spinor *const phi_new, spinor *const phi_old, const i
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-           differ[0], differ[1], differ[0] / differ[1]);
+    printf(
+        "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+        differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
 }
@@ -257,7 +259,8 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
           "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
           "restart... \n");
       printf(
-          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e "
+          "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+          "%e > %e "
           "\n",
           differ[0], differ[1], differ[0] / differ[1], precision);
     }
@@ -265,8 +268,9 @@ static inline int MG_check_nd(spinor *const up_new, spinor *const dn_new, spinor
   }
 
   if (g_debug_level > 0 && g_proc_id == 0)
-    printf("MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-           differ[0], differ[1], differ[0] / differ[1]);
+    printf(
+        "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+        differ[0], differ[1], differ[0] / differ[1]);
 
   return 1;
 }
@@ -304,7 +308,8 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
             "ERROR: something bad happened... MG converged giving the wrong solution!! Trying to "
             "restart... \n");
         printf(
-            "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > "
+            "ERROR contd: || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = "
+            "%e > "
             "%e \n",
             differ[0], differ[1], differ[0] / differ[1], precision[i]);
       }
@@ -313,8 +318,9 @@ static inline int MG_mms_check_nd(spinor **const up_new, spinor **const dn_new,
     }
 
     if (g_debug_level > 0 && g_proc_id == 0)
-      printf("MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
-             differ[0], differ[1], differ[0] / differ[1]);
+      printf(
+          "MGTEST:  || s - f_{tmLQC} * f_{TM_USE_DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n",
+          differ[0], differ[1], differ[0] / differ[1]);
   }
 
   finalize_solver(check_vect, 2);
@@ -367,8 +373,8 @@ static int MG_pre_solve(su3 **gf) {
     mg_do_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
-             100. * (mg_status.coarse_time / mg_status.time));
+      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
+             mg_status.time, 100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup procedure did not run correctly");
   }
@@ -384,8 +390,8 @@ static int MG_pre_solve(su3 **gf) {
     mg_update_setup = 0;
     mg_tau = gauge_tau;
     if (mg_status.success && g_proc_id == 0)
-      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n", mg_status.time,
-             100. * (mg_status.coarse_time / mg_status.time));
+      printf("TM_USE_DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
+             mg_status.time, 100. * (mg_status.coarse_time / mg_status.time));
     else if (g_proc_id == 0)
       printf("ERROR: setup updating did not run correctly");
   }
diff --git a/src/lib/buffers/utils_generic_exchange.c b/src/lib/buffers/utils_generic_exchange.c
index 474c738ad..d1a68a351 100644
--- a/src/lib/buffers/utils_generic_exchange.c
+++ b/src/lib/buffers/utils_generic_exchange.c
@@ -127,7 +127,7 @@ void generic_exchange(void *field_in, int bytes_per_site) {
   /* Following are implementations using different compile time flags */
 #if defined TM_NON_BLOCKING
 #include "utils_generic_exchange.nonblocking.inc"
-#else  /* TM_NON_BLOCKING */
+#else /* TM_NON_BLOCKING */
 #include "utils_generic_exchange.blocking.inc"
 #endif /* TM_NON_BLOCKING */
 }
diff --git a/src/lib/deriv_Sb.c b/src/lib/deriv_Sb.c
index 7b55eb170..1427c4af0 100644
--- a/src/lib/deriv_Sb.c
+++ b/src/lib/deriv_Sb.c
@@ -56,7 +56,7 @@
 void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field_t* const hf,
               const double factor) {
   tm_stopwatch_push(&g_timers, __func__, "");
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(hf->gaugefield);
   }
@@ -114,7 +114,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = &g_gauge_field_copy[icx][0];
 #else
     up = &hf->gaugefield[ix][0];
@@ -136,7 +136,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][0];
@@ -159,7 +159,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][1];
@@ -181,7 +181,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][1];
@@ -203,7 +203,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][2];
@@ -225,7 +225,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][2];
@@ -247,7 +247,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sp = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       up = um + 1;
 #else
     up = &hf->gaugefield[ix][3];
@@ -269,7 +269,7 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
       icy = g_lexic2eosub[iy];
 
       sm = k + icy;
-#if (defined TM_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
+#if (defined TM_USE_GAUGE_COPY && !defined TM_USE_HALFSPINOR)
       um = up + 1;
 #else
     um = &hf->gaugefield[iy][3];
diff --git a/src/lib/geometry_eo.c b/src/lib/geometry_eo.c
index ceb348e1a..f89189357 100644
--- a/src/lib/geometry_eo.c
+++ b/src/lib/geometry_eo.c
@@ -274,7 +274,8 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
   y3 = (x3 + LZ) % LZ;
   ix = ((y0 * LX + y1) * LY + y2) * LZ + y3;
 
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
   if (x0 == T) {
     ix = VOLUME + y3 + LZ * y2 + LZ * LY * y1;
   }
@@ -433,7 +434,8 @@ int Index(const int x0, const int x1, const int x2, const int x3) {
 
   /* The DBW2 stuff --> second boundary slice */
   /* This we put a the very end.              */
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
   if (x0 == T + 1) {
     ix = VOLUMEPLUSRAND + y3 + LZ * y2 + LZ * LY * y1;
 #if ((defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
@@ -685,14 +687,16 @@ void geometry() {
 
   xeven = malloc(VOLUMEPLUSRAND * sizeof(int));
 
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
   startvaluet = 1;
 #endif
-#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
-     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELX || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   startvaluex = 1;
 #endif
-#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || \
+     defined TM_PARALLELXYZ)
   startvaluey = 1;
 #endif
 #if (defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
@@ -851,7 +855,6 @@ void geometry() {
     }
   }
 
-
 #endif /* TM_PARALLELXYZ || TM_PARALLELXYZT*/
 
   /* The rectangular gauge action part */
@@ -861,7 +864,8 @@ void geometry() {
       printf("# Initialising rectangular gauge action stuff\n");
       fflush(stdout);
     }
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
     for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -910,8 +914,8 @@ void geometry() {
       }
     }
 #endif
-#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
-     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELX || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x2 = -startvaluey; x2 < (LY + startvaluey); x2++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
@@ -959,7 +963,8 @@ void geometry() {
       }
     }
 #endif
-#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || \
+     defined TM_PARALLELXYZ)
     for (x0 = -startvaluet; x0 < (T + startvaluet); x0++) {
       for (x1 = -startvaluex; x1 < (LX + startvaluex); x1++) {
         for (x3 = -startvaluez; x3 < (LZ + startvaluez); x3++) {
diff --git a/src/lib/global.h b/src/lib/global.h
index b0d3b1ac2..31d6dc0d4 100644
--- a/src/lib/global.h
+++ b/src/lib/global.h
@@ -121,7 +121,6 @@ EXTERN int *g_field_z_disp_even_up;
 EXTERN int *g_field_z_disp_odd_dn;
 EXTERN int *g_field_z_disp_odd_up;
 
-
 /* IF PHMC  */
 EXTERN spinor **g_chi_up_spinor_field;
 EXTERN spinor **g_chi_dn_spinor_field;
diff --git a/src/lib/init/init.h b/src/lib/init/init.h
index 0fe9ae51b..127622a8b 100644
--- a/src/lib/init/init.h
+++ b/src/lib/init/init.h
@@ -33,8 +33,8 @@
 #include "init/init_gauge_tmp.h"
 #include "init/init_geometry_indices.h"
 #include "init/init_global_states.h"
-#include "init/init_parallel.h"
 #include "init/init_moment_field.h"
+#include "init/init_parallel.h"
 #include "init/init_spinor_field.h"
 #include "init/init_stout_smear_vars.h"
 #ifdef TM_USE_OMP
diff --git a/src/lib/init/init_dirac_halfspinor.c b/src/lib/init/init_dirac_halfspinor.c
index 891a703e2..6b4fba174 100644
--- a/src/lib/init/init_dirac_halfspinor.c
+++ b/src/lib/init/init_dirac_halfspinor.c
@@ -69,15 +69,13 @@ int init_dirac_halfspinor() {
     errno = 0;
     return (1);
   }
-  sendBuffer =
-      (halfspinor *)(((unsigned long int)(sendBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
+  sendBuffer = (halfspinor *)(((unsigned long int)(sendBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
   if ((void *)(recvBuffer_ = (halfspinor *)calloc(RAND / 2 + 8, sizeof(halfspinor))) == NULL) {
     printf("malloc errno : %d\n", errno);
     errno = 0;
     return (1);
   }
-  recvBuffer =
-      (halfspinor *)(((unsigned long int)(recvBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
+  recvBuffer = (halfspinor *)(((unsigned long int)(recvBuffer_) + ALIGN_BASE + 1) & ~ALIGN_BASE);
 #endif
 
   for (int ieo = 0; ieo < 2; ieo++) {
@@ -94,7 +92,8 @@ int init_dirac_halfspinor() {
         NBPointer[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer[ieo][8 * i] = &sendBuffer[k];
@@ -154,7 +153,8 @@ int init_dirac_halfspinor() {
       for (int mu = 0; mu < 8; mu++) {
         NBPointer[ieo][8 * i + mu] = &HalfSpinor[8 * i + mu];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer[ieo][8 * i] = &recvBuffer[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
@@ -240,7 +240,8 @@ int init_dirac_halfspinor32() {
         NBPointer32[ieo][8 * i + 2 * mu + 1] =
             &HalfSpinor32[8 * g_lexic2eosub[g_iup[j][mu]] + 2 * mu + 1];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == 0) {
         k = (g_lexic2eosub[g_idn[j][0]] - VOLUME / 2);
         NBPointer32[ieo][8 * i] = &sendBuffer32[k];
@@ -300,7 +301,8 @@ int init_dirac_halfspinor32() {
       for (mu = 0; mu < 8; mu++) {
         NBPointer32[ieo][8 * i + mu] = &HalfSpinor32[8 * i + mu];
       }
-#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || (defined TM_PARALLELXYZT))
+#if ((defined TM_PARALLELT) || (defined TM_PARALLELXT) || (defined TM_PARALLELXYT) || \
+     (defined TM_PARALLELXYZT))
       if (t == T - 1) {
         NBPointer32[ieo][8 * i] = &recvBuffer32[(g_lexic2eosub[g_iup[j][0]] - VOLUME / 2)];
       }
diff --git a/src/lib/init/init_geometry_indices.c b/src/lib/init/init_geometry_indices.c
index 6b75fc83a..edd568d93 100644
--- a/src/lib/init/init_geometry_indices.c
+++ b/src/lib/init/init_geometry_indices.c
@@ -74,7 +74,6 @@ int init_geometry_indices(const int V) {
   if ((void *)g_field_z_disp_odd_up == NULL) return (17);
 #endif
 
-
   g_coord = (int **)calloc(VOLUME, sizeof(int *));
   if ((void *)g_coord == NULL) return (19);
   for (i = 0; i < VOLUME; i++) {
diff --git a/src/lib/io/utils_write_first_message.c b/src/lib/io/utils_write_first_message.c
index 287d67c37..4233789cc 100644
--- a/src/lib/io/utils_write_first_message.c
+++ b/src/lib/io/utils_write_first_message.c
@@ -30,9 +30,9 @@ int write_first_messages(FILE* parameterfile, char const* const executable,
            TMLQCD_PACKAGE_VERSION, git_hash);
   printf("%s", message);
   fprintf(parameterfile, "%s", message);
-#ifdef TM_GAUGE_COPY
-  printf("# The code is compiled with -DTM_GAUGE_COPY\n");
-  fprintf(parameterfile, "# The code is compiled with -DTM_GAUGE_COPY\n");
+#ifdef TM_USE_GAUGE_COPY
+  printf("# The code is compiled with -DTM_USE_GAUGE_COPY\n");
+  fprintf(parameterfile, "# The code is compiled with -DTM_USE_GAUGE_COPY\n");
 #endif
 #ifdef TM_USE_HALFSPINOR
   printf("# The code is compiled with -DTM_USE_HALFSPINOR\n");
diff --git a/src/lib/linalg/assign.c b/src/lib/linalg/assign.c
index fd04de1e4..19fcda44b 100644
--- a/src/lib/linalg/assign.c
+++ b/src/lib/linalg/assign.c
@@ -47,4 +47,3 @@ void assign_32(spinor32 *const R, spinor32 *const S, const int N) {
   memcpy(R, S, N * sizeof(spinor32));
   return;
 }
-
diff --git a/src/lib/linalg/assign_add_mul_r_32.c b/src/lib/linalg/assign_add_mul_r_32.c
index 9f6b1a72f..5ab9366ac 100644
--- a/src/lib/linalg/assign_add_mul_r_32.c
+++ b/src/lib/linalg/assign_add_mul_r_32.c
@@ -35,7 +35,7 @@
 #include "su3.h"
 
 void assign_add_mul_r_32_orphaned(spinor32 *const R, spinor32 *const S, const float c,
-                                         const int N) {
+                                  const int N) {
 #ifdef TM_USE_OMP
 #pragma omp parallel for
 #endif
diff --git a/src/lib/linalg/scalar_prod_r.c b/src/lib/linalg/scalar_prod_r.c
index f4fd9293b..c5288aa34 100644
--- a/src/lib/linalg/scalar_prod_r.c
+++ b/src/lib/linalg/scalar_prod_r.c
@@ -97,4 +97,3 @@ double scalar_prod_r(const spinor *const S, const spinor *const R, const int N,
 #endif
   return res;
 }
-
diff --git a/src/lib/matrix_utils.c b/src/lib/matrix_utils.c
index d5c4198ea..63c98657b 100644
--- a/src/lib/matrix_utils.c
+++ b/src/lib/matrix_utils.c
@@ -30,9 +30,8 @@
 #ifndef TM_USE_OMP
 static
 #endif
-    void
-    exponent_from_coefficients(su3 *out, _Complex double f0, _Complex double f1, _Complex double f2,
-                               su3 const *in) {
+    void exponent_from_coefficients(su3 *out, _Complex double f0, _Complex double f1,
+                                    _Complex double f2, su3 const *in) {
   su3 ALIGN tmp;
   _complex_times_su3(tmp, f2, *in);
   _su3_add_equals_complex_identity(tmp, f1);
diff --git a/src/lib/measure_gauge_action.c b/src/lib/measure_gauge_action.c
index 1f7cb6ad5..ecbe7a888 100644
--- a/src/lib/measure_gauge_action.c
+++ b/src/lib/measure_gauge_action.c
@@ -26,10 +26,10 @@
  *     Returns the value of the action
  ************************************************************************/
 
-#include <tmlqcd_config.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <tmlqcd_config.h>
 #ifdef TM_USE_OMP
 #include <omp.h>
 #endif
diff --git a/src/lib/misc_types.h b/src/lib/misc_types.h
index fee62159f..412719dce 100644
--- a/src/lib/misc_types.h
+++ b/src/lib/misc_types.h
@@ -101,7 +101,7 @@ typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = QMP_THREAD_SINGLE,
   TM_MPI_THREAD_MULTIPLE = QMP_THREAD_MULTIPLE
 } tm_mpi_thread_level_t;
-#elif defined(TM_USE_MPI) 
+#elif defined(TM_USE_MPI)
 typedef enum tm_mpi_thread_level_t {
   TM_MPI_THREAD_SINGLE = MPI_THREAD_SERIALIZED,
   TM_MPI_THREAD_MULTIPLE = MPI_THREAD_MULTIPLE
diff --git a/src/lib/mpi_init.c b/src/lib/mpi_init.c
index cc09fd4cd..f245f0556 100644
--- a/src/lib/mpi_init.c
+++ b/src/lib/mpi_init.c
@@ -347,18 +347,20 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   for (i = 0; i < 8; i++) {
     g_nb_list[i] = g_cart_id;
   }
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
   MPI_Cart_shift(g_cart_grid, 0, 1, &g_nb_t_dn, &g_nb_t_up);
   g_nb_list[0] = g_nb_t_up;
   g_nb_list[1] = g_nb_t_dn;
 #endif
-#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELX || \
-     defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
+     defined TM_PARALLELX || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 1, 1, &g_nb_x_dn, &g_nb_x_up);
   g_nb_list[2] = g_nb_x_up;
   g_nb_list[3] = g_nb_x_dn;
 #endif
-#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELXYT || defined TM_PARALLELXYZT || defined TM_PARALLELXY || \
+     defined TM_PARALLELXYZ)
   MPI_Cart_shift(g_cart_grid, 2, 1, &g_nb_y_dn, &g_nb_y_up);
   g_nb_list[4] = g_nb_y_up;
   g_nb_list[5] = g_nb_y_dn;
@@ -552,7 +554,6 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   MPI_Type_commit(&lfield_z_slice_cont32);
   MPI_Type_commit(&lfield_z_slice_gath32);
 
-
   /* The internal z_ and zt_ slices are constructed in geometry() with MPI_Type_indexed() */
 
   /* Now the derivative fields */
diff --git a/src/lib/mpi_init.h b/src/lib/mpi_init.h
index d9476e662..c6e816946 100644
--- a/src/lib/mpi_init.h
+++ b/src/lib/mpi_init.h
@@ -107,9 +107,8 @@ extern MPI_Datatype halffield_y_slice_cont;
 extern MPI_Datatype halffield_y_slice_gath;
 extern MPI_Datatype halffield_z_slice_cont;
 
-
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT || \
-     defined TM_PARALLELXYZ)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT || defined TM_PARALLELXYZ)
 extern MPI_Datatype field_z_slice_even_dn;
 extern MPI_Datatype field_z_slice_even_up;
 extern MPI_Datatype field_z_slice_odd_dn;
diff --git a/src/lib/operator/D_psi_body.c b/src/lib/operator/D_psi_body.c
index b5acd1158..f73822776 100644
--- a/src/lib/operator/D_psi_body.c
+++ b/src/lib/operator/D_psi_body.c
@@ -283,7 +283,7 @@ void _PSWITCH(D_psi)(_PTSWITCH(spinor) *const P, _PTSWITCH(spinor) *const Q) {
   _C_TYPE ALIGN32 phase_2l = (_C_TYPE)phase_2;
   _C_TYPE ALIGN32 phase_3l = (_C_TYPE)phase_3;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (_PSWITCH(g_update_gauge_copy)) {
     _PSWITCH(update_backward_gauge)(_PSWITCH(g_gauge_field));
   }
diff --git a/src/lib/operator/Hopping_Matrix.c b/src/lib/operator/Hopping_Matrix.c
index 8b106e10a..759809a8e 100644
--- a/src/lib/operator/Hopping_Matrix.c
+++ b/src/lib/operator/Hopping_Matrix.c
@@ -68,7 +68,7 @@
 #include "operator/halfspinor_hopping.h"
 
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -91,7 +91,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 #else /* thats TM_USE_HALFSPINOR */
 void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
diff --git a/src/lib/operator/Hopping_Matrix_32.c b/src/lib/operator/Hopping_Matrix_32.c
index 1198d52bb..0991811b7 100644
--- a/src/lib/operator/Hopping_Matrix_32.c
+++ b/src/lib/operator/Hopping_Matrix_32.c
@@ -63,8 +63,8 @@
 #endif
 #include "boundary.h"
 #include "init/init_dirac_halfspinor.h"
-#include "update_backward_gauge.h"
 #include "operator/Hopping_Matrix_32.h"
+#include "update_backward_gauge.h"
 
 #if defined TM_USE_HALFSPINOR
 #include "operator/halfspinor_hopping_32.h"
@@ -72,7 +72,7 @@
 
 void Hopping_Matrix_32_orphaned(const int ieo, spinor32* const l, spinor32* const k) {
 #if defined TM_USE_HALFSPINOR
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy_32) {
     update_backward_gauge_32_orphaned(g_gauge_field_32);
   }
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
index a2c54c7e4..3be906764 100644
--- a/src/lib/operator/halfspinor_body.c
+++ b/src/lib/operator/halfspinor_body.c
@@ -103,7 +103,7 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
 
 #if (defined TM_USE_MPI && !defined _NO_COMM)
-  xchange_halffield32();
+    xchange_halffield32();
 #endif
 
 #ifdef TM_USE_OMP
@@ -237,7 +237,7 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
 
 #if (defined TM_USE_MPI && !defined _NO_COMM)
-  xchange_halffield();
+    xchange_halffield();
 #endif
 
 #ifdef TM_USE_OMP
diff --git a/src/lib/operator/hopping_bg_dbl.c b/src/lib/operator/hopping_bg_dbl.c
index 93af99e24..6f8f3778d 100644
--- a/src/lib/operator/hopping_bg_dbl.c
+++ b/src/lib/operator/hopping_bg_dbl.c
@@ -41,7 +41,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
   __alignx(16, l);
   __alignx(16, k);
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -64,7 +64,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
 
   sp = k + icy;
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = &g_gauge_field_copy[ioff][0];
 #else
   up = &g_gauge_field[ix][0];
@@ -76,7 +76,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     /*********************** direction +0 ************************/
     iy = g_idn[ix][0];
     icy = g_lexic2eosub[iy];
-#if (!defined TM_GAUGE_COPY)
+#if (!defined TM_USE_GAUGE_COPY)
     um = &g_gauge_field[iy][0];
 #else
     um = up + 1;
@@ -90,7 +90,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[ix][1];
     icy = g_lexic2eosub[iy];
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -104,7 +104,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_idn[ix][1];
     icy = g_lexic2eosub[iy];
 
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][1];
 #else
     um = up + 1;
@@ -117,7 +117,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[ix][2];
     icy = g_lexic2eosub[iy];
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -131,7 +131,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_idn[ix][2];
     icy = g_lexic2eosub[iy];
 
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][2];
 #else
     um = up + 1;
@@ -145,7 +145,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[ix][3];
     icy = g_lexic2eosub[iy];
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -158,7 +158,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_idn[ix][3];
     icy = g_lexic2eosub[iy];
 
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][3];
 #else
     um = up + 1;
@@ -174,7 +174,7 @@ void Hopping_Matrix(const int ieo, spinor* const l, spinor* const k) {
     iy = g_iup[iz][0];
     icy = g_lexic2eosub[iy];
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up = &g_gauge_field[iz][0];
diff --git a/src/lib/operator/hopping_body_dbl.c b/src/lib/operator/hopping_body_dbl.c
index c3eefb74a..ea3b8cf2c 100644
--- a/src/lib/operator/hopping_body_dbl.c
+++ b/src/lib/operator/hopping_body_dbl.c
@@ -43,7 +43,7 @@ if (ieo == 0) {
 #ifndef TM_USE_OMP
 hi = &g_hi[16 * ioff];
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
 up = &g_gauge_field_copy[ioff][0];
 #else
 up = &g_gauge_field[(*hi)][0];
@@ -60,7 +60,7 @@ hi++;
 for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 #ifdef TM_USE_OMP
   hi = &g_hi[16 * icx];
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = &g_gauge_field_copy[icx][0];
 #else
   up = &g_gauge_field[(*hi)][0];
@@ -74,7 +74,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   pn = p + (icx - ioff);
 #endif
   /*********************** direction +t ************************/
-#if (!defined TM_GAUGE_COPY)
+#if (!defined TM_USE_GAUGE_COPY)
   um = &g_gauge_field[(*hi)][0];
 #else
   um = up + 1;
@@ -86,7 +86,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_p();
 
   /*********************** direction -t ************************/
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -97,7 +97,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_t_m();
 
   /*********************** direction +1 ************************/
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
   um = &g_gauge_field[(*hi)][1];
 #else
   um = up + 1;
@@ -109,7 +109,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_p();
 
   /*********************** direction -1 ************************/
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -120,7 +120,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_x_m();
 
   /*********************** direction +2 ************************/
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
   um = &g_gauge_field[(*hi)][2];
 #else
   um = up + 1;
@@ -132,7 +132,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_p();
 
   /*********************** direction -2 ************************/
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up += 1;
@@ -143,7 +143,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
   _hop_y_m();
 
   /*********************** direction +3 ************************/
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
   um = &g_gauge_field[(*hi)][3];
 #else
   um = up + 1;
@@ -156,7 +156,7 @@ for (int icx = ioff; icx < (VOLUME / 2 + ioff); icx++) {
 
   /*********************** direction -3 ************************/
 #ifndef TM_USE_OMP
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
   up = um + 1;
 #else
   up = &g_gauge_field[(*hi)][0];
diff --git a/src/lib/operator/hopping_sgl.c b/src/lib/operator/hopping_sgl.c
index 062507158..487bfc47f 100644
--- a/src/lib/operator/hopping_sgl.c
+++ b/src/lib/operator/hopping_sgl.c
@@ -37,7 +37,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
   spinor32* restrict r, * restrict sp, * restrict sm;
   spinor32 temp;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge();
   }
@@ -72,7 +72,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = &g_gauge_field_copy[icx][0];
 #else
     up = &g_gauge_field[ix][0];
@@ -100,7 +100,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     um = up + 1;
 #else
     um = &g_gauge_field[iy][0];
@@ -129,7 +129,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
 
     sp = k + icy;
 
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -157,7 +157,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][1];
 #else
     um = up + 1;
@@ -185,7 +185,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -212,7 +212,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][2];
 #else
     um = up + 1;
@@ -240,7 +240,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sp = k + icy;
-#if ((defined TM_GAUGE_COPY))
+#if ((defined TM_USE_GAUGE_COPY))
     up = um + 1;
 #else
     up += 1;
@@ -267,7 +267,7 @@ void Hopping_Matrix(int ieo, spinor32* const l, spinor32* const k) {
     icy = g_lexic2eosub[iy];
 
     sm = k + icy;
-#ifndef TM_GAUGE_COPY
+#ifndef TM_USE_GAUGE_COPY
     um = &g_gauge_field[iy][3];
 #else
     um = up + 1;
diff --git a/src/lib/operator/tm_sub_Hopping_Matrix.c b/src/lib/operator/tm_sub_Hopping_Matrix.c
index 857404088..7edf2c954 100644
--- a/src/lib/operator/tm_sub_Hopping_Matrix.c
+++ b/src/lib/operator/tm_sub_Hopping_Matrix.c
@@ -56,7 +56,7 @@
 
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spinor* const k,
                            complex double const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -81,7 +81,7 @@ void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* const p, spin
 #elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_sub_Hopping_Matrix(const int ieo, spinor* const l, spinor* p, spinor* const k,
                            complex double const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
diff --git a/src/lib/operator/tm_times_Hopping_Matrix.c b/src/lib/operator/tm_times_Hopping_Matrix.c
index 6d1abddba..9b09c090f 100644
--- a/src/lib/operator/tm_times_Hopping_Matrix.c
+++ b/src/lib/operator/tm_times_Hopping_Matrix.c
@@ -56,7 +56,7 @@
 
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              complex double const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -81,7 +81,7 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #elif (!defined _NO_COMM && !defined TM_USE_HALFSPINOR)
 void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
                              double complex const cfactor) {
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   if (g_update_gauge_copy) {
     update_backward_gauge(g_gauge_field);
   }
@@ -103,4 +103,4 @@ void tm_times_Hopping_Matrix(const int ieo, spinor* const l, spinor* const k,
 #endif
   return;
 }
-#endif  //TM_USE_HALFSPINOR && !defined _NO_COMM
+#endif  // TM_USE_HALFSPINOR && !defined _NO_COMM
diff --git a/src/lib/read_input.l b/src/lib/read_input.l
index 59f002748..5eb542f87 100644
--- a/src/lib/read_input.l
+++ b/src/lib/read_input.l
@@ -951,7 +951,7 @@ static inline double fltlist_next_token(int * const list_end){
     mg_no_shifts=0;
     if(myverbose) printf("  MG_MMS_Mass set to %.16f line %d operator %d\n", mg_mms_mass, line_of_file, current_operator);
   }
-  End_DDalphaAMG{SPC}* {
+  EndDDalphaAMG{SPC}* {
   if(myverbose) printf("DDalphaAMG parsed in line %d\n\n", line_of_file);
   BEGIN(0);
   }
diff --git a/src/lib/smearing/utils_reunitarize_MILC.c b/src/lib/smearing/utils_reunitarize_MILC.c
index b5efa2936..fec177a42 100644
--- a/src/lib/smearing/utils_reunitarize_MILC.c
+++ b/src/lib/smearing/utils_reunitarize_MILC.c
@@ -1,5 +1,5 @@
-#include "utils.ih"
 #include <complex.h>
+#include "utils.ih"
 
 /* No reunitarization code seems to be available, so I've adapted (stolen) this routine from the
  * MILC code (who stole it elsewhere, I think ;]) -- AD. */
@@ -36,7 +36,7 @@ void reunitarize(su3 *omega) {
   bj2 = omega->c02;
 
   omega->c20 = bj1 * omega->c12;
-  omega->c20 -= bj2 *omega->c11;
+  omega->c20 -= bj2 * omega->c11;
 
   omega->c21 = bj2 * omega->c10;
   omega->c21 -= bj0 * omega->c12;
diff --git a/src/lib/solver/gram-schmidt.c b/src/lib/solver/gram-schmidt.c
index ffd5d6b29..4c2ee4310 100644
--- a/src/lib/solver/gram-schmidt.c
+++ b/src/lib/solver/gram-schmidt.c
@@ -75,7 +75,6 @@ void IteratedClassicalGS(_Complex double v[], double *vnrm, int n, int m, _Compl
   }
 }
 
-
 /*
  *  ModifiedGramSchmidt
  *
diff --git a/src/lib/test/check_geometry.c b/src/lib/test/check_geometry.c
index b9f14eb4d..20f7acc96 100644
--- a/src/lib/test/check_geometry.c
+++ b/src/lib/test/check_geometry.c
@@ -90,7 +90,8 @@ int check_geometry() {
           ix = g_ipt[x0][x1][x2][x3];
 
           iy0 = g_iup[ix][0];
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
           if (x0 != T - 1) {
             iz0 = g_ipt[(x0 + 1) % T][x1][x2][x3];
           } else {
@@ -176,7 +177,8 @@ int check_geometry() {
           }
 
           iy0 = g_idn[ix][0];
-#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
+#if (defined TM_PARALLELT || defined TM_PARALLELXT || defined TM_PARALLELXYT || \
+     defined TM_PARALLELXYZT)
           if (x0 != 0) {
             iz0 = g_ipt[(x0 + T - 1) % T][x1][x2][x3];
           } else {
@@ -1554,4 +1556,3 @@ int check_geometry() {
 
   return (0);
 }
-
diff --git a/src/lib/test/check_overlap.c b/src/lib/test/check_overlap.c
index 56763cff4..b032f8cdd 100644
--- a/src/lib/test/check_overlap.c
+++ b/src/lib/test/check_overlap.c
@@ -188,7 +188,7 @@ int main(int argc, char *argv[]) {
   g_dbw2rand = 0;
 #endif
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   j = init_gauge_field(VOLUMEPLUSRAND, 1);
 #else
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
diff --git a/src/lib/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
index 9f083adc5..19d36ddc6 100644
--- a/src/lib/wrapper/lib_wrapper.c
+++ b/src/lib/wrapper/lib_wrapper.c
@@ -60,11 +60,11 @@
 #include "misc_types.h"
 #include "mpi_init.h"
 #include "operator.h"
+#include "operator/clover_leaf.h"
+#include "qphix_interface.h"
 #include "read_input.h"
 #include "sighandler.h"
 #include "start.h"
-#include "operator/clover_leaf.h"
-#include "qphix_interface.h"
 
 #define CONF_FILENAME_LENGTH 500
 
@@ -121,7 +121,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   for (int j = 0; j < no_operators; j++)
     if (!operator_list[j].even_odd_flag) even_odd_flag = 0;
 
-#ifdef TM_GAUGE_COPY
+#ifdef TM_USE_GAUGE_COPY
   int j = init_gauge_field(VOLUMEPLUSRAND, 1);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
diff --git a/src/lib/xchange/xchange_gauge.c b/src/lib/xchange/xchange_gauge.c
index 6177a3dbb..254702822 100644
--- a/src/lib/xchange/xchange_gauge.c
+++ b/src/lib/xchange/xchange_gauge.c
@@ -960,5 +960,4 @@ void xchange_gauge(su3** const gf) {
   return;
 }
 
-
 #endif /* TM_NON_BLOCKING */

From 83f6c401c0da71640a15f93342ad56d5ed70ee59 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 17 Feb 2026 18:26:48 +0100
Subject: [PATCH 06/19] Improvements

- QUDA_FERMIONIC_FORCES and QUDA_EXPERIMENTAL are always on
- Removed KOJAK instrumentation
- Added a custom Qphix find package file because the original one is broken
---
 .github/workflows/qphix-build.yaml      |  5 ++-
 CMakeLists.txt                          | 18 +--------
 cmake/FindQphix.cmake                   | 39 ++++++++++++++++++
 cmake/tmlqcd_config_internal.h.in       | 17 ++++----
 src/bin/deriv_mg_tune.c                 |  8 ----
 src/bin/hmc_tm.c                        |  8 ----
 src/bin/invert.c                        |  8 ----
 src/bin/offline_measurement.c           |  9 -----
 src/lib/deriv_Sb.c                      |  7 ----
 src/lib/deriv_Sb_D_psi.c                |  7 ----
 src/lib/get_rectangle_staples.c         |  6 ---
 src/lib/get_staples.c                   | 21 ----------
 src/lib/operator/Hopping_Matrix_nocom.c |  3 --
 src/lib/operator/halfspinor_body.c      |  7 ----
 src/lib/quda_interface.c                | 20 ----------
 src/lib/test/check_overlap.c            |  8 ----
 src/lib/update_gauge.c                  | 53 +++++++------------------
 src/lib/xchange/xchange_2fields.c       |  7 ----
 src/lib/xchange/xchange_halffield.c     | 13 ------
 src/lib/xchange/xchange_lexicfield.c    | 28 -------------
 20 files changed, 66 insertions(+), 226 deletions(-)
 create mode 100644 cmake/FindQphix.cmake

diff --git a/.github/workflows/qphix-build.yaml b/.github/workflows/qphix-build.yaml
index eef1b5055..ec4ec5394 100644
--- a/.github/workflows/qphix-build.yaml
+++ b/.github/workflows/qphix-build.yaml
@@ -172,9 +172,10 @@ jobs:
            -DTM_USE_OMP=ON \
            -DTM_USE_LEMON=ON \
            -DTM_USE_QPHIX=ON \
-           -DCMAKE_CXXFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
-           -DCMAKE_CFLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DCMAKE_CXX_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
+           -DCMAKE_C_FLAGS="-O2 -mtune=haswell -march=haswell -mavx2 -mfma -DOMPI_SKIP_MPICXX -fopenmp" \
            -DQPHIX_DIR="${{github.workspace}}/qphix/build/install_dir" \
+           -DQMP_DIR="${{github.workspace}}/qmp/build/install_dir" \
             ..
           make -j > config.log
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a375ad14b..803feeef7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,7 +100,7 @@ option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 option(TM_ENABLE_TESTS "Enable tests" OFF)
-
+set(TM_QPHIX_SOALEN "4" CACHE STRING "QPhiX specific parameter")
 # MPI dependent options
 cmake_dependent_option(
   TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
@@ -123,12 +123,6 @@ cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF "TM_USE_MPI"
                        ON)
 
 # GPU dependent options
-cmake_dependent_option(TM_USE_QUDA_EXPERIMENTAL "Enable QUDA support" ON
-                       "TM_USE_QUDA" OFF)
-cmake_dependent_option(
-  TM_QUDA_FERMIONIC_FORCES "Enable support for fermionic forces using QUDA" ON
-  "TM_USE_QUDA" OFF)
-
 cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
                        "TM_USE_CUDA" OFF)
 
@@ -253,15 +247,7 @@ if(TM_USE_HIP OR QUDA_TARGET_HIP)
 endif()
 
 if(TM_USE_QPHIX)
-  find_package(QPhiX REQUIRED CONFIG)
-  message("${QPhiX_LIBRARIES}")
-  if(NOT TARGET tmlqcd::qphix)
-    add_library(tmlqcd::qphix INTERFACE IMPORTED)
-    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                   "${QPhiX_LIBRARIES}")
-    set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                   "${QPhiX_INCLUDE_DIRS}")
-  endif()
+  find_package(Qphix REQUIRED)
 endif()
 
 # check for fftw3 (rely on pkgconfig).
diff --git a/cmake/FindQphix.cmake b/cmake/FindQphix.cmake
new file mode 100644
index 000000000..15ab2d47a
--- /dev/null
+++ b/cmake/FindQphix.cmake
@@ -0,0 +1,67 @@
+# FindQphix.cmake - locate QPhiX (solver + codegen) and its QMP dependency.
+#
+# Optional hints:
+#   QPHIX_DIR - installation prefix of QPhiX
+#   QMP_DIR   - installation prefix of QMP
+#
+# On success Qphix_FOUND is set and the imported interface target
+# tmlqcd::qphix carries the libraries and include directories.
+
+include(FindPackageHandleStandardArgs)
+
+# Search the user-supplied prefixes for the libraries as well, so that
+# QPHIX_DIR / QMP_DIR are honoured consistently with the header searches below.
+find_library(
+  TM_QMP_LIBS
+  NAMES qmp
+  PATH_SUFFIXES "lib" "lib64"
+  PATHS "${QMP_DIR}")
+find_library(
+  TM_QPHIX_LIBS_CODEGEN
+  NAMES "qphix_codegen"
+  PATH_SUFFIXES "lib" "lib64"
+  PATHS "${QPHIX_DIR}")
+find_library(
+  TM_QPHIX_LIBS_SOLVER
+  NAMES "qphix_solver"
+  PATH_SUFFIXES "lib" "lib64"
+  PATHS "${QPHIX_DIR}")
+
+find_path(
+  TM_QMP_INCLUDE_DIRS
+  NAMES qmp.h
+  PATH_SUFFIXES "include"
+  PATHS "${QMP_DIR}")
+
+# TM_QPHIX_INCLUDE_DIRS points at the directory holding qphix_config.h; its
+# parent is added to the include path further down.
+find_path(
+  TM_QPHIX_INCLUDE_DIRS
+  NAMES qphix_config.h
+  PATH_SUFFIXES "qphix" "include/qphix"
+  PATHS "${QPHIX_DIR}")
+find_path(
+  TM_QPHIX_CODEGEN_INCLUDE_DIRS
+  NAMES qpx_utils.h
+  PATH_SUFFIXES "qphix_codegen" "include/qphix_codegen"
+  PATHS "${QPHIX_DIR}")
+
+message(STATUS "Qphix: QMP_DIR=${QMP_DIR} QPHIX_DIR=${QPHIX_DIR}")
+message(STATUS "Qphix: ${TM_QMP_INCLUDE_DIRS} ${TM_QPHIX_INCLUDE_DIRS} ${TM_QMP_LIBS} ${TM_QPHIX_LIBS_CODEGEN} ${TM_QPHIX_LIBS_SOLVER}")
+
+find_package_handle_standard_args(
+  Qphix DEFAULT_MSG TM_QPHIX_LIBS_CODEGEN TM_QPHIX_LIBS_SOLVER TM_QPHIX_INCLUDE_DIRS TM_QMP_LIBS TM_QMP_INCLUDE_DIRS TM_QPHIX_CODEGEN_INCLUDE_DIRS)
+
+# Create the target only when every required component was located, so it
+# never carries *-NOTFOUND entries.
+if(Qphix_FOUND AND NOT TARGET tmlqcd::qphix)
+  add_library(tmlqcd::qphix INTERFACE IMPORTED)
+  set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_LINK_LIBRARIES
+    "${TM_QPHIX_LIBS_CODEGEN};${TM_QPHIX_LIBS_SOLVER};${TM_QMP_LIBS}")
+  set_target_properties(tmlqcd::qphix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
+    "${TM_QMP_INCLUDE_DIRS};${TM_QPHIX_INCLUDE_DIRS}/..;${TM_QPHIX_INCLUDE_DIRS};${TM_QPHIX_CODEGEN_INCLUDE_DIRS}")
+endif()
+
+mark_as_advanced(
+  TM_QPHIX_LIBS_CODEGEN TM_QPHIX_LIBS_SOLVER TM_QPHIX_INCLUDE_DIRS
+  TM_QPHIX_CODEGEN_INCLUDE_DIRS TM_QMP_LIBS TM_QMP_INCLUDE_DIRS)
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index fb8d7d818..145df156a 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -66,7 +66,7 @@
 #define ALIGN_BASE @ALIGN_BASE@
 
 /* Alignment compiler hint macro */
-#cmakedefine ALIGN @ALIGN@
+#define ALIGN @ALIGN@
 
 /* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */
 #define ALIGN_BASE32 @ALIGN_BASE32@
@@ -76,10 +76,10 @@
 
 /* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
    `char[]'. */
-#cmakedefine YYTEXT_POINTER
+//#cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#cmakedefine TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
+#define TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
 #cmakedefine TM_USE_GAUGE_COPY
@@ -111,17 +111,14 @@
 /* Using QUDA GPU */
 #cmakedefine TM_USE_QUDA 
 
-/* Using experimental QUDA version */
-#cmakedefine TM_QUDA_EXPERIMENTAL
-
-/* Using QUDA fermionic forces */
-#cmakedefine TM_QUDA_FERMIONIC_FORCES
-
 /* Using DDalphaAMG */
 #cmakedefine TM_USE_DDalphaAMG
 
 /* Using QPHIX */
 #cmakedefine TM_USE_QPHIX 
 
+#ifdef TM_USE_QPHIX
 /* Structure of Array length to use with QPhiX */
-#cmakedefine QPHIX_SOALEN @TMLQCD_QPHIX_SOALEN@
+#define QPHIX_SOALEN @TM_QPHIX_SOALEN@
+#endif
+
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index f65b22c48..7c45524de 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -98,11 +98,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_DERIV_MG_TUNE);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   verbose = 1;
   g_use_clover_flag = 0;
 
@@ -367,9 +362,6 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(const tm_ExitCode_t exit_code) {
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index 399362d0b..b68a5250f 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -113,11 +113,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_HMC_TM);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   strcpy(gauge_filename, "conf.save");
   strcpy(nstore_filename, "nstore_counter");
   strcpy(tmp_filename, ".conf.tmp");
@@ -591,9 +586,6 @@ int main(int argc, char *argv[]) {
 #endif
 
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(const tm_ExitCode_t exit_code) {
diff --git a/src/bin/invert.c b/src/bin/invert.c
index bb6f15c10..b5040ba88 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -114,11 +114,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_INVERT);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   DUM_DERI = 8;
   DUM_MATRIX = DUM_DERI + 5;
   NO_OF_SPINORFIELDS = DUM_MATRIX + 4;
@@ -457,9 +452,6 @@ int main(int argc, char *argv[]) {
   MPI_Finalize();
 #endif
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(tm_ExitCode_t exit_code) {
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index c1422858f..b6cbc13fa 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -83,11 +83,6 @@ int main(int argc, char *argv[]) {
 
   init_critical_globals(TM_PROGRAM_OFFLINE_MEASUREMENT);
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
   DUM_DERI = 8;
   DUM_MATRIX = DUM_DERI + 5;
   NO_OF_SPINORFIELDS = DUM_MATRIX + 3;
@@ -306,10 +301,6 @@ int main(int argc, char *argv[]) {
   MPI_Finalize();
 #endif
   return (0);
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
 
 static void usage(const tm_ExitCode_t exit_code) {
diff --git a/src/lib/deriv_Sb.c b/src/lib/deriv_Sb.c
index 1427c4af0..c48c8db38 100644
--- a/src/lib/deriv_Sb.c
+++ b/src/lib/deriv_Sb.c
@@ -83,10 +83,6 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
 
 #ifdef TM_USE_OMP
 #undef static
-#endif
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(derivSb)
 #endif
 
     if (ieo == 0) {
@@ -292,7 +288,4 @@ void deriv_Sb(const int ieo, spinor* const l, spinor* const k, hamiltonian_field
   } /* OpenMP closing brace */
 #endif
   tm_stopwatch_pop(&g_timers, 0, 1, "");
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(derivSb)
-#endif
 }
diff --git a/src/lib/deriv_Sb_D_psi.c b/src/lib/deriv_Sb_D_psi.c
index 61da4b9d2..3f3319efc 100644
--- a/src/lib/deriv_Sb_D_psi.c
+++ b/src/lib/deriv_Sb_D_psi.c
@@ -61,10 +61,6 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 
 #ifdef TM_USE_OMP
 #undef static
-#endif
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(derivSb)
 #endif
 
     /************** loop over all lattice sites ****************/
@@ -225,9 +221,6 @@ void deriv_Sb_D_psi(spinor* const l, spinor* const k, hamiltonian_field_t* const
 
       /****************** end of loop ************************/
     }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(derivSb)
-#endif
 
 #ifdef TM_USE_OMP
   } /*OpenMP closing brace */
diff --git a/src/lib/get_rectangle_staples.c b/src/lib/get_rectangle_staples.c
index eab6b9d9e..c8f69596b 100644
--- a/src/lib/get_rectangle_staples.c
+++ b/src/lib/get_rectangle_staples.c
@@ -34,9 +34,6 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
                                    const su3 *const *const gf) {
   su3 ALIGN tmp1, tmp2;
   const su3 *a, *b, *c, *d, *e;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(rectstaples)
-#endif
   _su3_zero((*v));
   for (int nu = 0; nu < 4; nu++) {
     if (mu != nu) {
@@ -178,7 +175,4 @@ void get_rectangle_staples_general(su3 *const v, const int x, const int mu,
       _su3_times_su3_acc((*v), tmp2, tmp1);
     }
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(rectstaples)
-#endif
 }
diff --git a/src/lib/get_staples.c b/src/lib/get_staples.c
index b33010f2c..ae7f19d09 100644
--- a/src/lib/get_staples.c
+++ b/src/lib/get_staples.c
@@ -35,10 +35,6 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(staples)
-#endif
-
   _su3_zero(*staple);
   for (int k = 0; k < 4; k++) {
     if (k != mu) {
@@ -61,9 +57,6 @@ void get_staples(su3* const staple, const int x, const int mu, const su3** in_ga
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(staples)
-#endif
 }
 
 void get_spacelike_staples(su3* const staple, const int x, const int mu,
@@ -72,10 +65,6 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(staples)
-#endif
-
   _su3_zero(*staple);
   for (int k = 1; k < 4; k++) {
     if (k != mu) {
@@ -98,9 +87,6 @@ void get_spacelike_staples(su3* const staple, const int x, const int mu,
       _su3d_times_su3_acc(*staple, *w1, st);
     }
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(staples)
-#endif
 }
 
 void get_timelike_staples(su3* const staple, const int x, const int mu,
@@ -109,10 +95,6 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
   su3 ALIGN st;
   const su3 *w1, *w2, *w3;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(staples)
-#endif
-
   _su3_zero(*staple);
   int k = 0;
   if (k != mu) {
@@ -134,7 +116,4 @@ void get_timelike_staples(su3* const staple, const int x, const int mu,
     /* v = v + w1^d * st */
     _su3d_times_su3_acc(*staple, *w1, st);
   }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(staples)
-#endif
 }
diff --git a/src/lib/operator/Hopping_Matrix_nocom.c b/src/lib/operator/Hopping_Matrix_nocom.c
index c7814bbb0..00c34c38b 100644
--- a/src/lib/operator/Hopping_Matrix_nocom.c
+++ b/src/lib/operator/Hopping_Matrix_nocom.c
@@ -48,8 +48,5 @@
 
 #define Hopping_Matrix Hopping_Matrix_nocom
 #define _NO_COMM 1
-#ifdef TM_KOJAK_INST
-#undef TM_KOJAK_INST
-#endif
 
 #include "Hopping_Matrix.c"
diff --git a/src/lib/operator/halfspinor_body.c b/src/lib/operator/halfspinor_body.c
index 3be906764..8286c89f3 100644
--- a/src/lib/operator/halfspinor_body.c
+++ b/src/lib/operator/halfspinor_body.c
@@ -30,10 +30,6 @@ halfspinor* restrict* phi ALIGN;
 halfspinor32* restrict* phi32 ALIGN;
 _declare_hregs();
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(hoppingmatrix)
-#endif
-
 #ifndef TM_USE_OMP
 s = k;
 _prefetch_spinor(s);
@@ -320,6 +316,3 @@ if (g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
 #endif
   }
 }
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(hoppingmatrix)
-#endif
diff --git a/src/lib/quda_interface.c b/src/lib/quda_interface.c
index a746a7261..0e55f5cb9 100644
--- a/src/lib/quda_interface.c
+++ b/src/lib/quda_interface.c
@@ -2059,9 +2059,7 @@ void _setQudaMultigridParam(QudaMultigridParam *mg_param) {
 
     // this is needed after QUDA commit
     // https://github.com/lattice/quda/commit/7903288629f0fcc474989fec5a1393ecc17a4b42
-#ifdef TM_QUDA_EXPERIMENTAL
     mg_param->n_vec_batch[level] = 1;
-#endif
 
     // set the MG EigSolver parameters, almost equivalent to
     // setEigParam from QUDA's multigrid_invert_test, except
@@ -3031,7 +3029,6 @@ void quda_mg_tune_params(void *spinorOut, void *spinorIn, const int max_iter) {
   free(tunable_params);
 }
 
-#ifdef TM_QUDA_FERMIONIC_FORCES
 void compute_cloverdet_derivative_quda(monomial *const mnl, hamiltonian_field_t *const hf,
                                        spinor *const X_o, spinor *const phi, int detratio) {
   tm_stopwatch_push(&g_timers, __func__, "");
@@ -3131,23 +3128,6 @@ void compute_ndcloverrat_derivative_quda(monomial *const mnl, hamiltonian_field_
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
-#else
-void compute_cloverdet_derivative_quda(monomial *const mnl, hamiltonian_field_t *const hf,
-                                       spinor *const X_o, spinor *const phi, int detratio) {
-  tm_debug_printf(0, 0,
-                  "Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with "
-                  "--enable-quda_fermionic=yes\n");
-  exit(1);
-}
-void compute_ndcloverrat_derivative_quda(monomial *const mnl, hamiltonian_field_t *const hf,
-                                         spinor **const Qup, spinor **const Qdn,
-                                         solver_params_t *solver_params, int detratio) {
-  tm_debug_printf(0, 0,
-                  "Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with "
-                  "--enable-quda_fermionic=yes\n");
-  exit(1);
-}
-#endif
 
 void compute_WFlow_quda(const double eps, const double tmax, const int traj, FILE *outfile) {
   tm_stopwatch_push(&g_timers, __func__, "");
diff --git a/src/lib/test/check_overlap.c b/src/lib/test/check_overlap.c
index b032f8cdd..d34e2ae5b 100644
--- a/src/lib/test/check_overlap.c
+++ b/src/lib/test/check_overlap.c
@@ -105,11 +105,6 @@ int main(int argc, char *argv[]) {
   char *gaugecksum = NULL;
   double plaquette_energy;
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
 #ifdef TM_USE_LEMON
   MPI_File fh;
   LemonWriter *lemonWriter;
@@ -389,7 +384,4 @@ int main(int argc, char *argv[]) {
     free_chi_dn_spinor_field();
   }
   return (0);
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(main)
-#endif
 }
diff --git a/src/lib/update_gauge.c b/src/lib/update_gauge.c
index af4730e01..7a7dd34a1 100644
--- a/src/lib/update_gauge.c
+++ b/src/lib/update_gauge.c
@@ -56,43 +56,23 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
 #endif
 
 #ifdef TM_USE_OMP
-#define static
-#pragma omp parallel
-  {
+#pragma omp parallel for
 #endif
-    int i, mu;
-    static su3 v, w;
-    su3 *z;
-    static su3adj deriv;
-    su3adj *xm;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(updategauge)
-#endif
-
-#ifdef TM_USE_OMP
-#undef static
-#endif
-
-#ifdef TM_USE_OMP
-#pragma omp for
-#endif
-    for (i = 0; i < VOLUME; i++) {
-      for (mu = 0; mu < 4; mu++) {
-        /* moment[i][mu] = h_{i,mu}^{alpha} */
-        xm = &hf->momenta[i][mu];
-        z = &hf->gaugefield[i][mu];
-        _su3adj_assign_const_times_su3adj(deriv, step, *xm);
-        exposu3(&w, &deriv);
-        restoresu3(&v, &w);
-        _su3_times_su3(w, v, *z);
-        restoresu3(&v, &w);
-        _su3_assign(*z, v);
-      }
+  for (int i = 0; i < VOLUME; i++) {
+    for (int mu = 0; mu < 4; mu++) {
+      /* moment[i][mu] = h_{i,mu}^{alpha} */
+      su3 v, w;
+      su3adj *xm = &hf->momenta[i][mu];
+      su3 *z = &hf->gaugefield[i][mu];
+      su3adj deriv;
+      _su3adj_assign_const_times_su3adj(deriv, step, *xm);
+      exposu3(&w, &deriv);
+      restoresu3(&v, &w);
+      _su3_times_su3(w, v, *z);
+      restoresu3(&v, &w);
+      _su3_assign(*z, v);
     }
-
-#ifdef TM_USE_OMP
-  } /* OpenMP parallel closing brace */
-#endif
+  }
 
 #ifdef TM_USE_MPI
   /* for parallelization */
@@ -115,7 +95,4 @@ void update_gauge(const double step, hamiltonian_field_t *const hf) {
 
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(updategauge)
-#endif
 }
diff --git a/src/lib/xchange/xchange_2fields.c b/src/lib/xchange/xchange_2fields.c
index c311bf908..46496a0ba 100644
--- a/src/lib/xchange/xchange_2fields.c
+++ b/src/lib/xchange/xchange_2fields.c
@@ -52,10 +52,6 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   int ix = 0;
 #endif
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange2fields)
-#endif
-
 #ifdef TM_USE_MPI
 
   /* send the data to the neighbour on the left */
@@ -237,8 +233,5 @@ void xchange_2fields(spinor* const l, spinor* const k, const int ieo) {
   MPI_Waitall(reqcount, requests, status);
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange2fields)
-#endif
 }
 #endif /*  TM_NON_BLOCKING */
diff --git a/src/lib/xchange/xchange_halffield.c b/src/lib/xchange/xchange_halffield.c
index 3948aa1ca..0dd1effca 100644
--- a/src/lib/xchange/xchange_halffield.c
+++ b/src/lib/xchange/xchange_halffield.c
@@ -176,9 +176,6 @@ void xchange_halffield() {
   int reqcount = 16;
 #endif
 
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangehalf)
-#endif
   /* send the data to the neighbour on the right in t direction */
   /* recieve the data from the neighbour on the left in t direction */
   MPI_Isend((void*)(sendBuffer), LX * LY * LZ * 12 / 2, MPI_DOUBLE, g_nb_t_up, 81, g_cart_grid,
@@ -245,10 +242,6 @@ void xchange_halffield() {
   MPI_Waitall(reqcount, requests, status);
 #endif /* MPI */
   return;
-
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangehalf)
-#endif
 }
 
 #endif /* def (TM_USE_SHMEM || TM_PERSISTENT) */
@@ -265,9 +258,6 @@ void xchange_halffield32() {
   int reqcount = 12;
 #elif defined TM_PARALLELXYZT
   int reqcount = 16;
-#endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchangehalf32)
 #endif
 
   /* send the data to the neighbour on the right in t direction */
@@ -336,8 +326,5 @@ void xchange_halffield32() {
   MPI_Waitall(reqcount, requests, status);
 #endif /* MPI */
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchangehalf32)
-#endif
 }
 #endif /* defined TM_USE_HALFSPINOR */
diff --git a/src/lib/xchange/xchange_lexicfield.c b/src/lib/xchange/xchange_lexicfield.c
index 56cc4315c..282ca8dfa 100644
--- a/src/lib/xchange/xchange_lexicfield.c
+++ b/src/lib/xchange/xchange_lexicfield.c
@@ -60,12 +60,8 @@ void xchange_lexicfield(spinor* const l) {
 #elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield)
-#endif
 
 #ifdef TM_USE_MPI
-
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
   MPI_Isend((void*)l, 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
@@ -135,9 +131,6 @@ void xchange_lexicfield(spinor* const l) {
 
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield)
-#endif
 }
 
 /* Here comes the naive version */
@@ -149,12 +142,8 @@ void xchange_lexicfield(spinor* const l) {
 #ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield)
-#endif
 
 #ifdef TM_USE_MPI
-
   MPI_Status status;
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
@@ -214,9 +203,6 @@ void xchange_lexicfield(spinor* const l) {
 #endif
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield)
-#endif
 }
 
 #endif
@@ -239,12 +225,8 @@ void xchange_lexicfield32(spinor32* const l) {
 #elif defined TM_PARALLELXYZT
   int reqcount = 16;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield32)
-#endif
 
 #ifdef TM_USE_MPI
-
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
   MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
@@ -314,9 +296,6 @@ void xchange_lexicfield32(spinor32* const l) {
 
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield32)
-#endif
 }
 
 /* Here comes the naive version */
@@ -328,12 +307,8 @@ void xchange_lexicfield32(spinor32* const l) {
 #ifdef TM_PARALLELXYZT
   int x0 = 0, x1 = 0, x2 = 0, ix = 0;
 #endif
-#ifdef TM_KOJAK_INST
-#pragma pomp inst begin(xchange_lexicfield32)
-#endif
 
 #ifdef TM_USE_MPI
-
   MPI_Status status;
   /* send the data to the neighbour on the left */
   /* recieve the data from the neighbour on the right */
@@ -394,9 +369,6 @@ void xchange_lexicfield32(spinor32* const l) {
 #endif
 #endif
   return;
-#ifdef TM_KOJAK_INST
-#pragma pomp inst end(xchange_lexicfield32)
-#endif
 }
 
 #endif

From a24536ddff5eb622b1d87ca821adc70b482b9435 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Thu, 19 Feb 2026 14:39:07 +0100
Subject: [PATCH 07/19] Added basic documentation

---
 .ci/include/cscs/01-test-templates.yml        |  37 ++--
 .../repo/packages/lemonio/package.py          |   2 +-
 .../repo/packages/tmlqcd/package.py           | 113 ++++++++++
 CMakeLists.txt                                |  19 +-
 README.md                                     | 131 ++++++++++++
 cmake/FindCLime.cmake                         |  19 +-
 cmake/tmlqcd_config_internal.h.in             |   2 +-
 doc/install.tex                               | 201 ++++++++++--------
 install-sh                                    |   0
 quda_gauge_paths.inc                          | 158 --------------
 10 files changed, 397 insertions(+), 285 deletions(-)
 create mode 100644 .ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
 create mode 100644 README.md
 delete mode 100644 install-sh
 delete mode 100644 quda_gauge_paths.inc

diff --git a/.ci/include/cscs/01-test-templates.yml b/.ci/include/cscs/01-test-templates.yml
index 9a4a8da45..9b3a1c414 100644
--- a/.ci/include/cscs/01-test-templates.yml
+++ b/.ci/include/cscs/01-test-templates.yml
@@ -8,30 +8,29 @@ include:
   image: ${UENV_NAME}/${UENV_VERSION}:${UENV_TAG}
   variables:
     WITH_UENV_VIEW: "default"
-    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
-    LDFLAGS: "-fopenmp"
+#    CFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
+#    CXXFLAGS: "-O3 -fopenmp -mtune=neoverse-v2 -mcpu=neoverse-v2"
+#    LDFLAGS: "-fopenmp"
   before_script:
     - |
       if test "${SLURM_PROCID}" -eq "0"; then
         export CC="$(which mpicc)"
         export CXX="$(which mpicxx)"
-        mkdir -p install_dir
-        autoconf
-        ./configure \
-          --enable-quda_experimental \
-          --enable-mpi \
-          --enable-omp \
-          --with-mpidimension=4 \
-          --disable-sse2 \
-          --disable-sse3 \
-          --enable-alignment=32 \
-          --with-qudadir="/user-environment/env/default" \
-          --with-limedir="/user-environment/env/default" \
-          --with-lemondir="/user-environment/env/default" \
-          --with-lapack="-lopenblas -L/user-environment/env/default/lib" \
-          --with-cudadir="/user-environment/env/default/lib64" \
-          --prefix="$(pwd)/install_dir"
+        mkdir -p build_dir
+        cd build_dir
+        cmake -DCMAKE_PREFIX_PATH="/user-environment/env/default" \
+              -DTM_USE_MPI=ON \
+              -DTM_USE_CUDA=ON \
+              -DCMAKE_C_FLAGS="-O3 -mtune=neoverse-v2 -mcpu=neoverse-v2" \
+              -DCMAKE_CXX_FLAGS="-O3 -mtune=neoverse-v2 -mcpu=neoverse-v2" \
+              -DCMAKE_CUDA_ARCHITECTURES=90a \
+              -DTM_USE_OMP=ON \
+              -DTM_USE_QUDA=ON \
+              -DTM_USE_LEMON=ON \
+              -DTM_ENABLE_ALIGNMENT=32 \
+              -DTM_USE_GAUGE_COPY=ON \
+              -DTM_USE_HALFSPINOR=ON \
+              -DCMAKE_INSTALL_PREFIX=../install_dir ..
         make
         make install
         touch preparation-done-${CI_JOB_ID}
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
index 7508b4b79..4d7340a03 100755
--- a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/lemonio/package.py
@@ -28,7 +28,7 @@ class CMakeBuilder(cmake.CMakeBuilder):
     def cmake_args(self):
         spec = self.spec
         args = [
-            self.define_from_variant("DBUILD_SHARED_LIBS" "shared"),
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),  # -D is added by Spack
         ]
         return args
 
diff --git a/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
new file mode 100644
index 000000000..13fb3238e
--- /dev/null
+++ b/.ci/uenv-recipes/tmlqcd/daint-gh200/repo/packages/tmlqcd/package.py
@@ -0,0 +1,113 @@
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack_repo.builtin.build_systems import cmake
+from spack_repo.builtin.build_systems.cmake import CMakePackage, generator
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+
+from spack.package import *
+
+class Tmlqcd(CMakePackage, CudaPackage, ROCmPackage):
+    """Spack recipe for building tmLQCD with its CMake build system."""
+
+    homepage = "https://www.itkp.uni-bonn.de/~urbach/software.html"
+    url = "https://github.com/etmc/tmLQCD/archive/refs/tags/rel-5-1-6.tar.gz"
+    git = "https://github.com/etmc/tmLQCD.git"
+    license("GPL-3.0-or-later")
+
+    maintainers("mtaillefumier")
+    version("master", branch="master")
+
+    variant("lemon", default=False, description="Enable the lemon backend")
+    variant("mpi", default=True, description="Enable mpi support")
+    variant("DDalphaAMG", default=False, description="Enable DDalphaAMG support")
+    variant("openmp", default=True, description="Enable OpenMP")
+    variant("fftw", default=True, description="Enable FFTW interface")
+    variant(
+        "persistent_mpi",
+        default=True,
+        description="Enable persistent mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant(
+        "nonblocking_mpi",
+        default=True,
+        description="Enable non-blocking mpi calls for spinor and gauge fields",
+        when="+mpi",
+    )
+    variant("fixedvolume", default=True, description="Enable fixed volume at compile time")
+    variant(
+        "alignment",
+        default="auto",
+        values=("none", "auto", "16", "32", "64"),
+        description="Automatically or explicitly align arrays",
+    )
+    variant("gauge_copy", default=True, description="Enable gauge field copy")
+    variant("half_spinor", default=True, description="Use a Dirac operator with half-spinor")
+    variant("shared", default=False, description="Enable shared library")
+    variant("shmem", default=False, description="Use shmem API")
+    variant("quda", default=True, description="Enable the QUDA library", when="+cuda",)
+    variant("quda", default=True, description="Enable the QUDA library", when="+rocm",)
+    variant(
+        "QPhiX", default=False, description="Enable the QPhiX library for Intel Xeon and Xeon Phis"
+    )
+    variant(
+        "mpi_dimensions",
+        default="4",
+        values=("1", "2", "3", "4", "x", "xy", "xyz"),
+        description="number of dimensions the mpi processes are distributed. the default is parallelization over all four dimensions txyz",
+        when="+mpi",
+    )
+
+    generator("ninja")
+
+    # language dependencies
+    depends_on("c", type="build")
+    depends_on("cxx", type="build")
+    depends_on("fortran", type="build")
+
+    # a GPU backend is only meaningful with an explicit target architecture
+    conflicts("+cuda", when="cuda_arch=none")
+    conflicts("+rocm", when="amdgpu_target=none")
+
+    # hard dependencies
+    depends_on("c-lime")
+    depends_on("blas")
+    depends_on("lapack")
+    depends_on("pkgconfig", type="build")
+
+    # optional dependencies, pulled in by the matching variant
+    depends_on("mpi", when="+mpi")
+    depends_on("lemon-io", when="+lemon")
+
+    with when("+quda"):
+        depends_on(
+            "quda+twisted_mass+twisted_clover+clover+ndeg_twisted_clover+ndeg_twisted_mass+wilson+qdp+staggered+usqcd+multigrid"
+        )
+        depends_on("quda+mpi", when="+mpi")
+        depends_on("quda+cuda", when="+cuda")
+        depends_on("quda+rocm", when="+rocm")
+        depends_on("quda+nvshmem", when="+shmem")
+
+    depends_on("fftw-api@3", when="+fftw")
+
+class CMakeBuilder(cmake.CMakeBuilder):
+    def cmake_args(self):
+        """Return the CMake cache definitions derived from the package variants."""
+        args = [
+            self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
+            self.define_from_variant("TM_USE_LEMON", "lemon"),
+            self.define_from_variant("TM_USE_MPI", "mpi"),
+            self.define_from_variant("TM_USE_QUDA", "quda"),
+            self.define_from_variant("TM_USE_CUDA", "cuda"),
+            self.define_from_variant("TM_USE_HIP", "rocm"),
+            self.define_from_variant("TM_USE_FFTW", "fftw"),
+            self.define_from_variant("TM_FIXEDVOLUME", "fixedvolume"),
+            self.define_from_variant("TM_USE_OMP", "openmp"),
+            self.define_from_variant("TM_USE_SHMEM", "shmem"),
+            self.define_from_variant("TM_USE_GAUGE_COPY", "gauge_copy"),
+            self.define_from_variant("TM_USE_HALFSPINOR", "half_spinor"),
+        ]
+        return args
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 803feeef7..d363e407c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,8 +80,8 @@ option(TM_USE_MPI "Enable MPI support" OFF)
 option(TM_USE_CUDA "Enable QUDA support" OFF)
 option(TM_USE_HIP "Enable HIP support" OFF)
 option(TM_USE_DDalphaAMG "Enable DDalphaAMG support" OFF)
-option(TM_USE_OMP "Enable openMP" ON)
-option(TM_FIXEDVOLUME "fix volume at compile time" OFF)
+option(TM_USE_OMP "Enable OpenMP" ON)
+option(TM_FIXEDVOLUME "Fix volume at compile time" OFF)
 set(TM_ENABLE_ALIGNMENT
     "auto"
     CACHE
@@ -95,7 +95,7 @@ set_property(CACHE TM_ENABLE_ALIGNMENT PROPERTY STRINGS "auto" "none" "16" "32"
 option(TM_USE_OPTIMIZATION "enable optimisation" ON)
 option(TM_USE_GAUGE_COPY "Enable use of a copy of the gauge field" ON)
 option(TM_USE_HALFSPINOR "Use a Dirac Op. with halfspinor exchange" ON)
-option(TM_USE_QPHIX "enable QPhiX" OFF)
+option(TM_USE_QPHIX "Enable QPhiX" OFF)
 option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
@@ -106,7 +106,7 @@ cmake_dependent_option(
   TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
   "TM_USE_MPI" OFF)
 cmake_dependent_option(
-  TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gaug" ON
+  TM_NONBLOCKING_MPI "Use non-blocking MPI calls for spinor and gauge" ON
   "TM_USE_MPI" OFF)
 
 # need to do it properly. Just a place holder
@@ -132,8 +132,6 @@ find_package(BLAS REQUIRED)
 find_package(LAPACK REQUIRED)
 set(TM_LAPACK ON)
 find_package(FLEX REQUIRED)
-# do we need bison ?
-find_package(BISON REQUIRED)
 
 set(PACKAGE_NAME ${PROJECT_DESCRIPTION})
 set(PACKAGE_VERSION ${PROJECT_VERSION})
@@ -201,12 +199,6 @@ set(TM_USE_LIME ON)
 
 if(TM_USE_QUDA)
   find_package(QUDA REQUIRED config)
-  if(TM_USE_QUDA_EXPERIMENTAL)
-    set(TM_QUDA_EXPERIMENTAL ON)
-  endif()
-  if(TM_QUDA_FERMIONIC_FORCES)
-    set(TM_QUDA_FERMIONIC_FORCES ON)
-  endif()
 endif()
 
 if(TM_USE_SHMEM)
@@ -293,6 +285,9 @@ if(TM_USE_MPI)
   endif()
 endif()
 
+if (TM_USE_HALFSPINOR AND NOT TM_USE_GAUGE_COPY)
+  message(FATAL_ERROR "The TM_USE_GAUGE_COPY option should also be set to ON when TM_USE_HALFSPINOR is ON")
+endif()
 # keep the autotool config.h header.
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
                "${PROJECT_BINARY_DIR}/tmlqcd_config_internal.h" @ONLY)
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..84f1e2172
--- /dev/null
+++ b/README.md
@@ -0,0 +1,131 @@
+The software ships with a CMake environment, which will configure and build the
+programmes. It is recommended to configure and build the executables in a
+separate build directory. This also allows to have several builds with different
+options from the same source code directory.
+
+## Prerequisites
+
+In order to compile the programmes the `LAPACK` library (fortran version) needs to be installed. CMake will search for the
+library in all default directories. Also the latest version (tested is version
+1.2.3) of `C-LIME` must be available, which is used as
+a packaging scheme to read and write gauge configurations and propagators to
+files.
+
+## Configuring the hmc package
+<a id="sec-config"></a>
+
+The build system uses CMake to configure and build the hmc package. The
+following list gives all options (OFF by default unless specified):
+- `CMAKE_POSITION_INDEPENDENT_CODE`: Build a position independent
+  code. **ON** by default.
+- `BUILD_SHARED_LIBS`: Build the shared version of the hmc library.
+- `TM_USE_FFTW`: Enable fftw support. 
+- `TM_USE_CUDA`: Enable CUDA support.
+- `TM_USE_HIP`: Enable HIP support (AMD or NVidia GPUs)
+- `TM_USE_DDalphaAMG`: Enable DDalphaAMG support.
+- `TM_USE_LEMON`: Use the lemon io library.
+- `TM_USE_OMP`: Enable OpenMP (**ON** by default)
+- `TM_FIXEDVOLUME`: Fix volume at compile time.
+- `TM_ENABLE_ALIGNMENT`: Automatically or explicitly align arrays to a given
+  number of bytes. Allowed values: auto, none, 16, 32, 64.
+- `TM_USE_GAUGE_COPY`: Enable use of a copy of the gauge field (**ON**
+  by default). See the Dirac operator section of the LaTeX manual (`doc/`) for
+  details on this option. It will increase the memory requirement of the code.
+- `TM_USE_HALFSPINOR`: Use a Dirac Op. with halfspinor exchange (**ON** by default).
+  Requires `TM_USE_GAUGE_COPY=ON`. See the Dirac operator section of the LaTeX manual (`doc/`).
+- `TM_USE_QUDA`: Enable QUDA support.
+- `TM_USE_SHMEM`: Use shmem API.
+- `TM_ENABLE_WARNINGS`: Enable all warnings (**ON** by default).
+- `TM_ENABLE_TESTS`: Enable tests.
+- `TM_USE_QPHIX`: Enable QPhiX.
+  - `TM_QPHIX_SOALEN`: QPhiX specific parameter (default is 4)
+  - **QPHIX_DIR**: Directory where QPhiX is installed.
+    The QPhiX current CMake build system does not export all information (
+    include and lib directories) that are needed to compile hmc.
+  - **QMP_DIR**: Directory where QMP is installed (
+    QPhiX dependency).
+    The QPhiX current CMake build system does not export all information about the
+    include and lib directories nor its dependencies (QMP in that case).
+- `TM_USE_MPI`: Enable MPI support.
+  - `TM_PERSISTENT_MPI`: Use persistent MPI calls for halfspinor.
+  - `TM_NONBLOCKING_MPI`: Use non-blocking MPI calls for spinor and
+    gauge.
+  - `TM_MPI_DIMENSION`: Use $n$ dimensional parallelisation ($XYZT$)
+    [default=4]. The number of parallel directions can be specified. $1, 2, 3$ and $4$
+    dimensional parallelisation is supported.
+  - `TM_USE_LEMON`: Use the lemon io library.
+
+The following minimal list of commands will configure and build the hmc package with
+minimal dependencies
+
+```bash
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path ..
+make -j
+make install
+```
+
+These instructions assume that the `c-lime` package is installed in `/my_c_line_path`. The `CMAKE_PREFIX_PATH` variable is a
+semicolon-separated list of paths in which CMake searches for the installed
+dependencies.
+
+Adding `-DTM_USE_MPI=ON` will enable MPI support with parallelization
+over spatial and temporal dimensions. The command line is then
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path -DTM_USE_MPI=ON ..
+```
+
+We can combine it with the lemon-io library (installed in `/my_lemon_path`)
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON ..
+```
+
+`QUDA` support (installed in `/my_quda_path`) can be added with
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 ..
+```
+
+Note that the command assumes that QUDA is compiled with `CUDA` support. AMD GPUs
+are also supported after replacing `-DTM_USE_CUDA=ON` with
+`-DTM_USE_HIP=ON` and compiling `QUDA` with `HIP` support. The ROCm architecture is defined by the variable
+`CMAKE_HIP_ARCHITECTURES=gfxxxx`.
+
+`QPhiX` and/or `DDalphaAMG` support can be added with
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path;/my_path_ddalphaamg" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 \
+      -DTM_USE_QPHIX=ON \
+      -DQPHIX_DIR=/my_qphix_dir \
+      -DTM_USE_DDalphaAMG=ON \
+      -DQMP_DIR=/my_qmp_dir \
+      -DTM_USE_OMP=ON ..
+```
+
+`QPhiX` cmake config support is incomplete and requires both the QPhiX
+and QMP installation directories to work properly.
+
+`CMake` has several relevant specific options that control the build. Compiler
+options are defined by the variables `CMAKE_C_FLAGS` and `CMAKE_CXX_FLAGS`. CUDA and HIP compilation options are controlled by their
+equivalent `CMAKE_{CUDA/HIP}_FLAGS`.
+
+Adding for instance `-GNinja` to the `CMake` command line will use
+ninja instead of make.
diff --git a/cmake/FindCLime.cmake b/cmake/FindCLime.cmake
index 0c3eabe48..c9d94ea95 100644
--- a/cmake/FindCLime.cmake
+++ b/cmake/FindCLime.cmake
@@ -1,27 +1,26 @@
 include(FindPackageHandleStandardArgs)
 
 find_library(
-  TMLQCD_CLIME_LIBRARIES
+  TM_CLIME_LIBRARIES
   NAMES lime
   PATH_SUFFIXES "lib" "lib64")
 
 find_path(
-  TMLQCD_CLIME_INCLUDE_DIRS
+  TM_CLIME_INCLUDE_DIRS
   NAMES lime.h
   PATH_SUFFIXES "include" "include/${_pacakge_name}" "${_package_name}")
 
-message("${TMLQCD_CLIME_INCLUDE_DIRS}")
-find_package_handle_standard_args(CLime DEFAULT_MSG TMLQCD_CLIME_LIBRARIES
-                                  TMLQCD_CLIME_INCLUDE_DIRS)
+find_package_handle_standard_args(CLime DEFAULT_MSG TM_CLIME_LIBRARIES
+                                  TM_CLIME_INCLUDE_DIRS)
 
 if(NOT TARGET tmlqcd::clime)
   add_library(tmlqcd::clime INTERFACE IMPORTED)
   set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_LINK_LIBRARIES
-                                                 "${TMLQCD_CLIME_LIBRARIES}")
+                                                 "${TM_CLIME_LIBRARIES}")
   set_target_properties(tmlqcd::clime PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
-                                                 "${TMLQCD_CLIME_INCLUDE_DIRS}")
+                                                 "${TM_CLIME_INCLUDE_DIRS}")
 endif()
 
-set(TMLQCD_CLIME_FOUND ON)
-mark_as_advanced(TMLQCD_CLIME_FOUND TMLQCD_CLIME_LIBRARIES
-                 TMLQCD_CLIME_INCLUDE_DIRS)
+set(TM_CLIME_FOUND ON)
+mark_as_advanced(TM_CLIME_FOUND TM_CLIME_LIBRARIES
+                 TM_CLIME_INCLUDE_DIRS)
diff --git a/cmake/tmlqcd_config_internal.h.in b/cmake/tmlqcd_config_internal.h.in
index 145df156a..7c11d0446 100644
--- a/cmake/tmlqcd_config_internal.h.in
+++ b/cmake/tmlqcd_config_internal.h.in
@@ -79,7 +79,7 @@
 //#cmakedefine YYTEXT_POINTER
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#define TM_FILE_OFFSET_BITS @TMLQCD_FILE_OFFSET_BITS@
+#define TM_FILE_OFFSET_BITS @TM_FILE_OFFSET_BITS@
 
 /* Construct an extra copy of the gauge fields */
 #cmakedefine TM_USE_GAUGE_COPY
diff --git a/doc/install.tex b/doc/install.tex
index e4d86c2da..9d5e6f887 100644
--- a/doc/install.tex
+++ b/doc/install.tex
@@ -1,103 +1,136 @@
-The software ships with a GNU autoconf environment and a configure
-script, which will generate GNU Makefiles to build the programmes. It
-is supported and recommended to configure and build the executables in
-a separate build directory. This also allows to have several builds with
-different options from the same source code directory. 
+The software ships with a CMake environment, which will configure and build the
+programmes. It is recommended to configure and build the executables in a
+separate build directory. This also allows to have several builds with different
+options from the same source code directory.
 
 \subsection{Prerequisites}
 
-In order to compile the programmes the {\ttfamily
-  LAPACK}~\cite{lapack:web} library (fortran version) needs to be
-installed. In addition it must be known which linker options are
-needed to link against {\ttfamily LAPACK}, e.g. {\ttfamily
-  -Lpath-to-lapack -llapack  -lblas}. Also a the latest
-version (tested is version 1.2.3) of {\ttfamily
-  C-LIME}~\cite{lime:web} must be available, which is used as a
-packaging scheme to read and write gauge configurations and
-propagators to files.
+In order to compile the programmes the {\ttfamily LAPACK}~\cite{lapack:web}
+library (fortran version) needs to be installed. CMake will search for the
+library in all default directories. Also the latest version (tested is version
+1.2.3) of {\ttfamily C-LIME}~\cite{lime:web} must be available, which is used as
+a packaging scheme to read and write gauge configurations and propagators to
+files.
 
 \subsection{Configuring the hmc package}
 \label{sec:config}
 
-In order to get a simple configuration of the hmc package it is enough
-to just type 
-\begin{verbatim}
-path-to-src-code/configure   --with-lime=<path-to-lime> \
-     --with-lapack=<linker-flags> CC=<mycc> \
-     F77=<myf77> CFLAGS=<c-compiler flags>
-\end{verbatim}
-in the build directory. If 
-{\ttfamily CC, F77} and {\ttfamily CFLGAS} are not specified,
-{\ttfamily configure} will guess them.
-
-The code was successfully compiled and run at least on the following
-platforms: i686 and compatible, x64 and compatible, IBM Regatta
-systems, IBM Blue Gene/L, IBM Blue Gene/P, SGI Altix and SGI PC
-clusters, powerpc clusters.
-
-The configure script accepts certain options to influence the building
-procedure. One can get an overview over all supported options with
-{\ttfamily configure --help}. There are {\ttfamily enable|disable}
-options switching on and off optional features and {\ttfamily
-  with|without} switches usually related to optional packages. In the
-following we describe the most important of them (check {\ttfamily
-  configure --help} for the defaults and more options):
-
+The build system uses CMake to configure and build the hmc package. The
+following list gives all options (OFF by default unless specified):
 \begin{itemize}
-\item {\ttfamily --enable-mpi}:\\
-  This option switches on the support for MPI. On certain platforms it
-  automatically chooses the correct parallel compiler or searches for
-  a command {\ttfamily mpicc} in the search path.
-
-\item {\ttfamily --enable-gaugecopy}:\\
-  See section \ref{sec:dirac} for details on this option. It will
+\item {\ttfamily CMAKE\_POSITION\_INDEPENDENT\_CODE}: Build a position independent
+  code. ON by default.
+\item {\ttfamily BUILD\_SHARED\_LIBS}: Build the shared version of the hmc library.
+\item {\ttfamily TM\_USE\_FFTW}: Enable fftw support. 
+\item {\ttfamily TM\_USE\_CUDA}: Enable CUDA support.
+\item {\ttfamily TM\_USE\_HIP}: Enable HIP support (AMD or NVidia GPUs)
+\item {\ttfamily TM\_USE\_DDalphaAMG}: Enable DDalphaAMG support.
+\item {\ttfamily TM\_USE\_LEMON}: Use the lemon io library.
+\item {\ttfamily TM\_USE\_OMP}: Enable OpenMP ({\bf ON} by default)
+\item {\ttfamily TM\_FIXEDVOLUME}: Fix volume at compile time.
+\item {\ttfamily TM\_ENABLE\_ALIGNMENT}: Automatically or explicitly align arrays to a given
+  number of bytes. Allowed values: auto, none, 16, 32, 64.
+\item {\ttfamily TM\_USE\_GAUGE\_COPY}: Enable use of a copy of the gauge field (ON
+  by default). See section \ref{sec:dirac} for details on this option. It will
   increase the memory requirement of the code.
+\item {\ttfamily TM\_USE\_HALFSPINOR}: Use a Dirac Op. with halfspinor exchange (ON
+  by default). Requires {\ttfamily TM\_USE\_GAUGE\_COPY} to be ON. See sub-section \ref{sec:dirac} for details.
+\item {\ttfamily TM\_USE\_QUDA}: Enable QUDA support.
+\item {\ttfamily TM\_USE\_SHMEM}: Use shmem API.
+\item {\ttfamily TM\_ENABLE\_WARNINGS}: Enable all warnings (ON by default).
+\item {\ttfamily TM\_ENABLE\_TESTS}: Enable tests.
+\item {\ttfamily TM\_USE\_QPHIX}: Enable QPhiX.
+  \begin{itemize}
+  \item {\ttfamily TM\_QPHIX\_SOALEN}: QPhiX specific parameter (default is 4)
+  \item \textcolor{red}{{\ttfamily QPHIX\_DIR}}: Directory where QPhiX is installed.
+    The QPhiX current CMake build system does not export all information (
+    include and lib directories) that are needed to compile hmc.
+  \item \textcolor{red}{\ttfamily QMP\_DIR}: Directory where QMP is installed (
+    QPhiX dependency).
+    The QPhiX current CMake build system does not export all information about the
+    include and lib directories nor its dependencies (QMP in that case).
+  \end{itemize}
+\item {\ttfamily TM\_USE\_MPI}: Enable MPI support.
+  \begin{itemize}
+  \item {\ttfamily TM\_PERSISTENT\_MPI}: Use persistent MPI calls for halfspinor.
+  \item {\ttfamily TM\_NONBLOCKING\_MPI}: Use non-blocking MPI calls for spinor and
+    gauge.
+  \item {\ttfamily TM\_MPI\_DIMENSION}: Use $n$ dimensional parallelisation ($XYZT$)
+    [default=4]. The number of parallel directions can be specified. $1, 2, 3$ and $4$
+    dimensional parallelisation is supported.
+  \item {\ttfamily TM\_USE\_LEMON}: Use the lemon io library.
+  \end{itemize}
+\end{itemize}
 
-\item {\ttfamily --enable-halfspinor}:\\
-  If this option is enabled the Dirac operator using half spinor
-  fields is used. See sub-section \ref{sec:dirac} for details. If this
-  feature is switched on, also the gauge copy feature is switched
-  on automatically. 
-
-%\item {\ttfamily --enable-shmem}:\\
-%  Use shared memory API instead of MPI for the communication of spinor
-%  fields. This is currently only usable on the Munich Altix machine.
-
-\item {\ttfamily --with-mpidimension=n}:\\
-  This option has only effect if the preceding one is switched
-  on. The number of parallel directions can be specified. 1,2,3 and 4
-  dimensional parallelisation is supported.
-
-\item {\ttfamily --with-lapack="<linker flags>"}:\\
-  the code requires lapack to be linked. All linker flags necessary
-  to do so must be specified here. Note, that {\ttfamily LIBS="..."}
-  works similar.
+The following minimal list of commands will configure and build the hmc package with
+minimal dependencies
+\begin{verbatim}
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path ..
+make -j
+make install
+\end{verbatim}
 
-\item {\ttfamily --with-limedir=<dir>}:\\
-  Tells configure where to find the lime package, which is required for
-  the build of the HMC. It is used for the ILDG file format.
- 
-\end{itemize}
+These instructions assume that the {\ttfamily c-lime} package is installed in {\ttfamily
+  /my\_c\_line\_path}. The {\ttfamily CMAKE\_PREFIX\_PATH} variable is a
+semicolon-separated list of paths in which CMake searches for the installed
+dependencies.
 
-The configure script will guess at the very beginning on which
-platform the build is done. In case this fails or a cross compilation
-must be performed please use the option {\ttfamily --host=HOST}. For
-instance in order to compile for the BG/P one needs to specify
-{\ttfamily --host=ppc-ibm-bprts --build=ppc64-ibm-linux}. 
+Adding {\ttfamily -DTM\_USE\_MPI=ON} will enable MPI support with parallelization
+over spatial and temporal dimensions. The command line is then
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path -DCMAKE_PREFIX_PATH=/my_c_line_path -DTM_USE_MPI=ON ..
+\end{verbatim}
+We can combine it with the lemon-io library (installed in {\ttfamily /my\_lemon\_path})
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON ..
+\end{verbatim}
 
-For certain architectures like the Blue Gene systems there are
-{\ttfamily README.arch} files in the top source directory with
-example configure calls.
+{\ttfamily QUDA} support (installed in {\ttfamily /my\_quda\_path}) can be added with
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 ..
+\end{verbatim}
+Note that the command assumes that QUDA is compiled with CUDA support. AMD GPUs
+are also supported after replacing {\ttfamily -DTM\_USE\_CUDA=ON} with
+{\ttfamily -DTM\_USE\_HIP=ON} and compiling {\ttfamily QUDA} with {\ttfamily
+  HIP} support. The {\ttfamily ROCm} architecture is defined by the variable
+{\ttfamily CMAKE\_HIP\_ARCHITECTURES=gfxxxx}.
 
-\subsection{Building and Installing}
+{\ttfamily QPhiX} and/or {\ttfamily DDalphaAMG} support can be added with
+\begin{verbatim}
+cmake -DCMAKE_INSTALL_PREFIX=/my_path \
+      -DCMAKE_PREFIX_PATH="/my_c_line_path;/my_lemon_path;/my_quda_path;/my_path_ddalphaamg" \
+      -DTM_USE_MPI=ON \
+      -DTM_USE_LEMON=ON \
+      -DTM_USE_QUDA=ON \
+      -DTM_USE_CUDA=ON \
+      -DCMAKE_CUDA_ARCHITECTURES=90 \
+      -DTM_USE_QPHIX=ON \
+      -DQPHIX_DIR=/my_qphix_dir \
+      -DTM_USE_DDalphaAMG=ON \
+      -DQMP_DIR=/my_qmp_dir \
+      -DTM_USE_OMP=ON ..
+\end{verbatim}
+{\ttfamily QPhiX} cmake config support is incomplete and requires both the {\ttfamily QPhiX}
+and {\ttfamily QMP} installation directories to work properly.
 
-After successfully configuring the package the code can be build by
-simply typing {\ttfamily make} in the build directory. This will
-compile the standard executables. Typing {\ttfamily make install} will
-copy these executables into the install directory. The default install
-directory is {\ttfamily \$HOME/bin}, which can be influenced e.g. with
-the {\ttfamily --prefix} option to {\ttfamily configure}. 
+CMake has several relevant specific options that control the build. Compiler
+options are defined by the variables {\ttfamily CMAKE\_C\_FLAGS} and {\ttfamily
+  CMAKE\_CXX\_FLAGS}. CUDA and HIP compilation options are controlled by their
+equivalent {\ttfamily CMAKE\_\{CUDA/HIP\}\_FLAGS}.
 
+Adding for instance {\ttfamily -GNinja} to the {\ttfamily CMake} command line will use
+{\ttfamily ninja} instead of {\ttfamily make}.
 
 %%% Local Variables: 
 %%% mode: latex
diff --git a/install-sh b/install-sh
deleted file mode 100644
index e69de29bb..000000000
diff --git a/quda_gauge_paths.inc b/quda_gauge_paths.inc
deleted file mode 100644
index d2c898e6c..000000000
--- a/quda_gauge_paths.inc
+++ /dev/null
@@ -1,158 +0,0 @@
-/***********************************************************************
- *
- * Copyright (C) 2021 Bartosz Kostrzewa, Ferenc Pittler, Simone Bacchio
- *
- * This file is part of tmLQCD.
- *
- * tmLQCD is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * tmLQCD is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
- *
- *
- ***********************************************************************/
-
-const int plaq_rect_length[24] = {
-    3, 3, 3, 3, 3, 3,
-    5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5,
-  };
-
-const int plaq_rect_path[4][24][5] = {
-    { {1, 7, 6 },
-      {6, 7, 1 },
-      {2, 7, 5 },
-      {5, 7, 2 },
-      {3, 7, 4 },
-      {4, 7, 3 }, 
-      {1, 1, 7, 6, 6 },
-      {6, 6, 7, 1, 1 },
-      {2, 2, 7, 5, 5 },
-      {5, 5, 7, 2, 2 },
-      {3, 3, 7, 4, 4 },
-      {4, 4, 7, 3, 3 },
-      {0, 1, 7, 7, 6 },
-      {6, 7, 7, 1, 0 },
-      {0, 2, 7, 7, 5 },
-      {5, 7, 7, 2, 0 },
-      {0, 3, 7, 7, 4 },
-      {4, 7, 7, 3, 0 },
-      {0, 4, 7, 7, 3 },
-      {3, 7, 7, 4, 0 },
-      {0, 5, 7, 7, 2 },
-      {2, 7, 7, 5, 0 },
-      {0, 6, 7, 7, 1 },
-      {1, 7, 7, 6, 0 } },
-    { { 2, 6, 5 },
-      { 5, 6, 2 },
-      { 3, 6, 4 },
-      { 4, 6, 3 },
-      { 0, 6, 7 },
-      { 7, 6, 0 },
-      { 1, 2, 6, 6, 5 },
-      { 2, 6, 6, 5, 1 },
-      { 5, 6, 6, 2, 1 },
-      { 1, 5, 6, 6, 2 },
-      { 1, 3, 6, 6, 4 },
-      { 3, 6, 6, 4, 1 },
-      { 4, 6, 6, 3, 1 },
-      { 1, 4, 6, 6, 3 },
-      { 1, 0, 6, 6, 7 },
-      { 0, 6, 6, 7, 1 },
-      { 7, 6, 6, 0, 1 },
-      { 1, 7, 6, 6, 0 },
-      { 5, 5, 6, 2, 2 },
-      { 2, 2, 6, 5, 5 },
-      { 4, 4, 6, 3, 3 },
-      { 3, 3, 6, 4, 4 },
-      { 7, 7, 6, 0, 0 },
-      { 0, 0, 6, 7, 7 } },
-    { {3, 5, 4},
-      {4, 5, 3},
-      {0, 5, 7},
-      {7, 5, 0},
-      {1, 5, 6},
-      {6, 5, 1},
-      {2, 3, 5, 5, 4},
-      {3, 5, 5, 4, 2}, 
-      {4, 5, 5, 3, 2}, 
-      {2, 4, 5, 5, 3}, 
-      {2, 0, 5, 5, 7}, 
-      {0, 5, 5, 7, 2}, 
-      {7, 5, 5, 0, 2}, 
-      {2, 7, 5, 5, 0},
-      {2, 1, 5, 5, 6}, 
-      {1, 5, 5, 6, 2}, 
-      {6, 5, 5, 1, 2}, 
-      {2, 6, 5, 5, 1}, 
-      {4, 4, 5, 3, 3}, 
-      {3, 3, 5, 4, 4}, 
-      {7, 7, 5, 0, 0},
-      {0, 0, 5, 7, 7}, 
-      {6, 6, 5, 1, 1}, 
-      {1, 1, 5, 6, 6} }, 
-    { { 0, 4, 7 },
-      { 7, 4, 0 },
-      { 1, 4, 6 },
-      { 6, 4, 1 },
-      { 2, 4, 5 },
-      { 5, 4, 2 },
-      { 3, 0, 4, 4, 7 },
-      { 0, 4, 4, 7, 3 },
-      { 7, 4, 4, 0, 3 },
-      { 3, 7, 4, 4, 0 },
-      { 3, 1, 4, 4, 6 },
-      { 1, 4, 4, 6, 3 },
-      { 6, 4, 4, 1, 3 },
-      { 3, 6, 4, 4, 1 },
-      { 3, 2, 4, 4, 5 },
-      { 2, 4, 4, 5, 3 },
-      { 5, 4, 4, 2, 3 },
-      { 3, 5, 4, 4, 2 },
-      { 7, 7, 4, 0, 0 },
-      { 0, 0, 4, 7, 7 },
-      { 6, 6, 4, 1, 1 },
-      { 1, 1, 4, 6, 6 },
-      { 5, 5, 4, 2, 2 },
-      { 2, 2, 4, 5, 5 } } 
-  };
-
-const int plaq_length[] = {
-    3, 3, 3, 3, 3, 3 };
-
-const int plaq_path[4][6][3] = {
-    { { 1, 7, 6 },
-      { 6, 7, 1 },
-      { 2, 7, 5 },
-      { 5, 7, 2 },
-      { 3, 7, 4 },
-      { 4, 7, 3 } },
-    { { 2, 6, 5 },
-      { 5, 6, 2 },
-      { 3, 6, 4 },
-      { 4, 6, 3 },
-      { 0, 6, 7 },
-      { 7, 6, 0 } },
-    { { 3, 5, 4},
-      { 4, 5, 3},
-      { 0, 5, 7},
-      { 7, 5, 0},
-      { 1, 5, 6},
-      { 6, 5, 1} },
-    { { 0, 4, 7 },
-      { 7, 4, 0 },
-      { 1, 4, 6 },
-      { 6, 4, 1 },
-      { 2, 4, 5 },
-      { 5, 4, 2 } } 
-  };
-

From 065d6d0abfe0eeebba099623a8268e9482fec785 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Feb 2026 09:58:58 +0100
Subject: [PATCH 08/19] Add alignment detection at configuration time

---
 CMakeLists.txt                     |  10 +-
 cmake/DetectSimdAndAlignment.cmake | 288 +++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+), 3 deletions(-)
 create mode 100644 cmake/DetectSimdAndAlignment.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d363e407c..82880ef60 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -146,21 +146,25 @@ set(ALIGN32 " ")
 
 message("${TM_ENABLE_ALIGNMENT}")
 if(${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
+  include(cmake/DetectSimdAndAlignment.cmake)
+  message(STATUS "SIMD: ${SIMD_LEVEL} (${SIMD_ARCH_FAMILY}), align=${SIMD_ALIGNMENT}")
+endif()
+if (${TM_ENABLE_ALIGNMENT} STREQUAL "none")
   set(ALIGN_BASE "0x00")
   set(ALIGN " ")
   set(ALIGN_BASE32 "0x00")
   set(ALIGN32 " ")
-elseif(TM_ENABLE_ALIGNMENT EQUAL 16)
+elseif(("${TM_ENABLE_ALIGNMENT}" STREQUAL "16") OR (("${TM_ENABLE_ALIGNMENT}" STREQUAL "auto") AND ("${SIMD_ALIGNMENT}" EQUAL 16))) # SIMD_ALIGNMENT only counts in "auto" mode
   set(ALIGN_BASE "0x0F")
   set(ALIGN "__attribute__ ((aligned (16)))")
   set(ALIGN_BASE32 "0x0F")
   set(ALIGN32 "__attribute__ ((aligned (16)))")
-elseif(TM_ENABLE_ALIGNMENT EQUAL 32)
+elseif(("${TM_ENABLE_ALIGNMENT}" STREQUAL "32") OR (("${TM_ENABLE_ALIGNMENT}" STREQUAL "auto") AND ("${SIMD_ALIGNMENT}" EQUAL 32)))
   set(ALIGN_BASE "0x2F")
   set(ALIGN "__attribute__ ((aligned (32)))")
   set(ALIGN_BASE32 "0x2F")
   set(ALIGN32 "__attribute__ ((aligned (32)))")
-elseif(TM_ENABLE_ALIGNMENT EQUAL 64)
+elseif(("${TM_ENABLE_ALIGNMENT}" STREQUAL "64") OR (("${TM_ENABLE_ALIGNMENT}" STREQUAL "auto") AND ("${SIMD_ALIGNMENT}" EQUAL 64)))
   set(ALIGN_BASE "0x3F")
   set(ALIGN "__attribute__ ((aligned (64)))")
   set(ALIGN_BASE32 "0x3F")
diff --git a/cmake/DetectSimdAndAlignment.cmake b/cmake/DetectSimdAndAlignment.cmake
new file mode 100644
index 000000000..707b9b65b
--- /dev/null
+++ b/cmake/DetectSimdAndAlignment.cmake
@@ -0,0 +1,288 @@
+# DetectSimdAndAlignment.cmake
+#
+# Detect SIMD architecture family, SIMD level and a reasonable alignment value.
+#
+# Exposed cache variables:
+#   SIMD_ARCH_FAMILY : x86 / ARM / PPC / NVIDIA / UNKNOWN
+#   SIMD_LEVEL       : AVX512 / AVX2 / SSE2 / SVE / NEON_* / POWER10 / VSX_* / ALTIVEC / CUDA_SM89 / SCALAR
+#   SIMD_ALIGNMENT   : integer, in bytes (16, 32, 64, ...)
+#   SIMD_HAS_FLOAT / SIMD_HAS_DOUBLE : ON/OFF, float / double SIMD support
+# Optional (if you want a configured header):
+#   SIMD_CONFIG_HEADER : path to the generated header (not generated by this module yet).
+#
+# Usage:
+#   include(cmake/DetectSimdAndAlignment.cmake)
+#   message(STATUS "SIMD: ${SIMD_ARCH_FAMILY} ${SIMD_LEVEL}, alignment=${SIMD_ALIGNMENT}")
+#
+#   # Example: propagate as defines
+#   target_compile_definitions(my_target PRIVATE
+#       SIMD_ALIGNMENT=${SIMD_ALIGNMENT}
+#       SIMD_LEVEL_${SIMD_LEVEL}
+#   )
+# DetectSimdAndAlignment.cmake - COMPLETE: x86 + ARM NEON + NVIDIA + PowerPC
+
+
+include_guard(GLOBAL) #
+
+include(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns) # For runtime CPU detection fallback
+
+# ------------------------------
+# 1. Detect architecture family
+# ------------------------------
+if(NOT DEFINED SIMD_ARCH_FAMILY)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _simd_proc)
+
+    if(_simd_proc MATCHES "x86_64|amd64|i[3-6]86")
+        set(_detected_arch "x86")
+    elseif(_simd_proc MATCHES "armv[0-9]+|aarch64|arm64")
+        set(_detected_arch "ARM")
+    elseif(_simd_proc MATCHES "ppc64(le|el)?|powerpc|ppc")
+        set(_detected_arch "PPC")
+    elseif(_simd_proc MATCHES "nvcl|sm_89|sm_90")
+        set(_detected_arch "NVIDIA")
+    else()
+        set(_detected_arch "UNKNOWN")
+    endif()
+
+    set(SIMD_ARCH_FAMILY "${_detected_arch}" CACHE STRING "SIMD architecture family")
+endif()
+
+# Defaults
+set(SIMD_LEVEL "SCALAR" CACHE STRING "Detected SIMD level")
+set(SIMD_ALIGNMENT 16 CACHE STRING "Alignment in bytes")
+set(SIMD_HAS_FLOAT ON CACHE BOOL "Float SIMD support")
+set(SIMD_HAS_DOUBLE ON CACHE BOOL "Double SIMD support")
+
+# Save/restore flags helper
+set(_SIMD_SAVED_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+macro(_simd_restore_flags)
+    if(DEFINED _SIMD_SAVED_REQUIRED_FLAGS)
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS}")
+    endif()
+endmacro()
+
+# ------------------------------------------------
+# 2. x86: SSE2 → AVX2 → AVX512
+# ------------------------------------------------
+if(SIMD_ARCH_FAMILY STREQUAL "x86")
+    # AVX512 double (64-byte)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mavx512f -mavx512dq")
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() { __m512d v = _mm512_set1_pd(1.0); (void)v; return 0; }
+    " _HAVE_AVX512_DOUBLE)
+
+    if(_HAVE_AVX512_DOUBLE)
+        set(SIMD_LEVEL "AVX512" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 64 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # AVX2 double (32-byte)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mavx2")
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() { __m256d v = _mm256_set1_pd(1.0); (void)v; return 0; }
+    " _HAVE_AVX2_DOUBLE)
+
+    if(_HAVE_AVX2_DOUBLE)
+        set(SIMD_LEVEL "AVX2" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 32 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # SSE2 double minimum (16-byte)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -msse2")
+    check_cxx_source_compiles("
+        #include <emmintrin.h>
+        int main() { __m128d v = _mm_set1_pd(1.0); (void)v; return 0; }
+    " _HAVE_SSE2_DOUBLE)
+
+    if(_HAVE_SSE2_DOUBLE)
+        set(SIMD_LEVEL "SSE2" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+# --------------------------------------
+# 3. ARM NEON - ALL FAMILIES
+# --------------------------------------
+elseif(SIMD_ARCH_FAMILY STREQUAL "ARM")
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _arm_proc)
+
+    # AArch64 + SVE
+    if(_arm_proc MATCHES "aarch64|arm64")
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -march=armv8-a+sve")
+        check_cxx_source_compiles("
+            #include <arm_sve.h>
+            int main() { svfloat32_t v = svdup_f32(1.0f); (void)v; return 0; }
+        " _HAVE_SVE)
+
+        if(_HAVE_SVE)
+            set(SIMD_LEVEL "SVE" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+
+        # AArch64 NEON (double safe): drop the +sve flag left by the failed SVE probe
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS}")
+        check_cxx_source_compiles("
+            #include <arm_neon.h>
+            int main() {
+                float64x2_t vd = vdupq_n_f64(1.0);
+                float32x4_t vf = vdupq_n_f32(1.0f);
+                (void)vd; (void)vf; return 0; }" _HAVE_NEON_AARCH64)
+
+        if(_HAVE_NEON_AARCH64)
+            set(SIMD_LEVEL "NEON_AARCH64" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+
+    # ARMv8 32-bit
+    elseif(_arm_proc MATCHES "armv8")
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -march=armv8-a+simd")
+        check_cxx_source_compiles("
+            #include <arm_neon.h>
+            int main() { float32x4_t v = vdupq_n_f32(1.0f); (void)v; return 0; }
+        " _HAVE_ARMv8_NEON)
+
+        if(_HAVE_ARMv8_NEON)
+            set(SIMD_LEVEL "NEON_ARMv8" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            set(SIMD_HAS_DOUBLE OFF CACHE BOOL "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+
+    # ARMv7 NEON
+    elseif(_arm_proc MATCHES "armv7")
+        set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mfpu=neon -march=armv7-a")
+        check_cxx_source_compiles("
+            #include <arm_neon.h>
+            int main() { float32x4_t v = vdupq_n_f32(1.0f); (void)v; return 0; }
+        " _HAVE_ARMv7_NEON)
+
+        if(_HAVE_ARMv7_NEON)
+            set(SIMD_LEVEL "NEON_ARMv7" CACHE STRING "" FORCE)
+            set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+            set(SIMD_HAS_DOUBLE OFF CACHE BOOL "" FORCE)
+            _simd_restore_flags()
+            return()
+        endif()
+    endif()
+
+# --------------------------------------
+# 4. POWERPC - COMPLETE COVERAGE (NEW!)
+# --------------------------------------
+elseif(SIMD_ARCH_FAMILY STREQUAL "PPC")
+
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _ppc_proc)
+
+    # === Power10 (VSX registers are 128-bit: 2 doubles / 4 floats per vector)
+    # Note: Power10 needs -mcpu=power10 or -mtune=power10
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mcpu=power10")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector double vd = {1.0,1.0};            // 128-bit: 2 doubles
+            vector float vf = {1.0f,1.0f,1.0f,1.0f}; // 128-bit: 4 floats
+            (void)vd; (void)vf; return 0;
+        }" _HAVE_POWER10)
+
+    if(_HAVE_POWER10)
+        set(SIMD_LEVEL "POWER10" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)  # 128-bit = 16 bytes
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE ON CACHE BOOL "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # === Power9 VSX (128-bit registers, probed with -mcpu=power9)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mcpu=power9 -mvsx")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector double vd = {1.0,1.0};            // 128-bit: 2 doubles
+            vector float vf = {1.0f,1.0f,1.0f,1.0f}; // 128-bit: 4 floats
+            (void)vd; (void)vf; return 0;
+        }" _HAVE_VSX_POWER9)
+
+    if(_HAVE_VSX_POWER9)
+        set(SIMD_LEVEL "VSX_POWER9" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)  # 128-bit = 16 bytes
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE ON CACHE BOOL "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # === Power7+ VSX (128-bit double, POWER7+)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -mcpu=power7 -mvsx")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector double vd = {1.0,1.0};  // VSX 128-bit double
+            (void)vd; return 0;
+        }" _HAVE_VSX_POWER7)
+
+    if(_HAVE_VSX_POWER7)
+        set(SIMD_LEVEL "VSX_POWER7" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE ON CACHE BOOL "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+    # === Classic AltiVec/VMX (PowerPC baseline, 128-bit)
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} -maltivec -mabi=altivec")
+    check_cxx_source_compiles("
+        #include <altivec.h>
+        int main() {
+            vector float vf = (vector float){1.0f,1.0f,1.0f,1.0f};
+            (void)vf; return 0;
+        }" _HAVE_ALTIVEC)
+
+    if(_HAVE_ALTIVEC)
+        set(SIMD_LEVEL "ALTIVEC" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        set(SIMD_HAS_FLOAT ON CACHE BOOL "" FORCE)
+        set(SIMD_HAS_DOUBLE OFF CACHE BOOL "" FORCE)  # AltiVec: float primary
+        _simd_restore_flags()
+        return()
+    endif()
+
+# --------------------------------------
+# 5. NVIDIA GH200 (sm_89)
+# --------------------------------------
+elseif(SIMD_ARCH_FAMILY STREQUAL "NVIDIA")
+    set(CMAKE_REQUIRED_FLAGS "${_SIMD_SAVED_REQUIRED_FLAGS} --gpu-arch=sm_89")
+    check_cxx_source_compiles("
+        #include <cuda_runtime.h>
+        int main() { double d = 1.0; (void)d; return 0; }
+    " _HAVE_CUDA_SM89)
+
+    if(_HAVE_CUDA_SM89)
+        set(SIMD_LEVEL "CUDA_SM89" CACHE STRING "" FORCE)
+        set(SIMD_ALIGNMENT 16 CACHE STRING "" FORCE)
+        _simd_restore_flags()
+        return()
+    endif()
+
+# --------------------------------------
+# 6. Fallback
+# --------------------------------------
+else()
+    _simd_restore_flags()
+    return()
+endif()
+
+_simd_restore_flags()

From e8b187daba85b3924a28b1db632b81e836a4f4e5 Mon Sep 17 00:00:00 2001
From: Mathieu Taillefumier <mathieu.taillefumier@free.fr>
Date: Tue, 24 Feb 2026 17:02:41 +0100
Subject: [PATCH 09/19] Build DDalphaAMG automatically when DDalphaAMG is
 enabled

---
 CMakeLists.txt            |  62 ++++++++++--
 DDalphaAMG/CMakeLists.txt | 193 ++++++++++++++++++++++++++++++++++++++
 src/lib/CMakeLists.txt    |   6 +-
 3 files changed, 251 insertions(+), 10 deletions(-)
 create mode 100644 DDalphaAMG/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82880ef60..5e587a1a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
 # PROJECT AND VERSION
 include(CMakeDependentOption)
 include(GNUInstallDirs)
+include(FetchContent)
 
 cmake_policy(SET CMP0048 NEW)
 
@@ -100,7 +101,9 @@ option(TM_USE_SHMEM "Use shmem API" OFF)
 option(TM_USE_QUDA "Enable QUDA support" OFF)
 option(TM_ENABLE_WARNINGS "Enable all warnings" ON)
 option(TM_ENABLE_TESTS "Enable tests" OFF)
-set(TM_QPHIX_SOALEN "4" CACHE STRING "QPhiX specific parameter")
+set(TM_QPHIX_SOALEN
+    "4"
+    CACHE STRING "QPhiX specific parameter")
 # MPI dependent options
 cmake_dependent_option(
   TM_PERSISTENT_MPI "Use persistent MPI calls for halfspinor [default=no]" OFF
@@ -126,6 +129,44 @@ cmake_dependent_option(TM_USE_LEMON "Use the lemon io library" OFF "TM_USE_MPI"
 cmake_dependent_option(TM_USE_NVHPC "Enable Nvidia HPC toolkit" OFF
                        "TM_USE_CUDA" OFF)
 
+# DDalphaAMG specific options
+# Signature: cmake_dependent_option(<option> "<help>" <default> "<depends>" <force>)
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT "Enable paramount output support" ON
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_FGMRES_RESTEST "Enable GMRES test" ON
+                       "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_PROFILING "Enable profiling support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_TRACK_RES "Enable track res support"
+                       ON "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI
+  "Enable single allreduce Arnoldi support" OFF "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_COARSE_RES "Enable coarse res support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_SCHWARZ_RES "Enable Schwarz res support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_OMP "Enable OpenMP support" ON
+                       "TM_USE_DDalphaAMG;TM_USE_OMP" OFF)
+
+cmake_dependent_option(
+  DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS "Enable vector analysis support" OFF
+  "TM_USE_DDalphaAMG" OFF)
+
+cmake_dependent_option(DDalphaAMG_ENABLE_HDF5 "Enable HDF5 support" OFF
+                       "TM_USE_DDalphaAMG" OFF)
+
 # search for blas and lapack
 find_package(BLAS REQUIRED)
 #
@@ -144,12 +185,16 @@ set(ALIGN_BASE "0")
 set(ALIGN_BASE32 "0")
 set(ALIGN32 " ")
 
-message("${TM_ENABLE_ALIGNMENT}")
+# DO NOT MERGE the two if statements as otherwise the automatic alignment will
+# not be taken into account
+
 if(${TM_ENABLE_ALIGNMENT} STREQUAL "auto")
   include(cmake/DetectSimdAndAlignment.cmake)
-  message(STATUS "SIMD: ${SIMD_LEVEL} (${SIMD_ARCH_FAMILY}), align=${SIMD_ALIGNMENT}")
+  message(
+    STATUS "SIMD: ${SIMD_LEVEL} (${SIMD_ARCH_FAMILY}), align=${SIMD_ALIGNMENT}")
 endif()
-if (${TM_ENABLE_ALIGNMENT} STREQUAL "none")
+
+if(${TM_ENABLE_ALIGNMENT} STREQUAL "none")
   set(ALIGN_BASE "0x00")
   set(ALIGN " ")
   set(ALIGN_BASE32 "0x00")
@@ -255,7 +300,7 @@ if(TM_USE_FFTW)
 endif()
 
 if(TM_USE_DDalphaAMG)
-  find_package(DDalphaAMG REQUIRED)
+  add_subdirectory(DDalphaAMG)
 endif()
 
 if(TM_ENABLE_WARNINGS)
@@ -289,8 +334,11 @@ if(TM_USE_MPI)
   endif()
 endif()
 
-if (TM_USE_HALFSPINOR AND NOT TM_USE_GAUGE_COPY)
-  message(FATAL_ERROR "The TM_USE_GAUGE_COPY option should also be set to ON when TM_USE_HALFSPINOR is ON")
+if(TM_USE_HALFSPINOR AND NOT TM_USE_GAUGE_COPY)
+  message(
+    FATAL_ERROR
+      "The TM_USE_GAUGE_COPY option should also be set to ON when TM_USE_HALFSPINOR is ON"
+  )
 endif()
 # keep the autotool config.h header.
 configure_file("${PROJECT_SOURCE_DIR}/cmake/tmlqcd_config_internal.h.in"
diff --git a/DDalphaAMG/CMakeLists.txt b/DDalphaAMG/CMakeLists.txt
new file mode 100644
index 000000000..5b54acc24
--- /dev/null
+++ b/DDalphaAMG/CMakeLists.txt
@@ -0,0 +1,193 @@
+# Many custom directories are used below to circumvent the deletion of the
+# CMakeLists.txt contained in the DDalphaAMG directory. CMake will clone the
+# source code and build it with the default options used in the ci/cd. More
+# options are available in the main CMakeLists.txt.
+
+set(DDalphaAMG_SRC_DIR ${CMAKE_SOURCE_DIR}/DDalphaAMG/deps)
+
+FetchContent_Declare(
+  DDalphaAMG
+  GIT_REPOSITORY https://github.com/etmc/DDalphaAMG.git
+  SOURCE_DIR ${DDalphaAMG_SRC_DIR})
+
+FetchContent_MakeAvailable(DDalphaAMG)
+
+list(
+  APPEND
+  DDalphaAMG_SRC_GENERIC
+  interpolation_generic.c
+  gathering_generic.c
+  sse_interpolation_generic.c
+  coarse_oddeven_generic.c
+  operator_generic.c
+  oddeven_generic.c
+  linalg_generic.c
+  init_generic.c
+  vcycle_generic.c
+  dirac_generic.c
+  coarse_operator_generic.c
+  coarsening_generic.c
+  schwarz_generic.c
+  ghost_generic.c
+  vectorization_dirac_generic.c
+  linsolve_generic.c
+  sse_coarse_operator_generic.c
+  data_generic.c
+  setup_generic.c
+  sse_linalg_generic.c)
+
+list(
+  APPEND
+  DDalphaAMG_HEADER_GENERIC
+  interpolation_generic.h
+  gathering_generic.h
+  sse_interpolation_generic.h
+  coarse_oddeven_generic.h
+  operator_generic.h
+  oddeven_generic.h
+  linalg_generic.h
+  init_generic.h
+  vcycle_generic.h
+  dirac_generic.h
+  coarse_operator_generic.h
+  coarsening_generic.h
+  schwarz_generic.h
+  ghost_generic.h
+  vectorization_dirac_generic.h
+  linsolve_generic.h
+  sse_coarse_operator_generic.h
+  data_generic.h
+  setup_generic.h
+  sse_linalg_generic.h
+  main_pre_def_generic.h
+  main_post_def_generic.h)
+
+list(
+  APPEND
+  DDalphaAMG_SRC_GENERAL
+  ${DDalphaAMG_SRC_DIR}/src/preconditioner.c
+  ${DDalphaAMG_SRC_DIR}/src/threading.c
+  ${DDalphaAMG_SRC_DIR}/src/main.c
+  ${DDalphaAMG_SRC_DIR}/src/sse_dirac.c
+  ${DDalphaAMG_SRC_DIR}/src/var_table.c
+  ${DDalphaAMG_SRC_DIR}/src/data_layout.c
+  ${DDalphaAMG_SRC_DIR}/src/linsolve.c
+  ${DDalphaAMG_SRC_DIR}/src/ghost.c
+  ${DDalphaAMG_SRC_DIR}/src/top_level.c
+  ${DDalphaAMG_SRC_DIR}/src/dirac.c
+  ${DDalphaAMG_SRC_DIR}/src/linalg.c
+  ${DDalphaAMG_SRC_DIR}/src/init.c
+  ${DDalphaAMG_SRC_DIR}/src/DDalphaAMG_interface.c
+  ${DDalphaAMG_SRC_DIR}/src/lime_io.c
+  ${DDalphaAMG_SRC_DIR}/src/sse_linalg.c
+  ${DDalphaAMG_SRC_DIR}/src/solver_analysis.c
+  ${DDalphaAMG_SRC_DIR}/src/io.c)
+
+message(STATUS "${DDalphaAMG_SRC_GENERAL}")
+foreach(f IN LISTS DDalphaAMG_SRC_GENERIC)
+  string(REPLACE "_generic" "_float" f_float "${f}")
+  message(STATUS "${f_float}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    COMMAND
+      sed -f "${DDalphaAMG_SRC_DIR}/float.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+      > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+            "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/float.sed"
+    VERBATIM)
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done")
+  list(APPEND DDalphaAMG_SRC_SINGLE_DOUBLE ${f_float})
+
+  string(REPLACE "_generic" "_double" f_double "${f}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    COMMAND
+      sed -f "${DDalphaAMG_SRC_DIR}/double.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+      > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+            "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/double.sed"
+    VERBATIM)
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done")
+  list(APPEND DDalphaAMG_SRC_SINGLE_DOUBLE ${f_double})
+endforeach()
+
+# now parse the header
+foreach(f IN LISTS DDalphaAMG_HEADER_GENERIC)
+  string(REPLACE "_generic" "_float" f_float "${f}")
+  message(STATUS "${f_float}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    COMMAND
+      sed -f "${DDalphaAMG_SRC_DIR}/float.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+      > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+            "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/float.sed"
+    VERBATIM)
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_float}.sed-done")
+  list(APPEND DDalphaAMG_HEADER_SINGLE_DOUBLE ${f_float})
+
+  string(REPLACE "_generic" "_double" f_double "${f}")
+  add_custom_command(
+    OUTPUT "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    COMMAND
+      sed -f "${DDalphaAMG_SRC_DIR}/double.sed" "${DDalphaAMG_SRC_DIR}/src/${f}"
+      > "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}"
+    COMMAND ${CMAKE_COMMAND} -E touch
+            "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done"
+    DEPENDS "${DDalphaAMG_SRC_DIR}/src/${f}" "${DDalphaAMG_SRC_DIR}/double.sed"
+    VERBATIM)
+
+  list(APPEND SED_MARKERS "${CMAKE_BINARY_DIR}/DDalphaAMG/${f_double}.sed-done")
+  list(APPEND DDalphaAMG_HEADER_SINGLE_DOUBLE ${f_double})
+endforeach()
+
+foreach(outfile IN LISTS DDalphaAMG_SRC_SINGLE_DOUBLE
+                         DDalphaAMG_HEADER_SINGLE_DOUBLE)
+  set_source_files_properties("${CMAKE_BINARY_DIR}/DDalphaAMG/${outfile}"
+                              PROPERTIES GENERATED TRUE)
+endforeach()
+
+# message(STATUS "${DDalphaAMG_SRC_SINGLE_DOUBLE}")
+
+add_custom_target(run_sed ALL DEPENDS ${SED_MARKERS})
+
+add_library(DDalphaAMG ${DDalphaAMG_SRC_GENERAL}
+                       ${DDalphaAMG_SRC_SINGLE_DOUBLE})
+
+target_compile_options(DDalphaAMG
+                       PRIVATE "$<$<COMPILE_LANG_AND_ID:C,GNU>:-mavx2;-mfma>")
+
+add_dependencies(DDalphaAMG run_sed)
+
+target_link_libraries(
+  DDalphaAMG # hdf5::hdf5 is an imported target name (double colon), not a library file
+  PUBLIC MPI::MPI_C $<$<BOOL:${DDalphaAMG_ENABLE_HDF5}>:hdf5::hdf5>
+         $<$<BOOL:${DDalphaAMG_ENABLE_OMP}>:OpenMP::OpenMP_C> tmlqcd::clime)
+
+target_include_directories(
+  DDalphaAMG
+  PUBLIC $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/src>
+         $<BUILD_INTERFACE:${DDalphaAMG_SRC_DIR}/include>
+         $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/DDalphaAMG>)
+
+target_compile_definitions(
+  DDalphaAMG
+  PUBLIC
+    $<$<BOOL:${DDalphaAMG_ENABLE_PARAMOUNT_OUTPUT}>:PARAMOUNTOUTPUT>
+    $<$<BOOL:${DDalphaAMG_ENABLE_FGMRES_RESTEST}>:FGMRES_RESTEST>
+    $<$<BOOL:${DDalphaAMG_ENABLE_PROFILING}>:PROFILING>
+    $<$<BOOL:${DDalphaAMG_ENABLE_SINGLE_ALLREDUCE_ARNOLDI}>:SINGLE_ALLREDUCE_ARNOLDI>
+    $<$<BOOL:${DDalphaAMG_ENABLE_COARSE_RES}>:COARSE_RES>
+    $<$<BOOL:${DDalphaAMG_ENABLE_SCHWARZ_RES}>:SCHWARZ_RES>
+    $<$<BOOL:${DDalphaAMG_ENABLE_OMP}>:OPENMP>
+    $<$<BOOL:${DDalphaAMG_ENABLE_TRACK_RES}>:TRACK_RES>
+    $<$<BOOL:${DDalphaAMG_ENABLE_TESTVECTOR_ANALYSIS}>:TESTVECTOR_ANALYSIS>
+    $<$<BOOL:${DDalphaAMG_ENABLE_HDF5}>:HAVE_HDF5>
+    $<$<CONFIG:Debug>:DEBUG>
+    SSE)
+
+# add_library(tmlqcd::DDalphaAMG alias DDalphaAMG)
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index ebed35308..b70e7a80d 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -407,8 +407,8 @@ if(CMAKE_MAJOR_VERSION LESS 4)
   flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c
               COMPILE_FLAGS "-Ca -Ptmlqcd -i")
 else()
-  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c OPTIONS
-              "-Ca -Ptmlqcd -i")
+  flex_target(tmlqcd_input_read read_input.l ${CMAKE_BINARY_DIR}/read_input.c
+              OPTIONS "-Ca -Ptmlqcd -i")
 endif()
 
 # create a target library with namespacing because cmake does not know name
@@ -425,7 +425,7 @@ set_target_properties(hmc PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1)
 # define a library and add the dependencies
 target_link_libraries(
   hmc
-  PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:tmlqcd::DDalphaAMG>
+  PUBLIC $<$<BOOL:${TM_USE_DDalphaAMG}>:DDalphaAMG>
          $<$<BOOL:${TM_USE_QPHIX}>:tmlqcd::qphix>
          $<$<BOOL:${TM_USE_FFTW}>:tmlqcd::fftw3>
          $<$<BOOL:${TM_USE_QUDA}>:QUDA::quda>

From d6284c018d934abb9209bbf5210236aeafe60660 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Wed, 25 Feb 2026 14:50:17 +0100
Subject: [PATCH 10/19] first draft for a global application context

---
 src/lib/app_context/app_context.c | 59 +++++++++++++++++++++++
 src/lib/include/mpi.h             | 77 +++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 src/lib/app_context/app_context.c
 create mode 100644 src/lib/include/mpi.h

diff --git a/src/lib/app_context/app_context.c b/src/lib/app_context/app_context.c
new file mode 100644
index 000000000..00d5d94b2
--- /dev/null
+++ b/src/lib/app_context/app_context.c
@@ -0,0 +1,59 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2026 Roman Gruber
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * App context module
+ *
+ * Author: Roman Gruber
+ *         roman.gruber@unibe.ch
+ *
+ *******************************************************************************/
+
+#include <stdbool.h>
+#include <mpi.h>
+#include "fatal_error.h"
+
+/* File-local application context instance; its communicator starts as MPI_COMM_WORLD. */
+static AppContext app_instance = {
+    .mpi = {
+        .comm = MPI_COMM_WORLD // default communicator
+    }
+};
+/* Accessor: hands out a read-only pointer to the file-local application context. */
+const AppContext* app(void)
+{
+    const AppContext *instance = &app_instance;
+    return instance;
+}
+/* Store `comm` as the application communicator. Intended to run once:
+ * any later call is reported through fatal_error() before the store. */
+void app_context_init(const MPI_Comm comm)
+{
+    static bool already_initialized = false;
+    if (already_initialized) {
+        fatal_error("Application context already initialized", __func__);
+    }
+    app_instance.mpi.comm = comm;
+    already_initialized = true;
+}
+
+/* Finalize the application context. Currently a no-op. */
+void app_context_finalize(void)
+{
+    /* intentionally empty */
+}
diff --git a/src/lib/include/mpi.h b/src/lib/include/mpi.h
new file mode 100644
index 000000000..0d5b02993
--- /dev/null
+++ b/src/lib/include/mpi.h
@@ -0,0 +1,77 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2026 Roman Gruber
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Simple MPI header wrapper
+ *
+ * Author: Roman Gruber
+ *         roman.gruber@unibe.ch
+ *
+ *******************************************************************************/
+
+#ifndef MY_MPI_WRAPPER_H
+#define MY_MPI_WRAPPER_H
+
+// include *real* MPI header (this file shadows it; #include_next is a GNU extension)
+#include_next <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {  // app() is defined in C but also called from C++ (qphix_interface.cpp)
+#endif
+
+/**
+ * @brief      MPI context
+ *
+ * @var        comm MPI communicator
+ */
+typedef struct {
+    MPI_Comm comm;
+} MPIContext;
+
+/**
+ * @brief      The global application context struct
+ *
+ * @var        mpi MPI context
+ */
+typedef struct {
+    MPIContext mpi;
+} AppContext;
+
+/**
+ * @brief      Return the global application context struct
+ *
+ * @return     Global application context struct
+ */
+const AppContext* app(void);
+
+/**
+ * @brief      Initialize application context
+ *
+ * @param[in]  comm  The MPI communicator to use throughout the application
+ */
+void app_context_init(const MPI_Comm comm);
+
+/**
+ * @brief      Finalize application context
+ */
+void app_context_finalize(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // MY_MPI_WRAPPER_H

From 6822bb3e348052c68d39d2e1d674d569e8d1b700 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Wed, 25 Feb 2026 14:59:20 +0100
Subject: [PATCH 11/19] add app_context to cmake build

---
 src/lib/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index b70e7a80d..367f33d5f 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -68,6 +68,11 @@ list(
   # init/init_stout_smear_vars.c
   init/init_moment_field.c)
 
+list(
+  APPEND
+  APP_CONTEXT_SRC_C
+  app_context/app_context.c)
+
 list(
   APPEND
   SOLVER_SRC_C
@@ -390,6 +395,7 @@ list(
   ${LINALG_SRC_C}
   ${IO_SRC_C}
   ${INIT_SRC_C}
+  ${APP_CONTEXT_SRC_C}
   ${SOLVER_SRC_C}
   ${TEST_SRC_C}
   ${MEAS_SRC_C}

From a4250d97de84a6f07e9b303b832c5a7ca02274a9 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Wed, 25 Feb 2026 15:00:08 +0100
Subject: [PATCH 12/19] search/replace on whole codebase: MPI_COMM_WORLD ->
 app()->mpi.comm

---
 src/bin/benchmark.c                           |  28 +-
 src/bin/deriv_mg_tune.c                       |   2 +-
 src/bin/hmc_tm.c                              |   6 +-
 src/bin/invert.c                              |   2 +-
 src/bin/offline_measurement.c                 |   2 +-
 src/bin/tests/hopping_test.c                  |   2 +-
 src/bin/tests/qphix_test_Dslash.c             |  16 +-
 src/bin/tests/test_eigenvalues.c              |   8 +-
 src/lib/DDalphaAMG_interface.c                |   8 +-
 src/lib/compare_derivative.c                  |   4 +-
 src/lib/fatal_error.c                         |   2 +-
 src/lib/init/init_parallel.c                  |   2 +-
 src/lib/io/deri_write_stdout.c                |   2 +-
 src/lib/io/dml.c                              |   2 +-
 src/lib/io/eospinor_read.c                    |   2 +-
 src/lib/io/eospinor_write.c                   |   8 +-
 src/lib/io/gauge_read_binary.c                |   2 +-
 src/lib/io/gauge_write.c                      |   2 +-
 src/lib/io/gauge_write_binary.c               |   2 +-
 src/lib/io/io_cm.c                            |   2 +-
 src/lib/io/spinor_read_binary.c               |   4 +-
 src/lib/io/spinor_write_binary.c              |   4 +-
 src/lib/io/spinor_write_stdout.c              |   2 +-
 src/lib/io/sw_write_stdout.c                  |   2 +-
 src/lib/linalg/assign_mul_add_r_and_square.c  |   2 +-
 src/lib/linalg/diff_and_square_norm.c         |   2 +-
 src/lib/linalg/scalar_prod_body.c             |   4 +-
 src/lib/linalg/scalar_prod_i.c                |   2 +-
 src/lib/linalg/scalar_prod_r.c                |   2 +-
 src/lib/linalg/scalar_prod_r_32.c             |   2 +-
 src/lib/linalg/square_and_minmax.c            |  32 +-
 src/lib/linalg/square_and_prod_r.c            |   4 +-
 src/lib/linalg/square_norm.c                  |   4 +-
 src/lib/linalg/square_norm_32.c               |   4 +-
 src/lib/meas/correlators.c                    |   4 +-
 ...easure_clover_field_strength_observables.c |   4 +-
 src/lib/meas/oriented_plaquettes.c            |   2 +-
 src/lib/meas/pion_norm.c                      |   2 +-
 src/lib/meas/polyakov_loop.c                  |   4 +-
 src/lib/measure_gauge_action.c                |   4 +-
 src/lib/measure_rectangles.c                  |   2 +-
 src/lib/monomial/moment_energy.c              |   2 +-
 src/lib/monomial/monitor_forces.c             |   4 +-
 src/lib/mpi_init.c                            |  10 +-
 src/lib/operator/clover_det.c                 |   4 +-
 src/lib/prepare_source.c                      |   8 +-
 src/lib/qphix/qphix_interface.cpp             |   4 +-
 src/lib/quda_interface.c                      |  14 +-
 src/lib/reweighting_factor.c                  |   2 +-
 src/lib/sighandler.c                          |   2 +-
 src/lib/solver/dfl_projector.c                |   6 +-
 src/lib/solver/gcr4complex_body.c             |   6 +-
 src/lib/solver/little_project_eo_body.c       |   2 +-
 src/lib/start.c                               |  12 +-
 src/lib/test/check_xchange.c                  | 400 +++++++++---------
 src/lib/update_tm.c                           |   4 +-
 src/lib/wrapper/lib_wrapper.c                 |   4 +-
 tests/test_buffers.c                          |   2 +-
 58 files changed, 341 insertions(+), 341 deletions(-)

diff --git a/src/bin/benchmark.c b/src/bin/benchmark.c
index b2f4ee68c..f0b15ce24 100644
--- a/src/bin/benchmark.c
+++ b/src/bin/benchmark.c
@@ -102,7 +102,7 @@ int main(int argc, char *argv[]) {
 #else
   MPI_Init(&argc, &argv);
 #endif
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+  MPI_Comm_rank(app()->mpi.comm, &g_proc_id);
 
 #else
   g_proc_id = 0;
@@ -234,7 +234,7 @@ int main(int argc, char *argv[]) {
     antioptaway = 0.0;
     /* compute approximately how many applications we need to do to get a reliable measurement */
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
     t1 = gettime();
     for (j = 0; j < j_max; j++) {
@@ -248,14 +248,14 @@ int main(int argc, char *argv[]) {
     // division by g_nproc because we will average over processes
     j = (int)(ceil(j_max * 31.0 / dt / g_nproc));
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&j, &j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&j, &j_max, 1, MPI_INT, MPI_SUM, app()->mpi.comm);
 #else
     j_max = j;
 #endif
 
     /* perform the actual benchmark */
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
     t1 = gettime();
     antioptaway = 0.0;
@@ -268,14 +268,14 @@ int main(int argc, char *argv[]) {
     }
     dt = gettime() - t1;
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 #else
     sdt = dt;
 #endif
 
     qdt = dt * dt;
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 #else
     sqdt = qdt;
 #endif
@@ -321,9 +321,9 @@ int main(int argc, char *argv[]) {
     dt2 = t2 - t1;
     /* compute the bandwidth */
     dt = dts - dt2;
-    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     sdt = sdt / ((double)g_nproc);
-    MPI_Allreduce(&dt2, &dt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&dt2, &dt, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     dt = dt / ((double)g_nproc);
     dt = 1.0e6f * dt / ((double)(k_max * j_max * (VOLUME)));
     if (g_proc_id == 0) {
@@ -365,7 +365,7 @@ int main(int argc, char *argv[]) {
 
     /* estimate a reasonable number of applications to get a reliable measurement */
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
     t1 = gettime();
     for (j = 0; j < j_max; j++) {
@@ -379,14 +379,14 @@ int main(int argc, char *argv[]) {
     // division by g_nproc because we will average over processes using  MPI_SUM
     j = (int)(ceil(j_max * 31.0 / dt / g_nproc));
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&j, &j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&j, &j_max, 1, MPI_INT, MPI_SUM, app()->mpi.comm);
 #else
     j_max = j;
 #endif
 
     /* perform the actual measurement */
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
     t1 = gettime();
     for (j = 0; j < j_max; j++) {
@@ -398,13 +398,13 @@ int main(int argc, char *argv[]) {
     t2 = gettime();
     dt = t2 - t1;
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 #else
     sdt = dt;
 #endif
     qdt = dt * dt;
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 #else
     sqdt = qdt;
 #endif
@@ -451,7 +451,7 @@ int main(int argc, char *argv[]) {
   free_spinor_field();
   free_moment_field();
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   MPI_Finalize();
 #endif
   return (0);
diff --git a/src/bin/deriv_mg_tune.c b/src/bin/deriv_mg_tune.c
index 7c45524de..2f5563337 100644
--- a/src/bin/deriv_mg_tune.c
+++ b/src/bin/deriv_mg_tune.c
@@ -357,7 +357,7 @@ int main(int argc, char *argv[]) {
   _endQuda();
 #endif
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   MPI_Finalize();
 #endif
 
diff --git a/src/bin/hmc_tm.c b/src/bin/hmc_tm.c
index b68a5250f..2082358ee 100644
--- a/src/bin/hmc_tm.c
+++ b/src/bin/hmc_tm.c
@@ -479,7 +479,7 @@ int main(int argc, char *argv[]) {
 
           sleep(io_timeout);
 #ifdef TM_USE_MPI
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
 #endif
         }
       /* Now move .conf.tmp into place */
@@ -528,7 +528,7 @@ int main(int argc, char *argv[]) {
     }
 
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
     if (ix == 0 && g_proc_id == 0) {
       countfile = fopen("history_hmc_tm", "a");
@@ -581,7 +581,7 @@ int main(int argc, char *argv[]) {
   _endQuda();
 #endif
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   MPI_Finalize();
 #endif
 
diff --git a/src/bin/invert.c b/src/bin/invert.c
index b5040ba88..7e004188c 100644
--- a/src/bin/invert.c
+++ b/src/bin/invert.c
@@ -448,7 +448,7 @@ int main(int argc, char *argv[]) {
   _endQuda();
 #endif
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   MPI_Finalize();
 #endif
   return (0);
diff --git a/src/bin/offline_measurement.c b/src/bin/offline_measurement.c
index b6cbc13fa..c42d51958 100644
--- a/src/bin/offline_measurement.c
+++ b/src/bin/offline_measurement.c
@@ -297,7 +297,7 @@ int main(int argc, char *argv[]) {
   free(input_filename);
 
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   MPI_Finalize();
 #endif
   return (0);
diff --git a/src/bin/tests/hopping_test.c b/src/bin/tests/hopping_test.c
index 0e5ff03e7..a09c0e46f 100644
--- a/src/bin/tests/hopping_test.c
+++ b/src/bin/tests/hopping_test.c
@@ -318,7 +318,7 @@ int main(int argc, char *argv[]) {
     }
 
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     MPI_Finalize();
 #endif
   }
diff --git a/src/bin/tests/qphix_test_Dslash.c b/src/bin/tests/qphix_test_Dslash.c
index 41e2602a4..b998fe52f 100644
--- a/src/bin/tests/qphix_test_Dslash.c
+++ b/src/bin/tests/qphix_test_Dslash.c
@@ -221,7 +221,7 @@ int main(int argc, char* argv[]) {
                    source_location, 12345 /* seed */);
 
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
 
     tm_t1 = gettime();
@@ -231,7 +231,7 @@ int main(int argc, char* argv[]) {
     tm_t2 = gettime();
 
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 #endif
     q_t1 = gettime();
     Mfull_qphix(qphix_out_cb_spinors[0], qphix_out_cb_spinors[1], op->sr0, op->sr1, op->type);
@@ -297,7 +297,7 @@ int main(int argc, char* argv[]) {
   free_spinor_field();
   free_moment_field();
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   MPI_Finalize();
 #endif
   return (failed);
@@ -305,7 +305,7 @@ int main(int argc, char* argv[]) {
 
 double compare_spinors(spinor* s1, spinor* s2) {
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif
   int coords[4];
   int x, y, z, t, id = 0;
@@ -352,7 +352,7 @@ double compare_spinors(spinor* s1, spinor* s2) {
               }
             }
 #ifdef TM_USE_MPI
-            MPI_Barrier(MPI_COMM_WORLD);
+            MPI_Barrier(app()->mpi.comm);
 #endif
           }  // z
         }  // y
@@ -361,7 +361,7 @@ double compare_spinors(spinor* s1, spinor* s2) {
   }  // if( SourceInfo.type == SRC_TYPE_POINT )
 
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif
   if (g_proc_id == 0) {
     printf("\n");
@@ -376,7 +376,7 @@ double compare_spinors(spinor* s1, spinor* s2) {
   double squarenorm = diff_and_square_norm(s1, s2, VOLUME);
 
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif
   id = 0;
   for (int t_global = 0; t_global < g_nproc_t * T; t_global++) {
@@ -408,7 +408,7 @@ double compare_spinors(spinor* s1, spinor* s2) {
             }
           }
 #ifdef TM_USE_MPI
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
 #endif
         }  // z
       }  // y
diff --git a/src/bin/tests/test_eigenvalues.c b/src/bin/tests/test_eigenvalues.c
index c52d29cf8..6d162117e 100644
--- a/src/bin/tests/test_eigenvalues.c
+++ b/src/bin/tests/test_eigenvalues.c
@@ -390,14 +390,14 @@ int main(int argc, char *argv[]) {
       }
       rlxd_get(rlxd_state);
 #ifdef TM_USE_MPI
-      MPI_Send(&rlxd_state[0], 105, MPI_INT, 1, 99, MPI_COMM_WORLD);
-      MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_nproc - 1, 99, MPI_COMM_WORLD, &status);
+      MPI_Send(&rlxd_state[0], 105, MPI_INT, 1, 99, app()->mpi.comm);
+      MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_nproc - 1, 99, app()->mpi.comm, &status);
       rlxd_reset(rlxd_state);
 #endif
     }
 #ifdef TM_USE_MPI
     else {
-      MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_proc_id - 1, 99, MPI_COMM_WORLD, &status);
+      MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_proc_id - 1, 99, app()->mpi.comm, &status);
       rlxd_reset(rlxd_state);
       /* hot */
       if (startoption == 1) {
@@ -408,7 +408,7 @@ int main(int argc, char *argv[]) {
         k = 0;
       }
       rlxd_get(rlxd_state);
-      MPI_Send(&rlxd_state[0], 105, MPI_INT, k, 99, MPI_COMM_WORLD);
+      MPI_Send(&rlxd_state[0], 105, MPI_INT, k, 99, app()->mpi.comm);
     }
 #endif
 
diff --git a/src/lib/DDalphaAMG_interface.c b/src/lib/DDalphaAMG_interface.c
index bf2da4bef..dda8a826e 100644
--- a/src/lib/DDalphaAMG_interface.c
+++ b/src/lib/DDalphaAMG_interface.c
@@ -350,7 +350,7 @@ static int MG_pre_solve(su3 **gf) {
     MG_init();
     mg_initialized = 1;
     if (g_proc_id == 0) printf("TM_USE_DDalphaAMG initialized\n");
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
   }
 
   if (mg_update_gauge == 1) {
@@ -1243,7 +1243,7 @@ int MG_solver(spinor *const phi_new, spinor *const phi_old, const double precisi
     if (g_proc_id == 0) printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
     // TODO: handle abort
     DDalphaAMG_finalize();
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     MPI_Finalize();
     exit(1);
   }
@@ -1319,7 +1319,7 @@ int MG_solver_nd(spinor *const up_new, spinor *const dn_new, spinor *const up_ol
     if (g_proc_id == 0) printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
     // TODO: handle abort
     DDalphaAMG_finalize();
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     MPI_Finalize();
     exit(1);
   }
@@ -1393,7 +1393,7 @@ int MG_mms_solver_nd(spinor **const up_new, spinor **const dn_new, spinor *const
     if (g_proc_id == 0) printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
     // TODO: handle abort
     DDalphaAMG_finalize();
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     MPI_Finalize();
     exit(1);
   }
diff --git a/src/lib/compare_derivative.c b/src/lib/compare_derivative.c
index 6ac39f222..b7802481f 100644
--- a/src/lib/compare_derivative.c
+++ b/src/lib/compare_derivative.c
@@ -64,8 +64,8 @@ void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, const
 
   int red_n_diff = 0;
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Reduce(&n_diff, &red_n_diff, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
+  MPI_Reduce(&n_diff, &red_n_diff, 1, MPI_INT, MPI_MAX, 0, app()->mpi.comm);
 #else
   red_n_diff = n_diff;
 #endif
diff --git a/src/lib/fatal_error.c b/src/lib/fatal_error.c
index b4f5d5be9..8e00dfbe7 100644
--- a/src/lib/fatal_error.c
+++ b/src/lib/fatal_error.c
@@ -46,7 +46,7 @@ void fatal_error(char const *error, char const *function) {
   }
 
 #ifdef TM_USE_MPI
-  MPI_Abort(MPI_COMM_WORLD, 1);
+  MPI_Abort(app()->mpi.comm, 1);
   MPI_Finalize();
 #endif
 
diff --git a/src/lib/init/init_parallel.c b/src/lib/init/init_parallel.c
index 194404d7d..9dfdbb0c5 100644
--- a/src/lib/init/init_parallel.c
+++ b/src/lib/init/init_parallel.c
@@ -71,7 +71,7 @@ void init_parallel_and_read_input(int argc, char *argv[], const char input_filen
 #endif  // QPHIX_QMP_COMMS
 
 #if defined(TM_USE_MPI) || defined(QPHIX_QMP_COMMS)
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+  MPI_Comm_rank(app()->mpi.comm, &g_proc_id);
 #else
   g_proc_id = 0;
 #endif
diff --git a/src/lib/io/deri_write_stdout.c b/src/lib/io/deri_write_stdout.c
index 6b095d22e..db1c4f98e 100644
--- a/src/lib/io/deri_write_stdout.c
+++ b/src/lib/io/deri_write_stdout.c
@@ -66,7 +66,7 @@ void deri_write_stdout(su3adj** const df) {
             }
           }
 #ifdef TM_USE_MPI
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
 #endif
         }
       }
diff --git a/src/lib/io/dml.c b/src/lib/io/dml.c
index 2650e108d..0465f1206 100644
--- a/src/lib/io/dml.c
+++ b/src/lib/io/dml.c
@@ -31,7 +31,7 @@ int DML_global_xor(uint32_t *x) {
   int status;
 
   status =
-      MPI_Allreduce((void *)&work, (void *)&dest, 1, MPI_UNSIGNED_LONG, MPI_BXOR, MPI_COMM_WORLD);
+      MPI_Allreduce((void *)&work, (void *)&dest, 1, MPI_UNSIGNED_LONG, MPI_BXOR, app()->mpi.comm);
 
   if (status == MPI_SUCCESS) {
     *x = (uint32_t)dest;
diff --git a/src/lib/io/eospinor_read.c b/src/lib/io/eospinor_read.c
index 2767d8b13..d23623595 100644
--- a/src/lib/io/eospinor_read.c
+++ b/src/lib/io/eospinor_read.c
@@ -99,7 +99,7 @@ int read_eospinor(spinor *const s, char *filename) {
                   "LIME read error occured with status = %d while reading file %s!\n Aborting...\n",
                   status, filename);
 #ifdef TM_USE_MPI
-              MPI_Abort(MPI_COMM_WORLD, 1);
+              MPI_Abort(app()->mpi.comm, 1);
               MPI_Finalize();
 #endif
               exit(500);
diff --git a/src/lib/io/eospinor_write.c b/src/lib/io/eospinor_write.c
index 619e11ac8..52c3e688d 100644
--- a/src/lib/io/eospinor_write.c
+++ b/src/lib/io/eospinor_write.c
@@ -67,7 +67,7 @@ int write_eospinor(spinor* const s, char* filename, const double evalue, const d
     if (limewriter == (LimeWriter*)NULL) {
       fprintf(stderr, "LIME error in file %s for writing!\n Aboring...\n", filename);
 #ifdef TM_USE_MPI
-      MPI_Abort(MPI_COMM_WORLD, 1);
+      MPI_Abort(app()->mpi.comm, 1);
       MPI_Finalize();
 #endif
       exit(500);
@@ -78,7 +78,7 @@ int write_eospinor(spinor* const s, char* filename, const double evalue, const d
     if (status < 0) {
       fprintf(stderr, "LIME write header (xlf-info) error %d\n", status);
 #ifdef TM_USE_MPI
-      MPI_Abort(MPI_COMM_WORLD, 1);
+      MPI_Abort(app()->mpi.comm, 1);
       MPI_Finalize();
 #endif
       exit(500);
@@ -94,7 +94,7 @@ int write_eospinor(spinor* const s, char* filename, const double evalue, const d
     if (status < 0) {
       fprintf(stderr, "LIME write header (eospinor-binary-data) error %d\n", status);
 #ifdef TM_USE_MPI
-      MPI_Abort(MPI_COMM_WORLD, 1);
+      MPI_Abort(app()->mpi.comm, 1);
       MPI_Finalize();
 #endif
       exit(500);
@@ -137,7 +137,7 @@ int write_eospinor(spinor* const s, char* filename, const double evalue, const d
               if (status < 0) {
                 fprintf(stderr, "LIME write error %d\n", status);
 #ifdef TM_USE_MPI
-                MPI_Abort(MPI_COMM_WORLD, 1);
+                MPI_Abort(app()->mpi.comm, 1);
                 MPI_Finalize();
 #endif
                 exit(500);
diff --git a/src/lib/io/gauge_read_binary.c b/src/lib/io/gauge_read_binary.c
index 473e4d9c7..28cebd7f7 100644
--- a/src/lib/io/gauge_read_binary.c
+++ b/src/lib/io/gauge_read_binary.c
@@ -219,7 +219,7 @@ int read_binary_gauge_data(LimeReader *limereader, DML_Checksum *checksum, param
                 "LIME read error occurred with status = %d while reading in gauge_read_binary.c!\n",
                 status);
 #ifdef TM_USE_MPI
-            MPI_Abort(MPI_COMM_WORLD, 1);
+            MPI_Abort(app()->mpi.comm, 1);
             MPI_Finalize();
 #endif
             return (-2);
diff --git a/src/lib/io/gauge_write.c b/src/lib/io/gauge_write.c
index 43ff7fd83..20605bc8a 100644
--- a/src/lib/io/gauge_write.c
+++ b/src/lib/io/gauge_write.c
@@ -50,7 +50,7 @@ int write_gauge_field(char *filename, const int prec, paramsXlfInfo const *xlfIn
     fflush(stdout);
   }
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif /* MPI */
 
   destruct_writer(writer);
diff --git a/src/lib/io/gauge_write_binary.c b/src/lib/io/gauge_write_binary.c
index ad3c7882e..64f504264 100644
--- a/src/lib/io/gauge_write_binary.c
+++ b/src/lib/io/gauge_write_binary.c
@@ -222,7 +222,7 @@ int write_binary_gauge_data(LimeWriter* limewriter, const int prec, DML_Checksum
               fprintf(stderr, "id = %d, bytes = %lu, size = %d\n", g_cart_id, bytes,
                       (int)(4 * sizeof(su3) / 8));
 #ifdef TM_USE_MPI
-              MPI_Abort(MPI_COMM_WORLD, 1);
+              MPI_Abort(app()->mpi.comm, 1);
               MPI_Finalize();
 #endif
               exit(500);
diff --git a/src/lib/io/io_cm.c b/src/lib/io/io_cm.c
index e08fd37c6..2f4bfa5a4 100644
--- a/src/lib/io/io_cm.c
+++ b/src/lib/io/io_cm.c
@@ -104,7 +104,7 @@ int read_spinorfield_cm_swap_single(spinor *const s, spinor *const r, char *file
   if (ifs == (FILE *)NULL) {
     fprintf(stderr, "Could not open file %s\n Aborting...\n", filename);
 #ifdef TM_USE_MPI
-    MPI_Abort(MPI_COMM_WORLD, 1);
+    MPI_Abort(app()->mpi.comm, 1);
     MPI_Finalize();
 #endif
     exit(500);
diff --git a/src/lib/io/spinor_read_binary.c b/src/lib/io/spinor_read_binary.c
index 81607a700..fbf97c32c 100644
--- a/src/lib/io/spinor_read_binary.c
+++ b/src/lib/io/spinor_read_binary.c
@@ -198,7 +198,7 @@ int read_binary_spinor_data(spinor *const s, spinor *const r, LimeReader *limere
                     "spinor_read_binary.c!\n",
                     status);
 #ifdef TM_USE_MPI
-            MPI_Abort(MPI_COMM_WORLD, 1);
+            MPI_Abort(app()->mpi.comm, 1);
             MPI_Finalize();
 #endif
             return (-2);
@@ -376,7 +376,7 @@ int read_binary_spinor_data_l(spinor *const s, LimeReader *limereader, DML_Check
                     "spinor_read_binary.c!\n",
                     status);
 #ifdef TM_USE_MPI
-            MPI_Abort(MPI_COMM_WORLD, 1);
+            MPI_Abort(app()->mpi.comm, 1);
             MPI_Finalize();
 #endif
             return (-2);
diff --git a/src/lib/io/spinor_write_binary.c b/src/lib/io/spinor_write_binary.c
index 560b5ce65..b8718cc9e 100644
--- a/src/lib/io/spinor_write_binary.c
+++ b/src/lib/io/spinor_write_binary.c
@@ -217,7 +217,7 @@ int write_binary_spinor_data(spinor *const s, spinor *const r, LimeWriter *limew
                       "write_binary_spinor_data (spinor_write_binary.c)!\n",
                       status);
 #ifdef TM_USE_MPI
-              MPI_Abort(MPI_COMM_WORLD, 1);
+              MPI_Abort(app()->mpi.comm, 1);
               MPI_Finalize();
 #endif
               exit(500);
@@ -458,7 +458,7 @@ int write_binary_spinor_data_l(spinor *const s, LimeWriter *limewriter, DML_Chec
                       "write_binary_spinor_data_l (spinor_write_binary.c)!\n",
                       status);
 #ifdef TM_USE_MPI
-              MPI_Abort(MPI_COMM_WORLD, 1);
+              MPI_Abort(app()->mpi.comm, 1);
               MPI_Finalize();
 #endif
               exit(500);
diff --git a/src/lib/io/spinor_write_stdout.c b/src/lib/io/spinor_write_stdout.c
index 9c13abf47..6c1fa1034 100644
--- a/src/lib/io/spinor_write_stdout.c
+++ b/src/lib/io/spinor_write_stdout.c
@@ -57,7 +57,7 @@ void spinor_write_stdout(spinor* const s) {
             fflush(stdout);
           }
 #ifdef TM_USE_MPI
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
 #endif
         }
       }
diff --git a/src/lib/io/sw_write_stdout.c b/src/lib/io/sw_write_stdout.c
index 027edd172..b2cec0b29 100644
--- a/src/lib/io/sw_write_stdout.c
+++ b/src/lib/io/sw_write_stdout.c
@@ -66,7 +66,7 @@ void sw_write_stdout(su3** u) {
             }
           }
 #ifdef TM_USE_MPI
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
 #endif
         }
       }
diff --git a/src/lib/linalg/assign_mul_add_r_and_square.c b/src/lib/linalg/assign_mul_add_r_and_square.c
index f9fb60f72..fc45c313f 100644
--- a/src/lib/linalg/assign_mul_add_r_and_square.c
+++ b/src/lib/linalg/assign_mul_add_r_and_square.c
@@ -99,7 +99,7 @@ double assign_mul_add_r_and_square(spinor *const R, const double c, const spinor
 
 #ifdef TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     return (mres);
   }
 #endif
diff --git a/src/lib/linalg/diff_and_square_norm.c b/src/lib/linalg/diff_and_square_norm.c
index 01a134fc8..0b7d6612a 100644
--- a/src/lib/linalg/diff_and_square_norm.c
+++ b/src/lib/linalg/diff_and_square_norm.c
@@ -72,7 +72,7 @@ double diff_and_square_norm(spinor *const Q, spinor *const R, const int N) {
   }
   kc = ks + kc;
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   return ks;
 #else
   return kc;
diff --git a/src/lib/linalg/scalar_prod_body.c b/src/lib/linalg/scalar_prod_body.c
index 9a38cf5fb..3114d6119 100644
--- a/src/lib/linalg/scalar_prod_body.c
+++ b/src/lib/linalg/scalar_prod_body.c
@@ -71,7 +71,7 @@ _Complex double _PSWITCH(scalar_prod)(const _PTSWITCH(spinor) *const S,
 
 #ifdef TM_USE_MPI
   if (parallel == 1) {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, app()->mpi.comm);
     return (mres);
   }
 #endif
@@ -113,7 +113,7 @@ _Complex double _PSWITCH(scalar_prod_ts)(const _PTSWITCH(spinor) *const S,
 
 #ifdef TM_USE_MPI
   if (parallel == 1) {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, app()->mpi.comm);
     return (mres);
   }
 #endif
diff --git a/src/lib/linalg/scalar_prod_i.c b/src/lib/linalg/scalar_prod_i.c
index ca59c65d3..ca1d378b7 100644
--- a/src/lib/linalg/scalar_prod_i.c
+++ b/src/lib/linalg/scalar_prod_i.c
@@ -64,7 +64,7 @@ double scalar_prod_i(spinor *const S, spinor *const R, const int N, const int pa
 
 #if defined TM_USE_MPI
   if (parallel == 1) {
-    MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     kc = ks;
   }
 #endif
diff --git a/src/lib/linalg/scalar_prod_r.c b/src/lib/linalg/scalar_prod_r.c
index c5288aa34..8e1c07af1 100644
--- a/src/lib/linalg/scalar_prod_r.c
+++ b/src/lib/linalg/scalar_prod_r.c
@@ -91,7 +91,7 @@ double scalar_prod_r(const spinor *const S, const spinor *const R, const int N,
 
 #if defined TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     return mres;
   }
 #endif
diff --git a/src/lib/linalg/scalar_prod_r_32.c b/src/lib/linalg/scalar_prod_r_32.c
index 5bc512806..23f673ee2 100644
--- a/src/lib/linalg/scalar_prod_r_32.c
+++ b/src/lib/linalg/scalar_prod_r_32.c
@@ -64,7 +64,7 @@ float scalar_prod_r_32(const spinor32 *const S, const spinor32 *const R, const i
 
 #if defined TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, app()->mpi.comm);
     return mres;
   }
 #endif
diff --git a/src/lib/linalg/square_and_minmax.c b/src/lib/linalg/square_and_minmax.c
index 3a97c1ede..6432d2602 100644
--- a/src/lib/linalg/square_and_minmax.c
+++ b/src/lib/linalg/square_and_minmax.c
@@ -75,12 +75,12 @@ void square_and_minmax(double *const sum, double *const min, double *const max,
 
 #if defined TM_USE_MPI
 
-  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 
-  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, app()->mpi.comm);
   *min = kc;
 
-  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, app()->mpi.comm);
   *max = kc;
 
 #endif
@@ -129,12 +129,12 @@ void square_and_minmax_rel(double *const sum, double *const min, double *const m
 
 #if defined TM_USE_MPI
 
-  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 
-  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, app()->mpi.comm);
   *min = kc;
 
-  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, app()->mpi.comm);
   *max = kc;
 
 #endif
@@ -256,18 +256,18 @@ void square_and_minmax_abs(double *const sum, double *const min, double *const m
 
 #if defined TM_USE_MPI
 
-  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 
-  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, app()->mpi.comm);
   *min = kc;
 
-  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, app()->mpi.comm);
   *max = kc;
 
-  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, app()->mpi.comm);
   *min_abs = kc;
 
-  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, app()->mpi.comm);
   *max_abs = kc;
 
 #endif
@@ -428,18 +428,18 @@ void square_and_minmax_rel_abs(double *const sum, double *const min, double *con
 
 #if defined TM_USE_MPI
 
-  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 
-  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, app()->mpi.comm);
   *min = kc;
 
-  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, app()->mpi.comm);
   *max = kc;
 
-  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, app()->mpi.comm);
   *min_abs = kc;
 
-  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, app()->mpi.comm);
   *max_abs = kc;
 
 #endif
diff --git a/src/lib/linalg/square_and_prod_r.c b/src/lib/linalg/square_and_prod_r.c
index 212b46593..b02babf42 100644
--- a/src/lib/linalg/square_and_prod_r.c
+++ b/src/lib/linalg/square_and_prod_r.c
@@ -80,7 +80,7 @@ void square_and_prod_r(double *const x1, double *const x2, spinor *const S, spin
 
 #if defined TM_USE_MPI
 
-  MPI_Allreduce(&xkc, x1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&xkc, x1, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 
 #endif
   kc = ks + kc;
@@ -88,7 +88,7 @@ void square_and_prod_r(double *const x1, double *const x2, spinor *const S, spin
 
 #if defined TM_USE_MPI
 
-  MPI_Allreduce(&kc, x2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, x2, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 
 #endif
 }
diff --git a/src/lib/linalg/square_norm.c b/src/lib/linalg/square_norm.c
index fd4b77781..7c871f41b 100644
--- a/src/lib/linalg/square_norm.c
+++ b/src/lib/linalg/square_norm.c
@@ -91,7 +91,7 @@ double square_norm(const spinor *const P, const int N, const int parallel) {
 
 #ifdef TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     return mres;
   }
 #endif
@@ -142,7 +142,7 @@ double square_norm_ts(const spinor *const P, const int N, const int parallel) {
 
 #ifdef TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
     return mres;
   }
 #endif
diff --git a/src/lib/linalg/square_norm_32.c b/src/lib/linalg/square_norm_32.c
index b331620cd..53207f454 100644
--- a/src/lib/linalg/square_norm_32.c
+++ b/src/lib/linalg/square_norm_32.c
@@ -65,7 +65,7 @@ float square_norm_32(const spinor32 *const P, const int N, const int parallel) {
 
 #ifdef TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, app()->mpi.comm);
     return mres;
   }
 #endif
@@ -113,7 +113,7 @@ float square_norm_ts_32(const spinor32 *const P, const int N, const int parallel
 
 #ifdef TM_USE_MPI
   if (parallel) {
-    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, app()->mpi.comm);
     return mres;
   }
 #endif
diff --git a/src/lib/meas/correlators.c b/src/lib/meas/correlators.c
index 36c8d2776..ed50eba3f 100644
--- a/src/lib/meas/correlators.c
+++ b/src/lib/meas/correlators.c
@@ -125,7 +125,7 @@ void correlators_measurement(const int traj, const int id, const int ieo) {
         t0 = (int)(measurement_list[id].max_source_slice * tmp);
       }
 #ifdef TM_USE_MPI
-      MPI_Bcast(&t0, 1, MPI_INT, 0, MPI_COMM_WORLD);
+      MPI_Bcast(&t0, 1, MPI_INT, 0, app()->mpi.comm);
 #endif
       if (g_debug_level > 1 && g_proc_id == 0) {
         printf("# timeslice set to %d (T=%d) for online measurement\n", t0, g_nproc_t * T);
@@ -252,7 +252,7 @@ void correlators_measurement(const int traj, const int id, const int ieo) {
       free(sCpp);
       free(sCpa);
       free(sCp4);
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
 #else
       free(Cpp);
       free(Cpa);
diff --git a/src/lib/meas/measure_clover_field_strength_observables.c b/src/lib/meas/measure_clover_field_strength_observables.c
index 3d33ae999..b95a2390a 100644
--- a/src/lib/meas/measure_clover_field_strength_observables.c
+++ b/src/lib/meas/measure_clover_field_strength_observables.c
@@ -210,9 +210,9 @@ void measure_clover_field_strength_observables(const su3 **const gf,
 #endif
 
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&Eres, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&Eres, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   Eres = mres;
-  MPI_Allreduce(&Qres, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&Qres, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   Qres = mres;
 #endif
   fso->E = energy_density_normalization * Eres;
diff --git a/src/lib/meas/oriented_plaquettes.c b/src/lib/meas/oriented_plaquettes.c
index 51e73acf2..1fe9207fd 100644
--- a/src/lib/meas/oriented_plaquettes.c
+++ b/src/lib/meas/oriented_plaquettes.c
@@ -80,7 +80,7 @@ void measure_oriented_plaquettes(const su3 **const gf, double *plaq) {
   }
 
 #ifdef TM_USE_MPI
-  MPI_Allreduce(plaq, mplaq, 6, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(plaq, mplaq, 6, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   for (int j = 0; j < 6; j++) plaq[j] = mplaq[j];
 #endif
   tm_stopwatch_pop(&g_timers, 0, 2, "");
diff --git a/src/lib/meas/pion_norm.c b/src/lib/meas/pion_norm.c
index 1ee756145..38d13320b 100644
--- a/src/lib/meas/pion_norm.c
+++ b/src/lib/meas/pion_norm.c
@@ -68,7 +68,7 @@ void pion_norm_measurement(const int traj, const int id, const int ieo) {
   ranlxs(&tmp, 1);
   z0 = (int)(measurement_list[id].max_source_slice * tmp);
 #ifdef TM_USE_MPI
-  MPI_Bcast(&z0, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&z0, 1, MPI_INT, 0, app()->mpi.comm);
 #endif
 
   Cpp = (double *)calloc(g_nproc_z * LZ, sizeof(double));
diff --git a/src/lib/meas/polyakov_loop.c b/src/lib/meas/polyakov_loop.c
index 25deea402..14d7c9ba5 100644
--- a/src/lib/meas/polyakov_loop.c
+++ b/src/lib/meas/polyakov_loop.c
@@ -66,7 +66,7 @@ void polyakov_loop(_Complex double *pl_, const int mu) {
     fprintf(stderr, "Only direction %d and %d are allowed.\n", 2, 3);
     fprintf(stderr, "Actual value is %d! Aborting...\n", mu);
 #ifdef TM_USE_MPI
-    MPI_Abort(MPI_COMM_WORLD, 10);
+    MPI_Abort(app()->mpi.comm, 10);
     MPI_Finalize();
 #endif
     exit(0);
@@ -137,7 +137,7 @@ void polyakov_loop(_Complex double *pl_, const int mu) {
 
   /* Collect the results and return:*/
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&pl, &pls, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&pl, &pls, 2, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   pl = pls;
 #endif
 
diff --git a/src/lib/measure_gauge_action.c b/src/lib/measure_gauge_action.c
index ecbe7a888..654990166 100644
--- a/src/lib/measure_gauge_action.c
+++ b/src/lib/measure_gauge_action.c
@@ -96,7 +96,7 @@ double measure_plaquette(const su3 *const *const gf) {
   for (int i = 0; i < omp_num_threads; ++i) res += g_omp_acc_re[i];
 #endif
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   res = mres;
 #endif
   return res;
@@ -178,7 +178,7 @@ double measure_gauge_action(const su3 *const *const gf, const double lambda) {
   for (int i = 0; i < omp_num_threads; ++i) res += g_omp_acc_re[i];
 #endif
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   res = mres;
 #endif
   GaugeInfo.plaquetteEnergy = res;
diff --git a/src/lib/measure_rectangles.c b/src/lib/measure_rectangles.c
index e6f59b8b2..4f0ca07fd 100644
--- a/src/lib/measure_rectangles.c
+++ b/src/lib/measure_rectangles.c
@@ -129,7 +129,7 @@ double measure_rectangles(const su3 **const gf) {
 #else
 #endif
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   res = mres;
 #endif
 
diff --git a/src/lib/monomial/moment_energy.c b/src/lib/monomial/moment_energy.c
index 01776edfa..c9dfe16e6 100644
--- a/src/lib/monomial/moment_energy.c
+++ b/src/lib/monomial/moment_energy.c
@@ -69,7 +69,7 @@ double moment_energy(su3adj **const momenta) {
   kc = 0.5 * (ks + kc);
 #ifdef TM_USE_MPI
   ks = kc;
-  MPI_Allreduce(&ks, &kc, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ks, &kc, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
 #endif
   etime = gettime();
   if (g_proc_id == 0) {
diff --git a/src/lib/monomial/monitor_forces.c b/src/lib/monomial/monitor_forces.c
index dc186ac05..ea64ddcfa 100644
--- a/src/lib/monomial/monitor_forces.c
+++ b/src/lib/monomial/monitor_forces.c
@@ -89,9 +89,9 @@ void monitor_forces(hamiltonian_field_t* const hf) {
 
       // output for force monitoring
 #ifdef TM_USE_MPI
-      MPI_Reduce(&sum, &sum2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+      MPI_Reduce(&sum, &sum2, 1, MPI_DOUBLE, MPI_SUM, 0, app()->mpi.comm);
       sum = sum2;
-      MPI_Reduce(&max, &sum2, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+      MPI_Reduce(&max, &sum2, 1, MPI_DOUBLE, MPI_MAX, 0, app()->mpi.comm);
       max = sum2;
 #endif
       if (g_proc_id == 0) {
diff --git a/src/lib/mpi_init.c b/src/lib/mpi_init.c
index f245f0556..6a8c81e40 100644
--- a/src/lib/mpi_init.c
+++ b/src/lib/mpi_init.c
@@ -159,7 +159,7 @@ void reduce_su3_ray(void *u_i /* in */, void *u_io /* in/out */, int *len /* in
 
   if (*dt != mpi_su3) {
     fprintf(stderr, "\nInvalid datatype for reduce_su3_ray(); abort.\n");
-    MPI_Abort(MPI_COMM_WORLD, 1);
+    MPI_Abort(app()->mpi.comm, 1);
   }
   for (n = 0; n < *len; n++) {
     _su3_times_su3(tmp, *(u + n), *(v + n)) _su3_assign(*(v + n), tmp)
@@ -249,8 +249,8 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   dims[2] = N_PROC_Y;
   dims[3] = N_PROC_Z;
 
-  MPI_Comm_size(MPI_COMM_WORLD, &g_nproc);
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+  MPI_Comm_size(app()->mpi.comm, &g_nproc);
+  MPI_Comm_rank(app()->mpi.comm, &g_proc_id);
   MPI_Get_processor_name(processor_name, &namelen);
   MPI_Dims_create(g_nproc, nalldims, dims);
   if (g_proc_id == 0) {
@@ -273,7 +273,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
       fprintf(stderr, "Please check your number of processors and the Nr?Procs input variables\n");
       fprintf(stderr, "Aborting...!\n");
     }
-    MPI_Abort(MPI_COMM_WORLD, 1);
+    MPI_Abort(app()->mpi.comm, 1);
     MPI_Finalize();
     exit(-1);
   }
@@ -332,7 +332,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
   halffield_buffer_z2 = (halfspinor *)malloc(T * LX * LY / 2 * sizeof(halfspinor));
 #endif
 
-  MPI_Cart_create(MPI_COMM_WORLD, nalldims, dims, periods, reorder, &g_cart_grid);
+  MPI_Cart_create(app()->mpi.comm, nalldims, dims, periods, reorder, &g_cart_grid);
   MPI_Comm_rank(g_cart_grid, &g_cart_id);
   MPI_Cart_coords(g_cart_grid, g_cart_id, nalldims, g_proc_coords);
   if (g_debug_level > 1) {
diff --git a/src/lib/operator/clover_det.c b/src/lib/operator/clover_det.c
index 20984ba7f..f7f3d28cb 100644
--- a/src/lib/operator/clover_det.c
+++ b/src/lib/operator/clover_det.c
@@ -169,7 +169,7 @@ double sw_trace(const int ieo, const double mu) {
 #endif
 
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return (mres);
 #else
@@ -259,7 +259,7 @@ double sw_trace_nd(const int ieo, const double mu, const double eps) {
 #endif
 
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return (mres);
 #else
diff --git a/src/lib/prepare_source.c b/src/lib/prepare_source.c
index 3b45c35a1..8f15c9c7b 100644
--- a/src/lib/prepare_source.c
+++ b/src/lib/prepare_source.c
@@ -91,7 +91,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
               }
             }
 #ifdef TM_USE_MPI
-            MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
+            MPI_Bcast(&t, 1, MPI_INT, 0, app()->mpi.comm);
 #endif
             SourceInfo.t = t;
           }
@@ -180,7 +180,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
           t = (int)(u * g_nproc_t * T);
         }
 #ifdef TM_USE_MPI
-        MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        MPI_Bcast(&t, 1, MPI_INT, 0, app()->mpi.comm);
 #endif
         SourceInfo.t = t;
       }
@@ -206,7 +206,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
           }
         }
 #ifdef TM_USE_MPI
-        MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD);
+        MPI_Bcast(&t, 1, MPI_INT, 0, app()->mpi.comm);
 #endif
         SourceInfo.t = t;
       }
@@ -315,7 +315,7 @@ void prepare_source(const int nstore, const int isample, const int ix, const int
         if (read_spinor(g_spinor_field[2], g_spinor_field[3], source_filename, 0) != 0) {
           fprintf(stderr, "Error reading source! Aborting...\n");
 #ifdef TM_USE_MPI
-          MPI_Abort(MPI_COMM_WORLD, 1);
+          MPI_Abort(app()->mpi.comm, 1);
           MPI_Finalize();
 #endif
           exit(-1);
diff --git a/src/lib/qphix/qphix_interface.cpp b/src/lib/qphix/qphix_interface.cpp
index 2c61427dd..4469d3109 100644
--- a/src/lib/qphix/qphix_interface.cpp
+++ b/src/lib/qphix/qphix_interface.cpp
@@ -224,9 +224,9 @@ void _initQphix(int argc, char **argv, tm_QPhiXParams_t params, int c12, QphixPr
                  g_proc_coords[2], g_proc_coords[3], g_proc_coords[0]);
           free(qmp_coords);
           fflush(stdout);
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
         } else {
-          MPI_Barrier(MPI_COMM_WORLD);
+          MPI_Barrier(app()->mpi.comm);
         }
       }
     }
diff --git a/src/lib/quda_interface.c b/src/lib/quda_interface.c
index 0e55f5cb9..f2586c7cf 100644
--- a/src/lib/quda_interface.c
+++ b/src/lib/quda_interface.c
@@ -2926,20 +2926,20 @@ void quda_mg_tune_params(void *spinorOut, void *spinorIn, const int max_iter) {
     copy_quda_mg_tunable_params(&tunable_params[0], &cur_params);
     print_tunable_params_pair(&cur_params, &tunable_params[0], mg_n_level);
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     tm_stopwatch_push(&g_timers, "updateMultigridQuda", "");
     updateMultigridQuda(quda_mg_preconditioner, &quda_mg_param);
     tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
   }
 
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
   tm_stopwatch_push(&g_timers, "invertQuda", "");
   invertQuda(spinorOut, spinorIn, &inv_param);
   tunable_params[0].tts = inv_param.secs;
   tunable_params[0].iter = inv_param.iter;
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 
   for (i = 1; i < quda_mg_tuning_plan.mg_tuning_iterations; i++) {
     // the best params from all previous iterations
@@ -2984,16 +2984,16 @@ void quda_mg_tune_params(void *spinorOut, void *spinorIn, const int max_iter) {
 
     print_tunable_params_pair(&cur_params, &tunable_params[i], mg_n_level);
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     tm_stopwatch_push(&g_timers, "updateMultigridQuda", "");
     updateMultigridQuda(quda_mg_preconditioner, &quda_mg_param);
     tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     tm_stopwatch_push(&g_timers, "invertQuda", "");
     invertQuda(spinorOut, spinorIn, &inv_param);
     tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     tunable_params[i].tts = inv_param.secs;
     tunable_params[i].iter = inv_param.iter;
diff --git a/src/lib/reweighting_factor.c b/src/lib/reweighting_factor.c
index df283f6c0..8e39d3148 100644
--- a/src/lib/reweighting_factor.c
+++ b/src/lib/reweighting_factor.c
@@ -163,6 +163,6 @@ void reweighting_factor(const int N, const int nstore) {
   free(data);
   free(trlog);
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif  // TM_USE_MPI
 }
diff --git a/src/lib/sighandler.c b/src/lib/sighandler.c
index 7bcb36d15..1447a678e 100644
--- a/src/lib/sighandler.c
+++ b/src/lib/sighandler.c
@@ -49,7 +49,7 @@ void catch_ill_inst(int s) {
   fprintf(stderr, "Aborting...\n");
   fflush(stdout);
 #ifdef TM_USE_MPI
-  MPI_Abort(MPI_COMM_WORLD, 1);
+  MPI_Abort(app()->mpi.comm, 1);
   MPI_Finalize();
 #endif
   exit(0);
diff --git a/src/lib/solver/dfl_projector.c b/src/lib/solver/dfl_projector.c
index b840aabfa..f54e0b998 100644
--- a/src/lib/solver/dfl_projector.c
+++ b/src/lib/solver/dfl_projector.c
@@ -428,7 +428,7 @@ void little_project(_Complex double *const out, _Complex double *const in, const
   }
 
 #ifdef TM_USE_MPI
-  MPI_Allreduce(phi, psi, N, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(phi, psi, N, MPI_DOUBLE_COMPLEX, MPI_SUM, app()->mpi.comm);
 #else
   memcpy(psi, phi, N * sizeof(_Complex double));
 #endif
@@ -487,7 +487,7 @@ void little_project2(_Complex double *const out, _Complex double *const in, cons
     phi[i] = lscalar_prod(little_dfl_fields[i], in, nb_blocks * N, 0);
   }
 #ifdef TM_USE_MPI
-  MPI_Allreduce(phi, psi, g_N_s, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(phi, psi, g_N_s, MPI_DOUBLE_COMPLEX, MPI_SUM, app()->mpi.comm);
 #else
   memcpy(psi, phi, g_N_s * sizeof(_Complex double));
 #endif
@@ -1023,7 +1023,7 @@ void check_little_D_inversion(const int repro) {
   little_D(result, invvec); /* This should be a proper inverse now */
 
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif
 
   ldiff(invvec, result, inprod, nb_blocks * g_N_s);
diff --git a/src/lib/solver/gcr4complex_body.c b/src/lib/solver/gcr4complex_body.c
index 29351d87f..556838c83 100644
--- a/src/lib/solver/gcr4complex_body.c
+++ b/src/lib/solver/gcr4complex_body.c
@@ -218,7 +218,7 @@ _F_TYPE _PSWITCH(lsquare_norm)(_C_TYPE *const Q, const int N, const int parallel
 #ifdef TM_USE_MPI
   if (parallel) {
     double nrm2 = nrm;
-    MPI_Allreduce(&nrm2, &nrm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&nrm2, &nrm, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   }
 #endif
 
@@ -253,7 +253,7 @@ _C_TYPE _PSWITCH(lscalar_prod)(_C_TYPE *const R, _C_TYPE *const S, const int N,
 #ifdef TM_USE_MPI
   if (parallel) {
     _Complex double res2 = res;
-    MPI_Allreduce(&res2, &res, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res2, &res, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, app()->mpi.comm);
   }
 #endif
 
@@ -289,7 +289,7 @@ _F_TYPE _PSWITCH(lscalar_prod_r)(_C_TYPE *const R, _C_TYPE *const S, const int N
 #ifdef TM_USE_MPI
   if (parallel) {
     double res2 = res;
-    MPI_Allreduce(&res2, &res, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&res2, &res, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   }
 #endif
 
diff --git a/src/lib/solver/little_project_eo_body.c b/src/lib/solver/little_project_eo_body.c
index eaca17537..7d43100eb 100644
--- a/src/lib/solver/little_project_eo_body.c
+++ b/src/lib/solver/little_project_eo_body.c
@@ -37,7 +37,7 @@ void _PSWITCH(little_project_eo)(_Complex _F_TYPE *const out, _Complex _F_TYPE *
   }
 
 #ifdef TM_USE_MPI
-  MPI_Allreduce(phi, psi, N, _MPI_C_TYPE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(phi, psi, N, _MPI_C_TYPE, MPI_SUM, app()->mpi.comm);
 #else
   memcpy(psi, phi, N * sizeof(_Complex _F_TYPE));
 #endif
diff --git a/src/lib/start.c b/src/lib/start.c
index 7316ec9bd..5504a83a2 100644
--- a/src/lib/start.c
+++ b/src/lib/start.c
@@ -218,7 +218,7 @@ void random_spinor_field_lexic(spinor *const k, const int repro, const enum RN_T
     } else if (g_proc_id == 0) {
       rlxd_get(rlxd_state);
     }
-    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, app()->mpi.comm);
     if (g_proc_id != 0) {
       rlxd_reset(rlxd_state);
     }
@@ -289,7 +289,7 @@ void random_spinor_field_eo(spinor *const k, const int repro, const enum RN_TYPE
     } else if (g_proc_id == 0) {
       rlxd_get(rlxd_state);
     }
-    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, app()->mpi.comm);
     if (g_proc_id != 0) {
       rlxd_reset(rlxd_state);
     }
@@ -432,7 +432,7 @@ void random_gauge_field(const int repro, su3 **const gf) {
     } else if (g_proc_id == 0) {
       rlxd_get(rlxd_state);
     }
-    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, app()->mpi.comm);
     rlxd_reset(rlxd_state);
 #endif
     for (t0 = 0; t0 < g_nproc_t * T; t0++) {
@@ -506,7 +506,7 @@ double random_su3adj_field(const int repro, su3adj **const momenta) {
     } else if (g_proc_id == 0) {
       rlxd_get(rlxd_state);
     }
-    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(rlxd_state, 105, MPI_INT, 0, app()->mpi.comm);
     rlxd_reset(rlxd_state);
 #endif
     for (int t0 = 0; t0 < g_nproc_t * T; t0++) {
@@ -589,7 +589,7 @@ double random_su3adj_field(const int repro, su3adj **const momenta) {
     kc = 0.5 * (ks + kc);
   }
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   return ks;
 #endif
   return kc;
@@ -838,7 +838,7 @@ void start_ranlux(int level, int seed) {
 #ifdef TM_USE_MPI
   unsigned int *seeds = calloc(g_nproc, sizeof(unsigned int));
   if (seeds == NULL) fatal_error("Memory allocation for seeds buffer failed!", "start_ranlux");
-  MPI_Gather(&loc_seed, 1, MPI_UNSIGNED, seeds, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
+  MPI_Gather(&loc_seed, 1, MPI_UNSIGNED, seeds, 1, MPI_UNSIGNED, 0, app()->mpi.comm);
   if (g_proc_id == 0) {
     for (int i = 0; i < g_nproc; ++i) {
       for (int j = i + 1; j < g_nproc; ++j) {
diff --git a/src/lib/test/check_xchange.c b/src/lib/test/check_xchange.c
index a20f86df4..7b4c2067b 100644
--- a/src/lib/test/check_xchange.c
+++ b/src/lib/test/check_xchange.c
@@ -85,9 +85,9 @@ int check_xchange() {
     }
 #endif
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_field(g_spinor_field[0], 0);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     x = (double*)&g_spinor_field[0][VOLUME / 2];
     for (i = 0; i < LX * LY * LZ / 2 * 24; i++, x++) {
@@ -95,7 +95,7 @@ int check_xchange() {
         printf("The exchange up of fields in time direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -107,7 +107,7 @@ int check_xchange() {
         printf("The exchange down of fields in time direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -120,7 +120,7 @@ int check_xchange() {
         printf("The exchange up of fields in x direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -132,7 +132,7 @@ int check_xchange() {
         printf("The exchange down of fields in x direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -146,7 +146,7 @@ int check_xchange() {
         printf("The exchange up of fields in y direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -159,7 +159,7 @@ int check_xchange() {
         printf("The exchange down of fields in y direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -178,9 +178,9 @@ int check_xchange() {
       }
     }
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_field(g_spinor_field[0], 1);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     x = (double*)&g_spinor_field[0][VOLUME / 2 + 2 * LX * LY * LZ / 2 + 2 * T * LY * LZ / 2 +
                                     2 * T * LX * LZ / 2];
@@ -189,7 +189,7 @@ int check_xchange() {
         printf("The exchange up of fields in z (1) direction up\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -202,7 +202,7 @@ int check_xchange() {
         printf("The exchange down of fields in z (1) direction down\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -219,9 +219,9 @@ int check_xchange() {
       }
     }
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_field(g_spinor_field[0], 1);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     x = (double*)&g_spinor_field[0][VOLUME / 2 + 2 * LX * LY * LZ / 2 + 2 * T * LY * LZ / 2 +
                                     2 * T * LX * LZ / 2];
@@ -230,7 +230,7 @@ int check_xchange() {
         printf("The exchange up of fields in z (0) direction up\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -243,7 +243,7 @@ int check_xchange() {
         printf("The exchange down of fields in z (0) direction down\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -312,9 +312,9 @@ int check_xchange() {
     }
 #endif
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_gauge(g_gauge_field);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     x = (double*)&g_gauge_field[T * LX * LY * LZ][0];
     for (i = 0; i < LX * LY * LZ * 72; i++, x++) {
@@ -322,7 +322,7 @@ int check_xchange() {
         printf("The exchange up of gaugefields in time direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -334,7 +334,7 @@ int check_xchange() {
         printf("The exchange down of gaugefields in time direction\n");
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -348,7 +348,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up);
         printf("%d %d %d\n", g_cart_id, i, (int)(*x));
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -361,7 +361,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn);
         printf("%d %d %d\n", g_cart_id, i, (int)(*x));
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -376,7 +376,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up);
         printf("%d %d %d\n", g_cart_id, i, (int)(*x));
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -389,7 +389,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn);
         printf("%d %d %d\n", g_cart_id, i, (int)(*x));
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -404,7 +404,7 @@ int check_xchange() {
         printf("between %d and %d is not correct, down is %d\n", g_cart_id, g_nb_z_up, g_nb_z_dn);
         printf("%d %d %d\n", g_cart_id, i, (int)(*x));
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -418,7 +418,7 @@ int check_xchange() {
         printf("between %d and %d is not correct, up is %d\n", g_cart_id, g_nb_z_dn, g_nb_z_up);
         printf("%d %d %d\n", g_cart_id, i, (int)(*x));
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -499,9 +499,9 @@ int check_xchange() {
       }
     }
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_gauge(g_gauge_field);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
     /* The edges */
 #if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
@@ -531,7 +531,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -544,7 +544,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -557,7 +557,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -570,7 +570,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -600,7 +600,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -613,7 +613,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -626,7 +626,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -639,7 +639,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -667,7 +667,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -680,7 +680,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -693,7 +693,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -706,7 +706,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -735,7 +735,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -748,7 +748,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -761,7 +761,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -774,7 +774,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -802,7 +802,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -816,7 +816,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -830,7 +830,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -844,7 +844,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -873,7 +873,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -887,7 +887,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mp);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -901,7 +901,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, pm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -915,7 +915,7 @@ int check_xchange() {
         printf("between %d and %d is not correct\n", g_cart_id, mm);
         printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
         printf("Program aborted\n");
-        MPI_Abort(MPI_COMM_WORLD, 5);
+        MPI_Abort(app()->mpi.comm, 5);
         MPI_Finalize();
         exit(0);
       }
@@ -973,9 +973,9 @@ int check_xchange() {
         }
       }
 
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
       xchange_gauge(g_gauge_field);
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
 
       x = (double*)&g_gauge_field[VOLUMEPLUSRAND][0];
       for (i = 0; i < LX * LY * LZ * 72; i++, x++) {
@@ -983,7 +983,7 @@ int check_xchange() {
           printf("The exchange up of gaugefields in time direction\n");
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -995,7 +995,7 @@ int check_xchange() {
           printf("The exchange up of gaugefields in time direction\n");
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1009,7 +1009,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up);
           printf("%d %d %d\n", g_cart_id, i, (int)(*x));
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1022,7 +1022,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn);
           printf("%d %d %d\n", g_cart_id, i, (int)(*x));
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1037,7 +1037,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up);
           printf("%d %d %d\n", g_cart_id, i, (int)(*x));
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1051,7 +1051,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn);
           printf("%d %d %d\n", g_cart_id, i, (int)(*x));
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1067,7 +1067,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up);
           printf("%d %d %d\n", g_cart_id, i, (int)(*x));
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1081,7 +1081,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn);
           printf("%d %d %d\n", g_cart_id, i, (int)(*x));
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1108,9 +1108,9 @@ int check_xchange() {
         }
       }
 
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
       xchange_gauge(g_gauge_field);
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
 
       /* Now there should be in the t and t2 Rand certain values set */
 
@@ -1128,7 +1128,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1141,7 +1141,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1154,7 +1154,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1167,7 +1167,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1181,7 +1181,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1194,7 +1194,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1207,7 +1207,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1220,7 +1220,7 @@ int check_xchange() {
               printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn);
               printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn);
               printf("Program aborted\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -1328,9 +1328,9 @@ int check_xchange() {
         }
       }
 #endif
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
       xchange_gauge(g_gauge_field);
-      MPI_Barrier(MPI_COMM_WORLD);
+      MPI_Barrier(app()->mpi.comm);
 
 #if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
       di[0] = (g_proc_coords[0] - 1) % g_nproc_t;
@@ -1355,7 +1355,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1368,7 +1368,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1381,7 +1381,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1394,7 +1394,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1407,7 +1407,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1420,7 +1420,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1433,7 +1433,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1446,7 +1446,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1477,7 +1477,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1490,7 +1490,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1503,7 +1503,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1516,7 +1516,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1529,7 +1529,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1542,7 +1542,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1555,7 +1555,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1568,7 +1568,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1596,7 +1596,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1609,7 +1609,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1622,7 +1622,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1635,7 +1635,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1648,7 +1648,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1661,7 +1661,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1674,7 +1674,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1687,7 +1687,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1717,7 +1717,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1730,7 +1730,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1744,7 +1744,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1758,7 +1758,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1772,7 +1772,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1786,7 +1786,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1800,7 +1800,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1814,7 +1814,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1844,7 +1844,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1858,7 +1858,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1872,7 +1872,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1886,7 +1886,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1900,7 +1900,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1914,7 +1914,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1928,7 +1928,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1942,7 +1942,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1972,7 +1972,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -1986,7 +1986,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2000,7 +2000,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2014,7 +2014,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2028,7 +2028,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2042,7 +2042,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, pm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2056,7 +2056,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mp);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2070,7 +2070,7 @@ int check_xchange() {
           printf("between %d and %d is not correct\n", g_cart_id, mm);
           printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm);
           printf("Program aborted\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -2190,9 +2190,9 @@ int check_xchange() {
     }
 #endif
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_deri(df0);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
 #if defined TM_PARALLELT
     for (x1 = 0; x1 < LX; x1++) {
@@ -2205,7 +2205,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_t_up) {
                 printf("Exchange of derivatives is working not correctly (1u)!\n");
                 printf("Aborting program!");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2218,7 +2218,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_t_dn) {
                 printf("Exchange of derivatives is working not correctly (1d)!\n");
                 printf("Aborting program!");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2239,7 +2239,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_t_up) {
                 printf("Exchange of derivatives is working not correctly (2u)!\n");
                 printf("Aborting program!");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2252,7 +2252,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_t_dn) {
                 printf("Exchange of derivatives is working not correctly (2d)!\n");
                 printf("Aborting program!");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2261,7 +2261,7 @@ int check_xchange() {
         }
       }
     }
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     for (x0 = 1; x0 < T - 1; x0++) {
       for (x2 = 0; x2 < LY; x2++) {
         for (x3 = 0; x3 < LZ; x3++) {
@@ -2272,7 +2272,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_x_up) {
                 printf("Exchange of derivatives is working not correctly (3u)!\n");
                 printf("Aborting program!");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2285,7 +2285,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_x_dn) {
                 printf("Exchange of derivatives is working not correctly (3d)!\n");
                 printf("Aborting program!");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2303,7 +2303,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_up + g_nb_t_up) {
               printf("Exchange of derivatives is working not correctly (4uu)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2316,7 +2316,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_up + g_nb_t_dn) {
               printf("Exchange of derivatives is working not correctly (4ud)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2329,7 +2329,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_dn + g_nb_t_up) {
               printf("Exchange of derivatives is working not correctly (4du)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2342,7 +2342,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_dn + g_nb_t_dn) {
               printf("Exchange of derivatives is working not correctly (4dd)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2362,7 +2362,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_t_up) {
                 printf("Exchange of derivatives is working not correctly (5u)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2375,7 +2375,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_t_dn) {
                 printf("Exchange of derivatives is working not correctly (5d)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2394,7 +2394,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_x_up) {
                 printf("Exchange of derivatives is working not correctly (6u)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2407,7 +2407,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_x_dn) {
                 printf("Exchange of derivatives is working not correctly (6d)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2426,7 +2426,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_y_up) {
                 printf("Exchange of derivatives is working not correctly (7u)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2439,7 +2439,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_y_dn) {
                 printf("Exchange of derivatives is working not correctly (7d)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2457,7 +2457,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_up + g_nb_t_up) {
               printf("Exchange of derivatives is working not correctly (8uu)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2470,7 +2470,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_up + g_nb_t_dn) {
               printf("Exchange of derivatives is working not correctly (8ud)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2483,7 +2483,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_dn + g_nb_t_up) {
               printf("Exchange of derivatives is working not correctly (8du)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2496,7 +2496,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_dn + g_nb_t_dn) {
               printf("Exchange of derivatives is working not correctly (8dd)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2513,7 +2513,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_t_up + g_nb_y_up) {
               printf("Exchange of derivatives is working not correctly (9uu)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2526,7 +2526,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_t_up + g_nb_y_dn) {
               printf("Exchange of derivatives is working not correctly (9ud)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2541,7 +2541,7 @@ int check_xchange() {
               printf("%d %d %d %d %d %d %d\n", (int)x[j], g_nb_t_dn, g_nb_t_up, g_nb_y_dn,
                      g_nb_y_up, x1, x3);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2554,7 +2554,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_t_dn + g_nb_y_dn) {
               printf("Exchange of derivatives is working not correctly (9dd)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2571,7 +2571,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_up + g_nb_y_up) {
               printf("Exchange of derivatives is working not correctly (10uu)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2584,7 +2584,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_up + g_nb_y_dn) {
               printf("Exchange of derivatives is working not correctly (10ud)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2597,7 +2597,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_dn + g_nb_y_up) {
               printf("Exchange of derivatives is working not correctly (10du)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2610,7 +2610,7 @@ int check_xchange() {
             if ((int)x[j] != g_nb_x_dn + g_nb_y_dn) {
               printf("Exchange of derivatives is working not correctly (10dd)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2626,7 +2626,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_up + g_nb_y_up + g_nb_t_up) {
             printf("Exchange of derivatives is working not correctly (11uuu)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2639,7 +2639,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_dn + g_nb_y_up + g_nb_t_up) {
             printf("Exchange of derivatives is working not correctly (11duu)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2652,7 +2652,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_dn + g_nb_y_up + g_nb_t_dn) {
             printf("Exchange of derivatives is working not correctly (11dud)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2665,7 +2665,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_dn + g_nb_y_dn + g_nb_t_up) {
             printf("Exchange of derivatives is working not correctly (11ddu)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2678,7 +2678,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_up + g_nb_y_dn + g_nb_t_dn) {
             printf("Exchange of derivatives is working not correctly (11udd)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2691,7 +2691,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_up + g_nb_y_up + g_nb_t_dn) {
             printf("Exchange of derivatives is working not correctly (11uud)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2704,7 +2704,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_up + g_nb_y_dn + g_nb_t_up) {
             printf("Exchange of derivatives is working not correctly (11udu)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2717,7 +2717,7 @@ int check_xchange() {
           if ((int)x[j] != g_nb_x_dn + g_nb_y_dn + g_nb_t_dn) {
             printf("Exchange of derivatives is working not correctly (11ddd)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2735,7 +2735,7 @@ int check_xchange() {
                 if ((int)x[j] != 0) {
                   printf("Exchange of derivatives is working not correctly (bulk XYT)!\n");
                   printf("Aborting program!\n");
-                  MPI_Abort(MPI_COMM_WORLD, 5);
+                  MPI_Abort(app()->mpi.comm, 5);
                   MPI_Finalize();
                   exit(0);
                 }
@@ -2761,7 +2761,7 @@ int check_xchange() {
                 printf("%d %d %d %d %d\n", x1, x2, x3, ix, g_proc_id);
                 printf("%f %d %d\n", df0[ix][mu].d8, g_nb_t_up, g_nb_t_dn);
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2780,7 +2780,7 @@ int check_xchange() {
               if ((int)x[j] != g_nb_x_up) {
                 printf("Exchange of derivatives is working not correctly (13)!\n");
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2800,7 +2800,7 @@ int check_xchange() {
                 printf("Exchange of derivatives is working not correctly (14)!\n");
                 printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id);
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2820,7 +2820,7 @@ int check_xchange() {
                 printf("Exchange of derivatives is working not correctly (15)!\n");
                 printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id);
                 printf("Aborting program!\n");
-                MPI_Abort(MPI_COMM_WORLD, 5);
+                MPI_Abort(app()->mpi.comm, 5);
                 MPI_Finalize();
                 exit(0);
               }
@@ -2838,7 +2838,7 @@ int check_xchange() {
             if ((int)x[j] != (g_nb_x_up + g_nb_t_up)) {
               printf("Exchange of derivatives is working not correctly (16)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2855,7 +2855,7 @@ int check_xchange() {
             if ((int)x[j] != (g_nb_y_up + g_nb_t_up)) {
               printf("Exchange of derivatives is working not correctly (17)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2872,7 +2872,7 @@ int check_xchange() {
             if ((int)x[j] != (g_nb_y_up + g_nb_x_up)) {
               printf("Exchange of derivatives is working not correctly (18)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2890,7 +2890,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (19)!\n");
               printf("%f %d %d %d\n", df0[ix][mu].d1, g_nb_x_up + g_nb_z_up, g_nb_x_up, g_nb_z_up);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2907,7 +2907,7 @@ int check_xchange() {
             if ((int)x[j] != (g_nb_y_up + g_nb_z_up)) {
               printf("Exchange of derivatives is working not correctly (20)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2924,7 +2924,7 @@ int check_xchange() {
             if ((int)x[j] != (g_nb_t_up + g_nb_z_up)) {
               printf("Exchange of derivatives is working not correctly (21)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -2940,7 +2940,7 @@ int check_xchange() {
           if ((int)x[j] != (g_nb_t_up + g_nb_x_up + g_nb_y_up)) {
             printf("Exchange of derivatives is working not correctly (22)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2955,7 +2955,7 @@ int check_xchange() {
           if ((int)x[j] != (g_nb_t_up + g_nb_x_up + g_nb_z_up)) {
             printf("Exchange of derivatives is working not correctly (23)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2970,7 +2970,7 @@ int check_xchange() {
           if ((int)x[j] != (g_nb_t_up + g_nb_z_up + g_nb_y_up)) {
             printf("Exchange of derivatives is working not correctly (24)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2985,7 +2985,7 @@ int check_xchange() {
           if ((int)x[j] != (g_nb_z_up + g_nb_x_up + g_nb_y_up)) {
             printf("Exchange of derivatives is working not correctly (25)!\n");
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -2999,7 +2999,7 @@ int check_xchange() {
         if ((int)x[j] != (g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up)) {
           printf("Exchange of derivatives is working not correctly (26)!\n");
           printf("Aborting program!\n");
-          MPI_Abort(MPI_COMM_WORLD, 5);
+          MPI_Abort(app()->mpi.comm, 5);
           MPI_Finalize();
           exit(0);
         }
@@ -3135,9 +3135,9 @@ int check_xchange() {
 
 #endif
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
     xchange_deri(df0);
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(app()->mpi.comm);
 
 #if (defined TM_PARALLELXT || defined TM_PARALLELXYT || defined TM_PARALLELXYZT)
 
@@ -3171,7 +3171,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e5mm)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pp, mm);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3185,7 +3185,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e5mp)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pm, mp);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3199,7 +3199,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e5pm)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pm, mp);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3213,7 +3213,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e5pp)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pp, mm);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3251,7 +3251,7 @@ int check_xchange() {
             if ((int)x[j] != mm) {
               printf("Exchange of derivatives is working not correctly (e6mm)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3265,7 +3265,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e6pm)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_x_up, g_nb_y_up, pm, mp);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3279,7 +3279,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e6mp)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_x_up, g_nb_y_up, pm, mp);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3293,7 +3293,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e6pp)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_x_up, g_nb_y_up, pp, mm);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3326,7 +3326,7 @@ int check_xchange() {
             if ((int)x[j] != mm) {
               printf("Exchange of derivatives is working not correctly (e7mm)!\n");
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3340,7 +3340,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e7pm)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_y_up, pm, pm);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3354,7 +3354,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e7mp)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_y_up, pm, mp);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3368,7 +3368,7 @@ int check_xchange() {
               printf("Exchange of derivatives is working not correctly (e7pp)!\n");
               printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_y_up, pp, mm);
               printf("Aborting program!\n");
-              MPI_Abort(MPI_COMM_WORLD, 5);
+              MPI_Abort(app()->mpi.comm, 5);
               MPI_Finalize();
               exit(0);
             }
@@ -3402,7 +3402,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8mmm)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3435,7 +3435,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8pmm)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3468,7 +3468,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8pmp)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3501,7 +3501,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8ppp)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3534,7 +3534,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8mpm)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3567,7 +3567,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8mmp)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3600,7 +3600,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8mpp)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3633,7 +3633,7 @@ int check_xchange() {
             printf("Exchange of derivatives is working not correctly (e8ppm)!\n");
             printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp);
             printf("Aborting program!\n");
-            MPI_Abort(MPI_COMM_WORLD, 5);
+            MPI_Abort(app()->mpi.comm, 5);
             MPI_Finalize();
             exit(0);
           }
@@ -3651,7 +3651,7 @@ int check_xchange() {
                 if ((int)x[j] != 0) {
                   printf("Exchange of derivatives is working not correctly (ebulk XYT)!\n");
                   printf("Aborting program!\n");
-                  MPI_Abort(MPI_COMM_WORLD, 5);
+                  MPI_Abort(app()->mpi.comm, 5);
                   MPI_Finalize();
                   exit(0);
                 }
diff --git a/src/lib/update_tm.c b/src/lib/update_tm.c
index 3f1cdc5d5..df0e0fd54 100644
--- a/src/lib/update_tm.c
+++ b/src/lib/update_tm.c
@@ -174,7 +174,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
      the other sites */
   ranlxd(yy, 1);
 #ifdef TM_USE_MPI
-  MPI_Bcast(&yy[0], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&yy[0], 1, MPI_DOUBLE, 0, app()->mpi.comm);
 #endif
 
   /* when acctest is 0 (i.e. do not perform acceptance test), the trajectory is accepted whatever
@@ -277,7 +277,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy, char *filename
 
 #ifdef TM_USE_MPI
     tmp = ret_gauge_diff;
-    MPI_Reduce(&tmp, &ret_gauge_diff, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&tmp, &ret_gauge_diff, 1, MPI_DOUBLE, MPI_SUM, 0, app()->mpi.comm);
 #endif
     /* compute the total H */
     tmp = enep;
diff --git a/src/lib/wrapper/lib_wrapper.c b/src/lib/wrapper/lib_wrapper.c
index 19d36ddc6..71d78bed4 100644
--- a/src/lib/wrapper/lib_wrapper.c
+++ b/src/lib/wrapper/lib_wrapper.c
@@ -98,7 +98,7 @@ int tmLQCD_invert_init(int argc, char* argv[], const int _verbose, const int ext
   g_use_clover_flag = 0;
 
 #ifdef TM_USE_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+  MPI_Comm_rank(app()->mpi.comm, &g_proc_id);
 #else
   g_proc_id = 0;
 #endif
@@ -341,7 +341,7 @@ int tmLQCD_finalise() {
   free_moment_field();
   free_chi_spinor_field();
 #ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Barrier(app()->mpi.comm);
 #endif
   return (0);
 }
diff --git a/tests/test_buffers.c b/tests/test_buffers.c
index d398c9768..7fedf2fe7 100644
--- a/tests/test_buffers.c
+++ b/tests/test_buffers.c
@@ -13,7 +13,7 @@ TEST_SUITES{TEST_SUITE_ADD(BUFFERS_GAUGE), TEST_SUITES_CLOSURE};
 int main(int argc, char *argv[]) {
 #ifdef TM_USE_MPI
   MPI_Init(&argc, &argv);
-  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
+  MPI_Comm_rank(app()->mpi.comm, &g_proc_id);
 #else
   g_proc_id = 0;
 #endif

From 33da95cd4043949acf074f16894b0186332b4686 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Wed, 25 Feb 2026 15:09:58 +0100
Subject: [PATCH 13/19] we have to give QUDA the current MPI communicator

---
 src/lib/quda_interface.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lib/quda_interface.c b/src/lib/quda_interface.c
index f2586c7cf..b52d4983d 100644
--- a/src/lib/quda_interface.c
+++ b/src/lib/quda_interface.c
@@ -448,6 +448,7 @@ void _initQuda() {
   // initialize the QUDA library
 #ifdef TM_USE_MPI
   initQuda(-1);  // sets device numbers automatically
+  setMPICommHandleQuda(app()->mpi.comm); // pass the proper MPI communicator to QUDA
 #else
   initQuda(0);  // scalar build: use device 0
 #endif

From 06aa7e7d8dd5f527ad0cb4f5fe4637b7d0e80c44 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Wed, 25 Feb 2026 15:37:29 +0100
Subject: [PATCH 14/19] more MPI_COMM_WORLD which were forgotten

---
 src/lib/io/utils_kill_with_error.c      | 2 +-
 src/lib/mpi_init.c                      | 2 +-
 src/lib/test/measure_rectangles.debug.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lib/io/utils_kill_with_error.c b/src/lib/io/utils_kill_with_error.c
index 322536bd7..282e5bc4b 100644
--- a/src/lib/io/utils_kill_with_error.c
+++ b/src/lib/io/utils_kill_with_error.c
@@ -14,7 +14,7 @@ void kill_with_error(LIME_FILE *fh, int const rank, char const *error) {
 #endif /* TM_USE_LEMON */
 
 #ifdef TM_USE_MPI
-  MPI_Abort(MPI_COMM_WORLD, 1);
+  MPI_Abort(app()->mpi.comm, 1);
   MPI_Finalize();
 #endif
   exit(500);
diff --git a/src/lib/mpi_init.c b/src/lib/mpi_init.c
index 6a8c81e40..25c79809a 100644
--- a/src/lib/mpi_init.c
+++ b/src/lib/mpi_init.c
@@ -189,7 +189,7 @@ void tmlqcd_mpi_init(int argc, char *argv[]) {
 
 #ifdef TM_USE_MPI
 #ifdef TM_USE_SHMEM
-  /* we need that the PE number in MPI_COMM_WORL  */
+  /* we need that the PE number in MPI_COMM_WORLD  */
   /* exactly correspond to the one in g_cart_grid */
   reorder = 0;
 #endif
diff --git a/src/lib/test/measure_rectangles.debug.c b/src/lib/test/measure_rectangles.debug.c
index 422f681b2..9c4f51215 100644
--- a/src/lib/test/measure_rectangles.debug.c
+++ b/src/lib/test/measure_rectangles.debug.c
@@ -131,7 +131,7 @@ double measure_rectangles() {
   fclose(debugfile);
   ga = (kc + ks) / 3.0;
 #ifdef TM_USE_MPI
-  MPI_Allreduce(&ga, &gas, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ga, &gas, 1, MPI_DOUBLE, MPI_SUM, app()->mpi.comm);
   return gas;
 #else
   return ga;

From 8e0208188b5366f83c7ab908f1065afd0a29d429 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Thu, 26 Feb 2026 10:45:15 +0100
Subject: [PATCH 15/19] prevent c++ name mangling

---
 src/lib/include/mpi.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/lib/include/mpi.h b/src/lib/include/mpi.h
index 0d5b02993..57c645566 100644
--- a/src/lib/include/mpi.h
+++ b/src/lib/include/mpi.h
@@ -31,6 +31,10 @@
 // include *real* MPI header
 #include_next <mpi.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 /**
  * @brief      MPI context
@@ -74,4 +78,8 @@ void app_context_init(const MPI_Comm comm);
 void app_context_finalize(void);
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif

From 48998f4957b85f952a6494bbbf14a04cde58d25e Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Mon, 2 Mar 2026 11:34:51 +0100
Subject: [PATCH 16/19] added safe allocator

---
 src/lib/CMakeLists.txt  |  6 +++
 src/lib/alloc/alloc.c   | 81 +++++++++++++++++++++++++++++++++++++++++
 src/lib/include/alloc.h | 49 +++++++++++++++++++++++++
 3 files changed, 136 insertions(+)
 create mode 100755 src/lib/alloc/alloc.c
 create mode 100755 src/lib/include/alloc.h

diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 367f33d5f..70fec08c3 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -73,6 +73,11 @@ list(
   APP_CONTEXT_SRC_C
   app_context/app_context.c)
 
+list(
+  APPEND
+  ALLOC_SRC_C
+  alloc/alloc.c)
+
 list(
   APPEND
   SOLVER_SRC_C
@@ -396,6 +401,7 @@ list(
   ${IO_SRC_C}
   ${INIT_SRC_C}
   ${APP_CONTEXT_SRC_C}
+  ${ALLOC_SRC_C}
   ${SOLVER_SRC_C}
   ${TEST_SRC_C}
   ${MEAS_SRC_C}
diff --git a/src/lib/alloc/alloc.c b/src/lib/alloc/alloc.c
new file mode 100755
index 000000000..a853d58d4
--- /dev/null
+++ b/src/lib/alloc/alloc.c
@@ -0,0 +1,81 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2026 Roman Gruber
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Allocation utils
+ *
+ * Author: Roman Gruber
+ *         roman.gruber@unibe.ch
+ *
+ *******************************************************************************/
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+
+/**
+ * @brief      malloc() wrapper that aborts on a zero-size request or on
+ *             allocation failure. Call it through the safe_malloc() macro.
+ *
+ * @param[in]  size  Number of bytes to allocate
+ * @param[in]  file  Caller's __FILE__, used in diagnostics
+ * @param[in]  line  Caller's __LINE__, used in diagnostics
+ * @param[in]  func  Caller's __func__, used in diagnostics
+ *
+ * @return     Non-NULL pointer returned by malloc()
+ */
+void *safe_malloc_impl(size_t size, const char *file, int line, const char *func)
+{
+    void *block;
+
+    if (size == 0) {
+        fprintf(stderr, "safe_malloc: zero-size allocation at %s:%d (%s)\n", file, line, func);
+        abort();
+    }
+    if ((block = malloc(size)) == NULL) {
+        fprintf(stderr, "safe_malloc: failed to allocate %zu bytes at %s:%d (%s): %s\n",
+                size, file, line, func, strerror(errno));
+        abort();
+    }
+    return block;
+}
+
+
+/**
+ * @brief      calloc() counterpart of safe_malloc_impl(): returns size
+ *             zero-initialised bytes and aborts on a zero-size request or on
+ *             allocation failure. Call it through the safe_calloc() macro.
+ */
+void *safe_calloc_impl(size_t size, const char *file, int line, const char *func)
+{
+    void *block;
+
+    if (size == 0) {
+        fprintf(stderr, "safe_calloc: zero-size allocation at %s:%d (%s)\n", file, line, func);
+        abort();
+    }
+    if ((block = calloc(size, 1)) == NULL) {
+        fprintf(stderr, "safe_calloc: failed to allocate %zu bytes at %s:%d (%s): %s\n",
+                size, file, line, func, strerror(errno));
+        abort();
+    }
+    return block;
+}
\ No newline at end of file
diff --git a/src/lib/include/alloc.h b/src/lib/include/alloc.h
new file mode 100755
index 000000000..a4545b333
--- /dev/null
+++ b/src/lib/include/alloc.h
@@ -0,0 +1,49 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2026 Roman Gruber
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Allocation utils
+ *
+ * Author: Roman Gruber
+ *         roman.gruber@unibe.ch
+ *
+ *******************************************************************************/
+
+#ifndef ALLOC_H
+#define ALLOC_H
+
+#include <stdbool.h>
+#include <stddef.h> /* size_t, so that this header is self-contained */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Aborting allocators; use the safe_malloc()/safe_calloc() macros below. */
+void *safe_malloc_impl(size_t size, const char *file, int line, const char *func);
+void *safe_calloc_impl(size_t size, const char *file, int line, const char *func);
+/* Allocate `size` bytes, recording the call site for the failure message. */
+#define safe_malloc(size) safe_malloc_impl((size), __FILE__, __LINE__, __func__)
+#define safe_calloc(size) safe_calloc_impl((size), __FILE__, __LINE__, __func__)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

From e8078d9725c42d6b4b55b5bf7c59d0d7954e572c Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Mon, 2 Mar 2026 11:35:40 +0100
Subject: [PATCH 17/19] added ptbc instance and defect structs

---
 src/lib/app_context/app_context.c | 192 ++++++++++++++++++++++++++++--
 src/lib/include/app.h             | 108 +++++++++++++++++
 src/lib/include/mpi.h             |  58 +--------
 3 files changed, 296 insertions(+), 62 deletions(-)
 create mode 100755 src/lib/include/app.h

diff --git a/src/lib/app_context/app_context.c b/src/lib/app_context/app_context.c
index 00d5d94b2..aa52e3c39 100644
--- a/src/lib/app_context/app_context.c
+++ b/src/lib/app_context/app_context.c
@@ -25,35 +25,211 @@
  *******************************************************************************/
 
 #include <stdbool.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
 #include <mpi.h>
 #include "fatal_error.h"
 
 
+/**
+ * @brief      Node/rank layout of the world communicator as computed by get_topology()
+ */
+typedef struct {
+    int number_of_nodes;    // total number of nodes in the job (size of the node-leader communicator)
+    int number_of_ranks;    // total number of processes in the job, i.e. size of the world communicator
+    int ranks_per_node;     // number_of_ranks / number_of_nodes (integer division)
+    int node_index;         // index enumerating the node (rank of the node's leader among all node leaders)
+    int node_rank;          // rank number inside the node (rank in the shared-memory communicator)
+} RankTopology;
+
+
+/**
+ * @brief      Report a fatal error via fatal_error() if a condition holds,
+ *             passing the formatted message and the "file:line function" location
+ *
+ * @param      test  Condition; the error is raised only when it is true
+ * @param      ...   printf-style format string followed by its arguments
+ */
+#define err(test, ...) err_impl(test, __func__, __FILE__, __LINE__, __VA_ARGS__)
+static void err_impl(const bool test, const char* func, const char* file, const int line, const char* format, ...)
+{
+    char message[1024];
+    char location[1024];
+    va_list ap;
+
+    if (!test) return;
+    va_start(ap, format);
+    vsnprintf(message, sizeof message, format, ap);
+    va_end(ap);
+    snprintf(location, sizeof location, "%s:%d %s", file, line, func);
+    fatal_error(message, location);
+}
+
+
+static void initialize(void); // forward declaration: referenced by the initializer below
 static AppContext app_instance = {
     .mpi = {
-        .comm = MPI_COMM_WORLD // default communicator
+        .comm = MPI_COMM_WORLD, // default communicator; initialize() replaces it when several chains are configured
+        .world_comm = MPI_COMM_WORLD, // communicator that initialize() splits into instances
+    },
+    .ptbc = {
+        .instance_id = 0,
+        .n_instances = 1, // a single chain: initialize() returns without splitting
+        .n_defects = 0,
+        .active = false,
+        .initialize = initialize, // hook invoked as app()->ptbc.initialize()
+        .instances = {{.active = false}},
+        .defects = {{.active = false}}
     }
 };
 
 
+/**
+ * @brief      Return the global *immutable* application context struct. To be
+ *             used when reading parameters; use appm() to modify them.
+ *
+ * @return     Read-only pointer to the file-static app_instance
+ */
 const AppContext* app(void)
 {
     return &app_instance;
 }
 
 
-void app_context_init(const MPI_Comm comm)
+/**
+ * @brief      Return the global *mutable* application context struct. To be
+ *             used when initializing/setting parameters; readers use app().
+ *
+ * @return     Writable pointer to the same file-static app_instance as app()
+ */
+AppContext* appm(void)
+{
+    return &app_instance;
+}
+
+
+/**
+ * @brief   Determine the node/rank layout of world_comm (collective call).
+ * @return  The topology; ranks_per_node assumes equally populated nodes.
+ */
+static RankTopology get_topology(void)
 {
-    static bool initialized = false;
+    int world_rank;
+    RankTopology topo;
+    MPI_Comm node_comm, leader_comm;
+    MPI_Comm_rank(app_instance.mpi.world_comm, &world_rank);
+    MPI_Comm_size(app_instance.mpi.world_comm, &topo.number_of_ranks);
+    MPI_Comm_split_type(app_instance.mpi.world_comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm); // one communicator per shared-memory node
+    MPI_Comm_rank(node_comm, &topo.node_rank);
+    bool is_leader = topo.node_rank == 0;
+    MPI_Comm_split(app_instance.mpi.world_comm, is_leader ? 0 : MPI_UNDEFINED, world_rank, &leader_comm); // non-leaders receive MPI_COMM_NULL
+    if (is_leader) MPI_Comm_size(leader_comm, &topo.number_of_nodes);
+    if (is_leader) MPI_Comm_rank(leader_comm, &topo.node_index);
+    MPI_Bcast(&topo.number_of_nodes, 1, MPI_INT, 0, app_instance.mpi.world_comm); // world rank 0 is the leader of its node
+    MPI_Bcast(&topo.node_index, 1, MPI_INT, 0, node_comm);
+    if (is_leader) MPI_Comm_free(&leader_comm); // temporary communicators are no longer needed; only leaders hold a valid leader_comm
+    MPI_Comm_free(&node_comm);
 
-    if (initialized) fatal_error("Application context already initialized", __func__);
+    topo.ranks_per_node = topo.number_of_ranks/topo.number_of_nodes;
 
-    app_instance.mpi.comm = comm;
-    initialized = true;
+    return topo;
 }
 
 
-void app_context_finalize(void)
+/**
+ * @brief      Split world_comm into one communicator per PTBC instance and chdir into "instance_xx" (no-op for a single chain).
+ */
+static void initialize(void)
 {
-    
+    printf("\033[0;31m[PTBC] Number of chains = %d\033[0m\n", app_instance.ptbc.n_instances);
+
+    int flag;
+    MPI_Initialized(&flag);
+    err(!flag, "Initialize has to be called *after* MPI_Init().");
+
+    MPI_Comm_rank(app_instance.mpi.world_comm, &app_instance.mpi.world_rank);
+
+    // do nothing in case of a single chain; zero chains would divide by zero below
+    err(app_instance.ptbc.n_instances < 1, "Number of PTBC chains = %d must be positive", app_instance.ptbc.n_instances);
+    if (app_instance.ptbc.n_instances == 1) return;
+
+    app_instance.ptbc.active = true;
+    RankTopology topo = get_topology();
+
+    int instance_size = topo.number_of_ranks / app_instance.ptbc.n_instances;
+
+    err(topo.number_of_ranks % app_instance.ptbc.n_instances != 0,
+        "PTBC_NCHAINS = %d must divide total number of ranks = %d",
+        app_instance.ptbc.n_instances, topo.number_of_ranks);
+    err(instance_size % topo.ranks_per_node != 0 && topo.ranks_per_node % instance_size != 0,
+        "The number of processes per node = %d and instance_size = %d: one must be divisible by the other",
+        topo.ranks_per_node, instance_size);
+
+    // We perform a topology-aware splitting of processes into instances using
+    // MPI_Comm_split. Processes within the same node should preferably be
+    // associated to the same instance. We have 3 cases:
+    //
+    // Case 1: If we have one instance per node, there is nothing special to
+    // consider.
+    //
+    // Case 2: If instances span multiple nodes, they should span over the
+    // minimal number of nodes possible. Instances only cover whole nodes. We
+    // have no notion of nodes being "close" to each other, although we group
+    // nodes together with adjacent node indices. Node indices are inherited
+    // from rank numbers. If ranks with adjacent indices are "close", then nodes
+    // with adjacent node indices are "close".
+    //
+    // Case 3: If we have multiple instances per node, ranks in the same
+    // instance should have adjacent world rank numbers, i.e. they should be
+    // "close" to each other cache-wise. No instance covers more than one node.
+    int color;
+    int key = app_instance.mpi.world_rank; // order of the ranks is kept
+    if (instance_size == topo.ranks_per_node) { // case 1: one instance per node
+        color = topo.node_index;
+    } else if (instance_size % topo.ranks_per_node == 0) { // case 2: one instance spans multiple nodes
+        int nodes_per_instance = instance_size / topo.ranks_per_node;
+        int remainder = topo.node_index % nodes_per_instance;
+        color = (topo.node_index - remainder) / nodes_per_instance; // group nodes with adjacent node indices
+    } else { // case 3: multiple instances per node (the divisibility check above leaves no other possibility)
+        int instances_per_node = topo.ranks_per_node / instance_size;
+        int remainder = topo.node_rank % instance_size;
+        int per_node_instance_index = (topo.node_rank - remainder) / instance_size;
+        color = topo.node_index*instances_per_node + per_node_instance_index;
+    }
+
+    MPI_Comm_split(app_instance.mpi.world_comm, color, key, &app_instance.mpi.comm);
+
+    int n;
+    MPI_Comm_size(app_instance.mpi.comm, &n);
+    err(instance_size != n, "Rank topology is not uniform");
+
+    int instance_rank;
+    MPI_Comm_rank(app_instance.mpi.comm, &instance_rank);
+    app_instance.ptbc.instance_id = color;
+
+    printf("\033[0;31m[PTBC] world rank = %d/%d in instance_id = %d/%d, as instance_rank = %d/%d\033[0m\n",
+        app_instance.mpi.world_rank, topo.number_of_ranks,
+        app_instance.ptbc.instance_id, app_instance.ptbc.n_instances,
+        instance_rank, instance_size);
+
+    // Every instance just changes into a subdirectory "instance_xx". Relative
+    // paths work, absolute paths not.
+    struct stat st = {0};
+    char subdir[1024];
+    snprintf(subdir, sizeof subdir, "instance_%.2d", app_instance.ptbc.instance_id);
+
+    // All ranks of an instance may race to create the directory, so a failing
+    // mkdir() (e.g. because another rank was faster) is not an error by
+    // itself; a genuinely missing directory is caught by the chdir() check.
+    if (stat(subdir, &st) == -1)
+        mkdir(subdir, 0700);
+
+    // If chdir() fails the process stays in its current directory, so treat
+    // it as fatal rather than continuing there.
+    err(chdir(subdir) != 0,
+        "Could not change into instance directory \"%s\"", subdir);
 }
diff --git a/src/lib/include/app.h b/src/lib/include/app.h
new file mode 100755
index 000000000..dfc3d9f28
--- /dev/null
+++ b/src/lib/include/app.h
@@ -0,0 +1,108 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2026 Roman Gruber
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Application context: MPI communicators and PTBC configuration
+ *
+ * Author: Roman Gruber
+ *         roman.gruber@unibe.ch
+ *
+ *******************************************************************************/
+
+#ifndef APP_H
+#define APP_H
+
+
+#include <stdbool.h>
+
+
+#if defined(TM_USE_MPI)
+#include <mpi.h>
+#endif
+
+
+#ifndef MPI_VERSION
+typedef int MPI_Comm;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Capacities of the fixed-size defect and instance tables in PTBCContext. */
+#define MAX_N_DEFECTS 10
+#define MAX_N_INSTANCES 10
+
+/* Direction selected by the "Along" key of a PTBC defect block in the input file. */
+typedef enum direction_t {
+   DIRECTION_T = 0,
+   DIRECTION_X = 1,
+   DIRECTION_Y = 2,
+   DIRECTION_Z = 3
+} direction_t;
+
+/* MPI handles of the application. */
+typedef struct {
+    MPI_Comm comm;          // MPI instance communicator
+    MPI_Comm world_comm;    // MPI world communicator
+    int world_rank;         // MPI world rank
+} MPIContext;
+
+/* One defect, filled from a BeginPTBCDefect block of the input file. */
+typedef struct {
+    bool active;        // Whether the defect is active or not
+    int Ld[3];          // Extents of the defect
+    direction_t along;  // Along which dimension
+} PTBCDefect;
+
+/* One instance, filled from a BeginPTBCInstance block; both lists are allocated by the input parser. */
+typedef struct {
+    bool active;          // Whether the instance is active or not
+    int n_coeffs;         // Number of coefficients / defect this instance is associated to
+    PTBCDefect** defects; // List of defects where this instance is associated to
+    double* coefficients; // List of coefficients for the defects
+} PTBCInstance;
+
+/* Complete PTBC state: the tables are indexed by the ids given in the input file. */
+typedef struct {
+    bool active;                                // Whether PTBC mode is active or not
+    int instance_id;                            // Instance ID
+    int n_instances;                            // Number of instances
+    int n_defects;                              // Number of defects
+    PTBCInstance instances[MAX_N_INSTANCES];    // List of all instances
+    PTBCDefect defects[MAX_N_DEFECTS];          // List of all defects
+    void (*initialize)(void);                   // PTBC algorithm initializer
+} PTBCContext;
+
+/* Global application context handed out by app() and appm(). */
+typedef struct {
+    MPIContext mpi;     // MPI context
+    PTBCContext ptbc;   // PTBC context
+} AppContext;
+
+/* app() gives read-only access, appm() mutable access to the same global context. */
+const AppContext* app(void);
+AppContext* appm(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/lib/include/mpi.h b/src/lib/include/mpi.h
index 57c645566..7d816632e 100644
--- a/src/lib/include/mpi.h
+++ b/src/lib/include/mpi.h
@@ -24,62 +24,12 @@
  *
  *******************************************************************************/
 
-#ifndef MY_MPI_WRAPPER_H
-#define MY_MPI_WRAPPER_H
+#ifndef MPI_WRAPPER_H
+#define MPI_WRAPPER_H
 
 
-// include *real* MPI header
-#include_next <mpi.h>
+#include_next <mpi.h> // include *real* MPI header
+#include "app.h"
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/**
- * @brief      MPI context
- *
- * @var        comm MPI communicator
- */
-typedef struct {
-    MPI_Comm comm;
-} MPIContext;
-
-
-/**
- * @brief      The global application context struct
- *
- * @var        mpi MPI context
- */
-typedef struct {
-    MPIContext mpi;
-} AppContext;
-
-
-/**
- * @brief      Return the global application context struct
- *
- * @return     Global application context struct
- */
-const AppContext* app(void);
-
-
-/**
- * @brief      Initialize application context
- *
- * @param[in]  comm  The MPI communicator to use throughout the application
- */
-void app_context_init(const MPI_Comm comm);
-
-
-/**
- * @brief      Finalize application context
- */
-void app_context_finalize(void);
-
-
-#ifdef __cplusplus
-}
-#endif
 
 #endif

From d97bafd99a38d9771bc06c60bf0f40b346b57a0f Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Mon, 2 Mar 2026 11:36:14 +0100
Subject: [PATCH 18/19] added PTBC read in logic

---
 src/lib/init/init_parallel.c |   2 +
 src/lib/read_input.l         | 227 ++++++++++++++++++++++++++++++++++-
 2 files changed, 227 insertions(+), 2 deletions(-)

diff --git a/src/lib/init/init_parallel.c b/src/lib/init/init_parallel.c
index 9dfdbb0c5..7d830e8bb 100644
--- a/src/lib/init/init_parallel.c
+++ b/src/lib/init/init_parallel.c
@@ -85,6 +85,8 @@ void init_parallel_and_read_input(int argc, char *argv[], const char input_filen
     exit(-1);
   }
 
+  app()->ptbc.initialize();
+
 #ifdef TM_USE_OMP
   init_openmp();
 #endif
diff --git a/src/lib/read_input.l b/src/lib/read_input.l
index 5eb542f87..3a917bd46 100644
--- a/src/lib/read_input.l
+++ b/src/lib/read_input.l
@@ -52,6 +52,7 @@ EQL {SPC}*={SPC}*
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <stdarg.h>
 #define INIT_GLOBALS
 #include "global.h"
 #undef INIT_GLOBALS
@@ -61,12 +62,14 @@ EQL {SPC}*={SPC}*
 #include "solver/solver_types.h"
 #include "meas/measurements.h"
 #include "integrator.h"
+#include "include/app.h"
 #include "operator.h"
 #include "phmc.h"
 #include <io/params.h>
 #include "qphix_types.h"
 #include "quda_types.h"
 #include "misc_types.h"
+#include "alloc.h"
 
 #include <ctype.h>
 
@@ -87,6 +90,28 @@ static inline void rmQuotes(char *str){
   *strsave='\0';
 }
 
+/* Count the comma-separated tokens in 'input', i.e. the number of commas plus one. */
+static inline int strlist_count_tokens(char const * const input)
+{
+  int n = 1;
+  for (char const *c = input; *c != '\0'; ++c) /* single pass, no strlen() per iteration */
+    if (*c == ',') n++;
+  return n;
+}
+/* Raise a lexer fatal error via yy_fatal_error() with a printf-style message if 'test' is true. */
+static inline void fail(const bool test, const char* fmt, ...)
+{
+  char msg[1024];
+  va_list ap;
+
+  if (!test) return;
+  va_start(ap, fmt);
+  vsnprintf(msg, sizeof msg, fmt, ap);
+  va_end(ap);
+  yy_fatal_error(msg);
+}
+
+
 /* tokenize the comma-delimited list 'input' of the form
     'list = token1, token2, ...'
    and return the first token, which is the name of the list
@@ -472,7 +497,61 @@ static inline double fltlist_next_token(int * const list_end){
     }
     free(input_copy);
   }
-  
+
+  static inline void parse_int_par_array(char const * const input, int * par_array, const int max_size) {
+    /* Store the values of the list 'name = v1, v2, ...' in par_array. Values
+       come from fltlist_next_token() and are truncated to int; more than
+       max_size values trigger yy_fatal_error(). */
+    char list_name[100];
+    char errbuf[ERR_MSG_LEN];
+    char * scratch = (char*)NULL;
+    int done = 0;
+    int idx = 0;
+
+    strlist_tokenize(input, &scratch, list_name, 100);
+    for (int value = (int) fltlist_next_token(&done); done != 1;
+         value = (int) fltlist_next_token(&done)) {
+      if( idx >= max_size ){
+        snprintf(errbuf, ERR_MSG_LEN, "Exceeded maximum number of elements (%d) parsing %s!\n", max_size, list_name);
+        yy_fatal_error(errbuf);
+      }
+      par_array[idx] = value;
+      if(myverbose){
+        printf("  %s, element %d set to %d line %d\n", list_name,
+                idx, par_array[idx], line_of_file);
+      }
+      idx++;
+    }
+    free(scratch);
+  }
+
+  static inline void parse_dbl_par_array(char const * const input, double * par_array, const int max_size) {
+    /* Store the values of the list 'name = v1, v2, ...' in par_array. Values
+       come from fltlist_next_token(); more than max_size values trigger
+       yy_fatal_error(). */
+    char list_name[100];
+    char errbuf[ERR_MSG_LEN];
+    char * scratch = (char*)NULL;
+    int done = 0;
+    int idx = 0;
+
+    strlist_tokenize(input, &scratch, list_name, 100);
+    for (double value = fltlist_next_token(&done); done != 1;
+         value = fltlist_next_token(&done)) {
+      if( idx >= max_size ){
+        snprintf(errbuf, ERR_MSG_LEN, "Exceeded maximum number of elements (%d) parsing %s!\n", max_size, list_name);
+        yy_fatal_error(errbuf);
+      }
+      par_array[idx] = value;
+      if(myverbose){
+        printf("  %s, element %d set to %e line %d\n", list_name,
+                idx, par_array[idx], line_of_file);
+      }
+      idx++;
+    }
+    free(scratch);
+  }
+
 %}
 
 %option never-interactive
@@ -556,6 +635,11 @@ static inline double fltlist_next_token(int * const list_end){
 %x INITINTEGRATOR
 %x INTEGRATOR
 
+%x INITPTBC
+%x PTBC
+%x PTBCDEFECT
+%x PTBCINSTANCE
+
 %x DEFLATION
 %x INITDEFLATION
 
@@ -712,6 +796,8 @@ static inline double fltlist_next_token(int * const list_end){
 ^BeginInt                          BEGIN(INITINTEGRATOR);
 ^BeginOperator{SPC}+               BEGIN(INITOPERATOR);
 
+^BeginPTBC                         BEGIN(INITPTBC);
+
 ^BeginExternalInverter{SPC}+       BEGIN(INITEXTERNALINVERTER);
 
 ^BeginTuneMGParams{SPC}+           BEGIN(TUNEMGPARAMS);
@@ -3071,6 +3157,143 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
+<INITPTBC>{SPC}* {
+  /* Reset the PTBC counters before the content of the block is parsed. */
+  PTBCContext* ptbc = &appm()->ptbc;
+  ptbc->active = false;
+  ptbc->n_instances = ptbc->n_defects = ptbc->instance_id = 0;
+  if(myverbose) printf("Initialising PTBC line %d\n", line_of_file);
+  BEGIN(PTBC);
+}
+<PTBC>{
+  {SPC}*BeginPTBCDefect{SPC}+{DIGIT}+ { /* open defect block <id>: mark that slot of the defect table active */
+    sscanf(yytext, " %[a-zA-Z] %d", name, &b);
+    fail(b<0 || b>=MAX_N_DEFECTS, "PTBC Defect id = %d out of bounds!", b); /* valid slots are 0..MAX_N_DEFECTS-1 */
+    appm()->ptbc.defects[b].active = true;
+    appm()->ptbc.n_defects++;
+    fail(appm()->ptbc.n_defects>MAX_N_DEFECTS, "To many PTBC defects! Limit is %d", MAX_N_DEFECTS); /* exactly MAX_N_DEFECTS defects fit */
+    if(myverbose) printf("  Initialising PTBC defect (index = %d) in line %d\n", b, line_of_file);
+    BEGIN(PTBCDEFECT);
+  }
+  <PTBCDEFECT>{
+    {SPC}*Extents{EQL}{STRLIST} {
+      /* Read the three extents Ld[0..2] of the defect opened by BeginPTBCDefect. */
+      PTBCDefect* cd = appm()->ptbc.defects + b;
+      int n_parameters = strlist_count_tokens(yytext);
+      fail(n_parameters != 3, "Extents must provide exactly 3 numbers, not %d!\n", n_parameters);
+      /* sscanf returns the number of converted fields: the key plus three integers */
+      int n_read = sscanf(yytext, " %[a-zA-Z] = %d, %d, %d", name, cd->Ld, cd->Ld+1, cd->Ld+2);
+      fail(n_read != 4, "Extents must be a list of 3 integers\n");
+      if(myverbose) printf("    Ld = [%d, %d, %d] line %d\n", cd->Ld[0], cd->Ld[1], cd->Ld[2], line_of_file);
+      BEGIN(PTBCDEFECT);
+    }
+    {SPC}*Along{EQL}T { /* Along = T|X|Y|Z stores the matching direction_t in the current defect (index b) */
+      appm()->ptbc.defects[b].along = DIRECTION_T;
+      if(myverbose) printf("    PTBC defect along T direction line %d\n", line_of_file);
+      BEGIN(PTBCDEFECT);
+    }
+    {SPC}*Along{EQL}X {
+      appm()->ptbc.defects[b].along = DIRECTION_X;
+      if(myverbose) printf("    PTBC defect along X direction line %d\n", line_of_file);
+      BEGIN(PTBCDEFECT);
+    }
+    {SPC}*Along{EQL}Y {
+      appm()->ptbc.defects[b].along = DIRECTION_Y;
+      if(myverbose) printf("    PTBC defect along Y direction line %d\n", line_of_file);
+      BEGIN(PTBCDEFECT);
+    }
+    {SPC}*Along{EQL}Z {
+      appm()->ptbc.defects[b].along = DIRECTION_Z;
+      if(myverbose) printf("    PTBC defect along Z direction line %d\n", line_of_file);
+      BEGIN(PTBCDEFECT);
+    }
+    {SPC}*EndPTBCDefect{SPC}* { /* close the defect block and return to the enclosing PTBC scope */
+      if(myverbose) printf("  PTBC defect parsed line %d\n", line_of_file);
+      BEGIN(PTBC);
+    }
+  }
+  {SPC}*BeginPTBCInstance{SPC}+{DIGIT}+ { /* open instance block <id>: mark the slot active, list length not yet known (-1) */
+    sscanf(yytext, " %[a-zA-Z] %d", name, &b);
+    fail(b<0 || b>=MAX_N_INSTANCES, "PTBC Instance id = %d out of bounds!", b); /* valid slots are 0..MAX_N_INSTANCES-1 */
+    appm()->ptbc.instances[b].active = true;
+    appm()->ptbc.instances[b].n_coeffs = -1;
+    appm()->ptbc.n_instances++;
+    fail(appm()->ptbc.n_instances>MAX_N_INSTANCES, "To many PTBC instances! Limit is %d", MAX_N_INSTANCES); /* exactly MAX_N_INSTANCES instances fit */
+    if(myverbose) printf("  Initialising PTBC instance (index = %d) in line %d\n", b, line_of_file);
+    BEGIN(PTBCINSTANCE);
+  }
+  <PTBCINSTANCE>{
+    {SPC}*Coefficients{EQL}{STRLIST} {
+      /* Read one coefficient in [0,1] per defect of the current instance (index b). */
+      PTBCInstance* ci = appm()->ptbc.instances + b;
+
+      int n_parameters = strlist_count_tokens(yytext);
+      if (ci->n_coeffs == -1) ci->n_coeffs = n_parameters;
+      fail(ci->n_coeffs != n_parameters, "Number of PTBC coefficients must be equal to number of defects!\n");
+
+      /* (Re)allocate for exactly this list so a repeated key cannot overrun an older, smaller buffer. */
+      free(ci->coefficients);
+      ci->coefficients = safe_malloc(n_parameters*sizeof(*ci->coefficients));
+      parse_dbl_par_array(yytext, ci->coefficients, n_parameters);
+
+      for (int i = 0; i < n_parameters; ++i) {
+        fail(ci->coefficients[i]<0 || ci->coefficients[i] > 1,
+          "Coefficient %d with value %e is out of bounds", i, ci->coefficients[i]);
+      }
+
+      for (int i = 0; i < n_parameters; ++i) {
+        if(myverbose) printf("    Coefficients[%d] = %f line %d\n", i, ci->coefficients[i], line_of_file);
+      }
+
+      BEGIN(PTBCINSTANCE);
+    }
+    {SPC}*Defects{EQL}{STRLIST} {
+      /* Resolve the listed defect ids into pointers into the global defect table. */
+      PTBCInstance* ci = appm()->ptbc.instances + b;
+
+      int n_parameters = strlist_count_tokens(yytext);
+      if (ci->n_coeffs == -1) ci->n_coeffs = n_parameters;
+      fail(ci->n_coeffs != n_parameters,
+        "Number of PTBC defects must be equal to number of coefficients!\n");
+
+      /* (Re)allocate for exactly this list so a repeated key cannot overrun an older, smaller buffer. */
+      free(ci->defects);
+      ci->defects = safe_malloc(n_parameters*sizeof(*ci->defects));
+
+      int* ids = safe_malloc(n_parameters*sizeof(*ids));
+      parse_int_par_array(yytext, ids, n_parameters);
+
+      for (int i = 0; i < n_parameters; ++i) {
+        if(myverbose) printf("    Defects = %d line %d\n", ids[i], line_of_file);
+        /* valid ids index defects[0..MAX_N_DEFECTS-1] */
+        fail(ids[i]<0 || ids[i]>=MAX_N_DEFECTS, "PTBC Defect id = %d out of bounds!", ids[i]);
+        ci->defects[i] = &(appm()->ptbc.defects[ids[i]]);
+      }
+
+      free(ids);
+      BEGIN(PTBCINSTANCE);
+    }
+    {SPC}*EndPTBCInstance{SPC}* { /* close the instance block and return to the enclosing PTBC scope */
+      if(myverbose) printf("  PTBC instance parsed line %d\n", line_of_file);
+      BEGIN(PTBC);
+    }
+  }
+  EndPTBC{SPC}* {
+    /* Validate the complete PTBC block; a block without any instance must be rejected as well. */
+    fail(app()->ptbc.n_instances <= 1, "Number of PTBC chains must be larger than 1!\n");
+    for (int i = 0; i < app()->ptbc.n_instances; ++i) {
+      const PTBCInstance* ci = app()->ptbc.instances + i;
+      fail(ci->active == false, "PTBC instance %d is not active!\n", i);
+      fail(ci->n_coeffs > 0 && ci->defects == NULL, "PTBC instance %d does not refer to a valid defect!\n", i); /* defects stays NULL when only Coefficients was given */
+      for (int j = 0; j < ci->n_coeffs; ++j) {
+        fail(ci->defects[j]->active == false, "PTBC instance %d does not refer to a valid defect!\n", i);
+      }
+    }
+    if(myverbose) printf("PTBC parsed line %d\n\n", line_of_file);
+    BEGIN(0);
+  }
+}
+
 <SOURCETYPE>{
   Point {
     SourceInfo.type = SRC_TYPE_POINT;
@@ -3729,7 +3952,7 @@ static inline double fltlist_next_token(int * const list_end){
   BEGIN(comment_caller);
 }
 
-<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID,INITEXTERNALINVERTER,QUDAINVERTER,QPHIXINVERTER,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL,TUNEQUDAMGPARAMS>{SPC}*\n   {
+<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,PTBC,INITPTBC,PTBCDEFECT,PTBCINSTANCE,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID,INITEXTERNALINVERTER,QUDAINVERTER,QPHIXINVERTER,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL,TUNEQUDAMGPARAMS>{SPC}*\n   {
   line_of_file++;
 }
 <*>{SPC}*\n                       {

From 90340d4d1f720f53eba8c01708755e7999fa7b29 Mon Sep 17 00:00:00 2001
From: Roman Gruber <roman.gruber@unibe.ch>
Date: Mon, 2 Mar 2026 11:36:29 +0100
Subject: [PATCH 19/19] added PTBC input file example

---
 ...-ptbc-hmc-rgmixedcg-tmcloverdetratio.input | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100755 doc/sample-input/sample-ptbc-hmc-rgmixedcg-tmcloverdetratio.input

diff --git a/doc/sample-input/sample-ptbc-hmc-rgmixedcg-tmcloverdetratio.input b/doc/sample-input/sample-ptbc-hmc-rgmixedcg-tmcloverdetratio.input
new file mode 100755
index 000000000..96e5e94ca
--- /dev/null
+++ b/doc/sample-input/sample-ptbc-hmc-rgmixedcg-tmcloverdetratio.input
@@ -0,0 +1,117 @@
+# this is a sample input file for an HMC run with PTBC, using cloverdet +
+# cloverdetratio monomials and rgmixedcg as a solver
+
+L=8
+T=8
+Measurements = 20
+Startcondition = hot
+2KappaMu = 0.01
+CSW = 1.00
+kappa = 0.138
+NSave = 500000
+ThetaT = 1.0
+UseEvenOdd = yes
+ReversibilityCheck = yes
+ReversibilityCheckIntervall = 4
+InitialStoreCounter = 0
+DebugLevel = 1
+
+BeginPTBC
+
+  BeginPTBCDefect 0
+    Extents = 2, 2, 2
+    Along = Y
+  EndPTBCDefect
+
+  BeginPTBCInstance 0
+    Coefficients = 1.0
+    Defects = 0
+  EndPTBCInstance
+
+  BeginPTBCInstance 1
+    Coefficients = 0.0
+    Defects = 0
+  EndPTBCInstance
+
+  BeginPTBCInstance 2
+    Coefficients = 0.75
+    Defects = 0
+  EndPTBCInstance
+
+  BeginPTBCInstance 3
+    Coefficients = 0.5
+    Defects = 0
+  EndPTBCInstance
+
+EndPTBC
+
+
+
+# since this is a test file, we employ the reproducible random numbers mode
+ReproduceRandomNumbers = yes
+Seed = 127782
+
+BeginMeasurement CORRELATORS
+  Frequency = 2
+EndMeasurement
+
+BeginMonomial GAUGE
+  Type = Wilson
+  beta = 5.60
+  Timescale = 0
+EndMonomial
+
+BeginMonomial CLOVERDET
+  Timescale = 1
+  2KappaMu = 0.01
+  CSW = 1.00
+  # numerator shift
+  rho = 0.1
+  kappa = 0.138
+  AcceptancePrecision =  1.e-20
+  ForcePrecision = 1.e-14
+  Name = cloverdet
+  solver = rgmixedcg
+  usesloppyprecision = single
+EndMonomial
+
+BeginMonomial CLOVERDETRATIO
+  Timescale = 2
+  2KappaMu = 0.01
+  # numerator shift
+  rho = 0.0
+  # denominator shift, should match CLOVERDET shift
+  rho2 = 0.1
+  CSW = 1.00
+  kappa = 0.138
+  AcceptancePrecision =  1.e-20
+  ForcePrecision = 1.e-16
+  Name = cloverdetratio
+  solver = rgmixedcg
+  usesloppyprecision = single
+EndMonomial
+
+BeginIntegrator 
+  Type0 = 2MNFG
+  Type1 = 2MNFG
+  Type2 = 2MNFG
+  IntegrationSteps0 = 1
+  IntegrationSteps1 = 1
+  IntegrationSteps2 = 4
+  tau = 1.00
+  Lambda0 = 0.16666667
+  Lambda1 = 0.16666667
+  Lambda2 = 0.16666667
+  NumberOfTimescales = 3
+EndIntegrator
+
+BeginOperator CLOVER
+  2KappaMu = 0.01
+  CSW = 1.00
+  kappa = 0.138
+  SolverPrecision = 1e-16
+  MaxSolverIterations = 1000
+  useevenodd = yes
+  solver = rgmixedcg
+  usesloppyprecision = single
+EndOperator