diff --git a/.github/required-checks.yml b/.github/required-checks.yml index 0180c6da623..e995dce4a4f 100644 --- a/.github/required-checks.yml +++ b/.github/required-checks.yml @@ -8,10 +8,10 @@ # An empty list (or missing key) for an event type disables enforcement # for that event — useful for bootstrapping. merge_group: &full_matrix -- Build and test (arm64, gcc11, openmpi) / Dev environment (Debug) -- Build and test (arm64, gcc11, openmpi) / Dev environment (Python) -- Build and test (arm64, gcc12, openmpi) / Dev environment (Debug) -- Build and test (arm64, gcc12, openmpi) / Dev environment (Python) +- Build and test (amd64, llvm, openmpi) / Dev environment (Debug) +- Build and test (amd64, llvm, openmpi) / Dev environment (Python) +- Build and test (arm64, llvm, openmpi) / Dev environment (Debug) +- Build and test (arm64, llvm, openmpi) / Dev environment (Python) - Create CUDA Quantum installer (amd64, 12.6) / Build CUDA Quantum assets - Create CUDA Quantum installer (amd64, 12.6) / Minimal OpenMPI installation - Create CUDA Quantum installer (amd64, 12.6) / Validate installer (debian:12) @@ -44,13 +44,40 @@ merge_group: &full_matrix - Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (redhat/ubi8:8.10, --user) - Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (redhat/ubi8:8.10) push: -- Build and test (amd64, clang16, openmpi) / Dev environment (Debug) -- Build and test (amd64, clang16, openmpi) / Dev environment (Python) -- Build and test (amd64, gcc11, openmpi) / Dev environment (Debug) -- Build and test (amd64, gcc11, openmpi) / Dev environment (Python) -- Build and test (amd64, gcc12, openmpi) / Dev environment (Debug) -- Build and test (amd64, gcc12, openmpi) / Dev environment (Python) -- Build and test (arm64, clang16, openmpi) / Dev environment (Debug) -- Build and test (arm64, clang16, openmpi) / Dev environment (Python) +- Build and test (amd64, llvm, openmpi) / Dev environment (Debug) +- Build and test (amd64, llvm, openmpi) / Dev 
environment (Python) +- Build and test (arm64, llvm, openmpi) / Dev environment (Debug) +- Build and test (arm64, llvm, openmpi) / Dev environment (Python) +- Create CUDA Quantum installer (amd64, 12.6) / Build CUDA Quantum assets +- Create CUDA Quantum installer (amd64, 12.6) / Minimal OpenMPI installation +- Create CUDA Quantum installer (amd64, 12.6) / Validate installer (debian:12) +- Create CUDA Quantum installer (amd64, 12.6) / Validate installer (fedora:42) +- Create CUDA Quantum installer (amd64, 12.6) / Validate installer (opensuse/leap:15.5) +- Create CUDA Quantum installer (amd64, 12.6) / Validate installer (redhat/ubi9:9.6) +- Create CUDA Quantum installer (amd64, 12.6) / Validate installer (ubuntu:22.04) +- Create CUDA Quantum installer (arm64, 12.6) / Build CUDA Quantum assets +- Create CUDA Quantum installer (arm64, 12.6) / Minimal OpenMPI installation +- Create CUDA Quantum installer (arm64, 12.6) / Validate installer (redhat/ubi9:9.6) +- Create CUDA Quantum installer (arm64, 12.6) / Validate installer (ubuntu:22.04) +- Create Docker images (amd64) / Documentation +- Create Docker images (amd64) / Validation +- Create Docker images (arm64) / Validation +- Create Python metapackages / Build Python metapackages +- Create Python metapackages / Test Python metapackages (3.11) +- Create Python metapackages / Test Python metapackages (3.13) +- Create Python metapackages / Test Python metapackages (12.6, 3.11) +- Create Python metapackages / Test Python metapackages (12.6, 3.13) +- Create Python wheels (amd64, 3.11, 12.6) / Validate wheel (debian:12, --user) +- Create Python wheels (amd64, 3.11, 12.6) / Validate wheel (debian:12) +- Create Python wheels (amd64, 3.11, 12.6) / Validate wheel (fedora:42, --user) +- Create Python wheels (amd64, 3.11, 12.6) / Validate wheel (fedora:42) +- Create Python wheels (amd64, 3.11, 12.6) / Validate wheel (redhat/ubi8:8.10, --user) +- Create Python wheels (amd64, 3.11, 12.6) / Validate wheel (redhat/ubi8:8.10) +- Create 
Python wheels (arm64, 3.11, 12.6) / Validate wheel (debian:12, --user) +- Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (debian:12) +- Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (fedora:42, --user) +- Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (fedora:42) +- Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (redhat/ubi8:8.10, --user) +- Create Python wheels (arm64, 3.11, 12.6) / Validate wheel (redhat/ubi8:8.10) workflow_dispatch: *full_matrix schedule: *full_matrix diff --git a/.github/workflows/build_package_sources.yml b/.github/workflows/build_package_sources.yml index caccffabb3e..f2112652b20 100644 --- a/.github/workflows/build_package_sources.yml +++ b/.github/workflows/build_package_sources.yml @@ -67,7 +67,7 @@ jobs: else # cudaqx: devcontainer base, cudaqx target cu_tag=$(echo "${{ matrix.cuda }}" | tr -d .) - echo "base_image=ghcr.io/nvidia/cuda-quantum-devcontainer:amd64-cu${{ matrix.cuda }}-gcc11-main" | tee -a $GITHUB_OUTPUT + echo "base_image=ghcr.io/nvidia/cuda-quantum-devcontainer:amd64-cu${{ matrix.cuda }}-llvm-main" | tee -a $GITHUB_OUTPUT echo "target_image=ghcr.io/nvidia/cudaqx:cu${cuda_major}-latest" | tee -a $GITHUB_OUTPUT fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de5055fdbef..9084a6b7194 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,6 +42,7 @@ jobs: cache_base: ${{ steps.pr_info.outputs.pr_base }} llvm_commit: ${{ steps.repo_info.outputs.llvm_commit }} pybind11_commit: ${{ steps.repo_info.outputs.pybind11_commit }} + nanobind_commit: ${{ steps.repo_info.outputs.nanobind_commit }} platform_config: ${{ steps.config.outputs.platforms }} build_test_matrix: ${{ steps.config.outputs.build_test_matrix }} @@ -65,19 +66,13 @@ jobs: # (expensive, redundant with amd64/gcc). merge_group/dispatch run all. 
if [ "${{ github.event_name }}" = "push" ]; then build_test_matrix='{"include":[ - {"platform":"amd64","toolchain":"clang16","mpi":"openmpi"}, - {"platform":"amd64","toolchain":"gcc11","mpi":"openmpi"}, - {"platform":"amd64","toolchain":"gcc12","mpi":"openmpi"}, - {"platform":"arm64","toolchain":"clang16","mpi":"openmpi"} + {"platform":"amd64","toolchain":"llvm","mpi":"openmpi"}, + {"platform":"arm64","toolchain":"llvm","mpi":"openmpi"} ]}' else build_test_matrix='{"include":[ - {"platform":"amd64","toolchain":"clang16","mpi":"openmpi"}, - {"platform":"amd64","toolchain":"gcc11","mpi":"openmpi"}, - {"platform":"amd64","toolchain":"gcc12","mpi":"openmpi"}, - {"platform":"arm64","toolchain":"clang16","mpi":"openmpi"}, - {"platform":"arm64","toolchain":"gcc11","mpi":"openmpi"}, - {"platform":"arm64","toolchain":"gcc12","mpi":"openmpi"} + {"platform":"amd64","toolchain":"llvm","mpi":"openmpi"}, + {"platform":"arm64","toolchain":"llvm","mpi":"openmpi"} ]}' fi echo "build_test_matrix=$(echo "$build_test_matrix" | jq -c .)" >> $GITHUB_OUTPUT @@ -114,6 +109,7 @@ jobs: run: | echo "llvm_commit=$(git rev-parse @:./tpls/llvm)" >> $GITHUB_OUTPUT echo "pybind11_commit=$(git rev-parse @:./tpls/pybind11)" >> $GITHUB_OUTPUT + echo "nanobind_commit=$(git rev-parse @:./tpls/nanobind)" >> $GITHUB_OUTPUT devdeps: name: Load dependencies @@ -121,7 +117,6 @@ jobs: strategy: matrix: platform: [amd64, arm64] - toolchain: [clang16, gcc11, gcc12] fail-fast: false uses: ./.github/workflows/dev_environment.yml secrets: @@ -130,14 +125,12 @@ jobs: with: platforms: linux/${{ matrix.platform }} dockerfile: build/devdeps.Dockerfile - build_config_id: ${{ matrix.toolchain }} - build_args: | - toolchain=${{ matrix.toolchain }} + build_config_id: llvm registry_cache_from: ${{ inputs.cache_base || needs.metadata.outputs.cache_base }} checkout_submodules: true environment: ghcr-ci # needed only for the cloudposse GitHub action - matrix_key: ${{ matrix.platform }}-${{ matrix.toolchain }} + matrix_key: 
${{ matrix.platform }}-llvm matrix_step_name: dev_environment_devdeps wheeldeps: @@ -155,14 +148,15 @@ jobs: with: platforms: linux/${{ matrix.platform }} dockerfile: build/devdeps.manylinux.Dockerfile - build_config_id: cu${{ matrix.cuda_version }}-gcc11 + build_config_id: cu${{ matrix.cuda_version }}-gcc12 build_args: | base_image=ghcr.io/nvidia/pypa/manylinux_2_28${{ (matrix.platform == 'arm64' && '_aarch64') || (matrix.platform == 'amd64' && '_x86_64') || '' }}:latest cuda_version=${{ matrix.cuda_version }} - toolchain=gcc11 + toolchain=gcc12 distro=rhel8 llvm_commit=${{ needs.metadata.outputs.llvm_commit }} pybind11_commit=${{ needs.metadata.outputs.pybind11_commit }} + nanobind_commit=${{ needs.metadata.outputs.nanobind_commit }} registry_cache_from: ${{ inputs.cache_base || needs.metadata.outputs.cache_base }} environment: ghcr-ci # needed only for the cloudposse GitHub action @@ -259,16 +253,15 @@ jobs: strategy: matrix: platform: [amd64] - toolchain: [clang16] fail-fast: false uses: ./.github/workflows/generate_cc.yml secrets: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} with: platform: linux/${{ matrix.platform }} - devdeps_image: ${{ fromJson(needs.config_devdeps.outputs.json).image_hash[format('{0}-{1}', matrix.platform, matrix.toolchain)] }} - devdeps_cache: ${{ fromJson(needs.config_devdeps.outputs.json).cache_key[format('{0}-{1}', matrix.platform, matrix.toolchain)] }} - devdeps_archive: ${{ fromJson(needs.config_devdeps.outputs.json).tar_archive[format('{0}-{1}', matrix.platform, matrix.toolchain)] }} + devdeps_image: ${{ fromJson(needs.config_devdeps.outputs.json).image_hash[format('{0}-llvm', matrix.platform)] }} + devdeps_cache: ${{ fromJson(needs.config_devdeps.outputs.json).cache_key[format('{0}-llvm', matrix.platform)] }} + devdeps_archive: ${{ fromJson(needs.config_devdeps.outputs.json).tar_archive[format('{0}-llvm', matrix.platform)] }} export_environment: ${{ github.event_name == 'workflow_dispatch' && inputs.export_environment }} # Docker 
images are packaging, not correctness — only built on merge_group/dispatch. @@ -286,9 +279,9 @@ jobs: DOCKERHUB_READONLY_TOKEN: ${{ secrets.DOCKERHUB_READONLY_TOKEN }} with: platforms: linux/${{ matrix.platform }} - devdeps_image: ${{ fromJson(needs.config_devdeps.outputs.json).image_hash[format('{0}-gcc11', matrix.platform)] }} - devdeps_cache: ${{ fromJson(needs.config_devdeps.outputs.json).cache_key[format('{0}-gcc11', matrix.platform)] }} - devdeps_archive: ${{ fromJson(needs.config_devdeps.outputs.json).tar_archive[format('{0}-gcc11', matrix.platform)] }} + devdeps_image: ${{ fromJson(needs.config_devdeps.outputs.json).image_hash[format('{0}-llvm', matrix.platform)] }} + devdeps_cache: ${{ fromJson(needs.config_devdeps.outputs.json).cache_key[format('{0}-llvm', matrix.platform)] }} + devdeps_archive: ${{ fromJson(needs.config_devdeps.outputs.json).tar_archive[format('{0}-llvm', matrix.platform)] }} environment: ghcr-ci python_wheels: diff --git a/.github/workflows/clean_caches.yml b/.github/workflows/clean_caches.yml index 210fb0c68b7..083a035abf6 100644 --- a/.github/workflows/clean_caches.yml +++ b/.github/workflows/clean_caches.yml @@ -125,14 +125,14 @@ jobs: - name: Delete build caches for MPI asset (ARM64) uses: actions/delete-package-versions@v5 with: - package-name: buildcache-cuda-quantum-assets-openmpi-gcc11-arm64 + package-name: buildcache-cuda-quantum-assets-openmpi-llvm-arm64 package-type: 'container' min-versions-to-keep: 1 # the used action does not support 0 here - name: Delete build caches for MPI asset (AMD64) uses: actions/delete-package-versions@v5 with: - package-name: buildcache-cuda-quantum-assets-openmpi-gcc11-amd64 + package-name: buildcache-cuda-quantum-assets-openmpi-llvm-amd64 package-type: 'container' min-versions-to-keep: 1 # the used action does not support 0 here diff --git a/.github/workflows/create_cache_command.yml b/.github/workflows/create_cache_command.yml index 06e0ae6aeee..c19d059e11e 100644 --- 
a/.github/workflows/create_cache_command.yml +++ b/.github/workflows/create_cache_command.yml @@ -56,6 +56,7 @@ jobs: outputs: llvm_commit: ${{ steps.repo_info.outputs.llvm_commit }} pybind11_commit: ${{ steps.repo_info.outputs.pybind11_commit }} + nanobind_commit: ${{ steps.repo_info.outputs.nanobind_commit }} platform_config: ${{ steps.config.outputs.platforms }} steps: @@ -80,6 +81,7 @@ jobs: run: | echo "llvm_commit=$(git rev-parse @:./tpls/llvm)" >> $GITHUB_OUTPUT echo "pybind11_commit=$(git rev-parse @:./tpls/pybind11)" >> $GITHUB_OUTPUT + echo "nanobind_commit=$(git rev-parse @:./tpls/nanobind)" >> $GITHUB_OUTPUT devdeps_caches: name: Cache dev dependencies @@ -87,7 +89,6 @@ jobs: strategy: matrix: platform: [amd64, arm64] - toolchain: [clang16, gcc11, gcc12] fail-fast: false uses: ./.github/workflows/dev_environment.yml secrets: @@ -96,15 +97,13 @@ jobs: with: platforms: linux/${{ matrix.platform }} dockerfile: build/devdeps.Dockerfile - build_config_id: ${{ matrix.toolchain }} - build_args: | - toolchain=${{ matrix.toolchain }} + build_config_id: llvm create_local_cache: true registry_cache_from: ${{ needs.pr_info.outputs.target_branch }} pull_request_number: ${{ needs.pr_info.outputs.pull_request_number }} checkout_submodules: true # needed only for the cloudposse GitHub action - matrix_key: ${{ matrix.platform }}-${{ matrix.toolchain }} + matrix_key: ${{ matrix.platform }}-llvm wheeldeps_caches: name: Cache wheel dependencies @@ -121,14 +120,15 @@ jobs: with: platforms: linux/${{ matrix.platform }} dockerfile: build/devdeps.manylinux.Dockerfile - build_config_id: cu${{ matrix.cuda_version }}-gcc11 + build_config_id: cu${{ matrix.cuda_version }}-gcc12 build_args: | base_image=ghcr.io/nvidia/pypa/manylinux_2_28${{ (matrix.platform == 'arm64' && '_aarch64') || (matrix.platform == 'amd64' && '_x86_64') || '' }}:latest cuda_version=${{ matrix.cuda_version }} - toolchain=gcc11 + toolchain=gcc12 distro=rhel8 llvm_commit=${{ needs.metadata.outputs.llvm_commit }} 
pybind11_commit=${{ needs.metadata.outputs.pybind11_commit }} + nanobind_commit=${{ needs.metadata.outputs.nanobind_commit }} create_local_cache: true registry_cache_from: ${{ needs.pr_info.outputs.target_branch }} pull_request_number: ${{ needs.pr_info.outputs.pull_request_number }} diff --git a/.github/workflows/deployments.yml b/.github/workflows/deployments.yml index 54b4ccad6b1..25ff93aeefe 100644 --- a/.github/workflows/deployments.yml +++ b/.github/workflows/deployments.yml @@ -77,6 +77,7 @@ jobs: pull_request_commit: ${{ steps.pr_info.outputs.merge_commit }} llvm_commit: ${{ steps.build_config.outputs.llvm_commit }} pybind11_commit: ${{ steps.build_config.outputs.pybind11_commit }} + nanobind_commit: ${{ steps.build_config.outputs.nanobind_commit }} cache_base: ${{ steps.build_info.outputs.cache_base }} cache_target: ${{ steps.build_info.outputs.cache_target }} multi_platform: ${{ steps.build_info.outputs.multi_platform }} @@ -188,6 +189,7 @@ jobs: run: | echo "llvm_commit=$(git rev-parse @:./tpls/llvm)" >> $GITHUB_OUTPUT echo "pybind11_commit=$(git rev-parse @:./tpls/pybind11)" >> $GITHUB_OUTPUT + echo "nanobind_commit=$(git rev-parse @:./tpls/nanobind)" >> $GITHUB_OUTPUT if ${{ github.event_name != 'workflow_run' || steps.pr_info.outputs.pr_number != '' }}; then echo "build_dependencies=true" >> $GITHUB_OUTPUT @@ -203,7 +205,7 @@ jobs: strategy: matrix: platform: ${{ fromJson(needs.metadata.outputs.platforms).ids }} - toolchain: [clang16, gcc11, gcc12] + toolchain: [llvm] fail-fast: false uses: ./.github/workflows/dev_environment.yml secrets: @@ -242,14 +244,15 @@ jobs: with: platforms: ${{ fromJson(needs.metadata.outputs.platforms)[format('{0}', matrix.platform)].docker_flag }} dockerfile: build/devdeps.manylinux.Dockerfile - build_config_id: cu${{ matrix.cuda_version }}-gcc11 + build_config_id: cu${{ matrix.cuda_version }}-gcc12 build_args: | base_image=ghcr.io/nvidia/pypa/manylinux_2_28${{ (matrix.platform == 'arm64' && '_aarch64') || (matrix.platform 
== 'amd64' && '_x86_64') || '' }}:latest cuda_version=${{ matrix.cuda_version }} - toolchain=gcc11 + toolchain=gcc12 distro=rhel8 llvm_commit=${{ needs.metadata.outputs.llvm_commit }} pybind11_commit=${{ needs.metadata.outputs.pybind11_commit }} + nanobind_commit=${{ needs.metadata.outputs.nanobind_commit }} registry_cache_from: ${{ needs.metadata.outputs.cache_base }} update_registry_cache: ${{ needs.metadata.outputs.cache_target }} pull_request_number: ${{ needs.metadata.outputs.pull_request_number }} @@ -368,7 +371,7 @@ jobs: strategy: matrix: platform: [amd64] - toolchain: [clang16] + toolchain: [llvm] fail-fast: false uses: ./.github/workflows/generate_cc.yml secrets: @@ -393,10 +396,10 @@ jobs: with: platforms: ${{ fromJson(needs.metadata.outputs.platforms)[format('{0}', matrix.platform)].docker_flag }} dockerfile: build/devcontainer.Dockerfile - build_config_id: cu${{ matrix.cuda_version }}-gcc11 + build_config_id: cu${{ matrix.cuda_version }}-llvm build_args: | cuda_version=${{ matrix.cuda_version }} - base_image=${{ fromJson(needs.config.outputs.json).image_hash[format('{0}-gcc11', matrix.platform)] }} + base_image=${{ fromJson(needs.config.outputs.json).image_hash[format('{0}-llvm', matrix.platform)] }} ompidev_image=${{ fromJson(needs.config.outputs.json).image_hash[format('{0}-cu{1}-ompi', matrix.platform, matrix.cuda_version)] }} registry_cache_from: ${{ needs.metadata.outputs.cache_base }} update_registry_cache: ${{ needs.metadata.outputs.cache_target }} diff --git a/.github/workflows/dev_environment_macos.yml b/.github/workflows/dev_environment_macos.yml index 579f9ea0073..9947f016430 100644 --- a/.github/workflows/dev_environment_macos.yml +++ b/.github/workflows/dev_environment_macos.yml @@ -95,6 +95,7 @@ jobs: scripts/build_llvm.sh \ scripts/set_env_defaults.sh \ .github/workflows/dev_environment_macos.yml \ + tpls/customizations/llvm/*.diff \ | sha256sum | cut -c1-8) echo "scripts_hash=$scripts_hash" >> $GITHUB_OUTPUT @@ -181,6 +182,9 @@ jobs: # 
cache already has MLIR_ENABLE_BINDINGS_PYTHON=ON. Downstream # wheel jobs only change Python3_EXECUTABLE, which keeps ninja's # incremental rebuild scoped to the binding targets. + # Initialize the nanobind submodule, which is needed for MLIR Python bindings + git submodule update --init --recursive tpls/nanobind + source scripts/set_env_defaults.sh export LLVM_PROJECTS='clang;lld;mlir;openmp;python-bindings' diff --git a/.github/workflows/test_in_devenv.yml b/.github/workflows/test_in_devenv.yml index 6aa6b771e11..6015fe11d30 100644 --- a/.github/workflows/test_in_devenv.yml +++ b/.github/workflows/test_in_devenv.yml @@ -297,6 +297,7 @@ jobs: echo "::error file=test_in_devenv.yml:: Pip install of CUDA Quantum failed with status $pyinstall_status." exit 1 fi + python -m pip install pytest pytest-xdist python -m pytest -v --durations=0 -n auto python/tests/ \ --ignore python/tests/backends \ diff --git a/.gitmodules b/.gitmodules index 644ab8cc24f..5cf32c5ccd4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -51,3 +51,4 @@ [submodule "tpls/nanobind"] path = tpls/nanobind url = https://github.com/wjakob/nanobind.git + ignore = dirty diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 54e8edface0..d07196d066e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,6 @@ repos: - id: markdownlint name: Markdown linting files: '\.md$' - exclude: '^tpls/' args: ['--config', '.github/pre-commit/md_lint_config.yml'] # Standard quality checks diff --git a/Building.md b/Building.md index e0f00751f40..d9a86b5ebfb 100644 --- a/Building.md +++ b/Building.md @@ -69,7 +69,10 @@ CUDA-Q can be built on macOS for development purposes. 
Note that: - **ARM64 only**: Only Apple silicon Macs are supported; Intel Macs are not supported - **CPU-only**: No CUDA/GPU support is available on macOS -- **Apple Clang**: Uses the system compiler (no need to install GCC or LLVM separately) +- **LLVM 22.1 toolchain**: CUDA-Q is compiled with the `Clang/LLD/libomp` built + by `scripts/build_llvm.sh` — the same toolchain used on Linux. Xcode Command + Line Tools are still required for the macOS `SDK/sysroot` (headers, frameworks, + `xcrun`), but Apple Clang is no longer used to compile CUDA-Q itself. - **Prerequisites required**: You must use `-p` to install LLVM and other dependencies Before building, complete the macOS setup steps in diff --git a/CMakeLists.txt b/CMakeLists.txt index ddcc6a9ea22..a4670866380 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,7 @@ endif() # CMP0116: Ninja generators transform `DEPFILE`s from `add_custom_command()` # New in CMake 3.20. https://cmake.org/cmake/help/latest/policy/CMP0116.html if(POLICY CMP0116) - cmake_policy(SET CMP0116 OLD) + cmake_policy(SET CMP0116 NEW) endif() # Project setup @@ -87,11 +87,14 @@ endif() # Enable the remote simulator by default. if (CUDAQ_ENABLE_REST AND NOT DEFINED CUDAQ_ENABLE_REMOTE_SIM) set(CUDAQ_ENABLE_REMOTE_SIM ON CACHE BOOL "Enable building cudaq-qpud.") - # Optionally enable the tests that use cudaq-qpud. if (NOT DEFINED CUDAQ_TEST_REMOTE_SIM) set(CUDAQ_TEST_REMOTE_SIM ON CACHE BOOL "Run remote-sim tests.") endif() endif() +if (NOT CUDAQ_ENABLE_REST) + set(CUDAQ_ENABLE_REMOTE_SIM OFF CACHE BOOL "Enable building cudaq-qpud." FORCE) + set(CUDAQ_TEST_REMOTE_SIM OFF CACHE BOOL "Run remote-sim tests." FORCE) +endif() # Enable Amazon Braket backends by default. 
if (NOT DEFINED CUDAQ_ENABLE_BRAKET_BACKEND) @@ -140,7 +143,10 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS 1) if(NOT LLVM_VERSION_MAJOR) - set(LLVM_VERSION_MAJOR 16) + set(LLVM_VERSION_MAJOR 22) +endif() +if(NOT LLVM_VERSION_MINOR) + set(LLVM_VERSION_MINOR 1) endif() find_package(Git QUIET) @@ -182,6 +188,27 @@ if (${CUDAQ_FORCE_COLORED_OUTPUT}) endif () endif () +add_compile_options(-Wno-error=deprecated-declarations) +# Use plain -Wno- (not -Wno-error=) so GCC silently ignores it when unsupported. +# Guard with check_cxx_compiler_flag so Apple Clang (which errors on unknown +# -Wno- options) doesn't see it if the warning doesn't exist there. +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag(-Wno-uninitialized-const-pointer + CUDAQ_HAS_WNO_UNINITIALIZED_CONST_POINTER) +if (CUDAQ_HAS_WNO_UNINITIALIZED_CONST_POINTER) + add_compile_options(-Wno-uninitialized-const-pointer) +endif() +# GCC 12 headers emit false-positive diagnostics (e.g. char_traits.h, +# stl_algobase.h) when compiled with Clang using GCC's sysroot. +check_cxx_compiler_flag(-Wno-restrict CUDAQ_HAS_WNO_RESTRICT) +if (CUDAQ_HAS_WNO_RESTRICT) + add_compile_options(-Wno-restrict) +endif() +# -Wstringop-overflow is GCC-only; check_cxx_compiler_flag caching is unreliable here. +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + add_compile_options(-Wno-stringop-overflow) +endif() + # Certain build configurations may be set directly in the environment. # This facilitates some of the packaging (e.g. python packages built based on the pyproject.toml). # These are cached so they persist across cmake runs without needing the env vars set again. 
@@ -229,9 +256,9 @@ SET(CMAKE_SKIP_INSTALL_RPATH FALSE) SET(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) if(APPLE) - SET(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/lib;@loader_path/lib/plugins;@loader_path/../lib;@loader_path/../lib/plugins;@executable_path;@executable_path/lib;@executable_path/lib/plugins;@executable_path/../lib;@executable_path/../lib/plugins") + SET(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/lib;@loader_path/lib/plugins;@loader_path/../lib;@loader_path/../lib/plugins;@loader_path/../cudaq/mlir/_mlir_libs;@loader_path/../python/cudaq/mlir/_mlir_libs;@executable_path;@executable_path/lib;@executable_path/lib/plugins;@executable_path/../lib;@executable_path/../lib/plugins;@executable_path/../cudaq/mlir/_mlir_libs;@executable_path/../python/cudaq/mlir/_mlir_libs") else() - SET(CMAKE_INSTALL_RPATH "$ORIGIN:$ORIGIN/lib:$ORIGIN/lib/plugins:$ORIGIN/../lib:$ORIGIN/../lib/plugins") + SET(CMAKE_INSTALL_RPATH "$ORIGIN:$ORIGIN/lib:$ORIGIN/lib/plugins:$ORIGIN/../lib:$ORIGIN/../lib/plugins:$ORIGIN/../cudaq/mlir/_mlir_libs:$ORIGIN/../python/cudaq/mlir/_mlir_libs") endif() SET(BLA_STATIC ON) @@ -262,6 +289,7 @@ else() # B. Using LLVM/MLIR dylibs. This won't work until later versions of LLVM are used as # the first versions of this setting did not appropriately link all libraries to the dylibs. add_link_options("-Wl,-flat_namespace") + add_link_options("-Wl,-undefined,dynamic_lookup") endif() # Detect sysroot for C++ stdlib headers/libs. Critical on macOS where a custom-built @@ -325,10 +353,10 @@ endif() # using `LLVM_VERSION_MAJOR`, e.g. "-LLVM_VERSION_MAJOR=16". Note that this # version variable is set to the latest LLVM version by default, and setting it # to an older version might break the project. 
-find_package(LLVM ${LLVM_VERSION_MAJOR} CONFIG QUIET) +find_package(LLVM ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} CONFIG QUIET) if(NOT LLVM_DIR) - message(STATUS "LLVM_DIR not found, will try with llvm-config executable.") + message(STATUS "LLVM_DIR not found for ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}, will try with llvm-config executable.") macro(find_llvm_config name version_major) set(extra_args ${ARGN}) @@ -386,7 +414,7 @@ if(NOT LLVM_DIR) "Could not find suitable llvm-config(-${LLVM_VERSION_MAJOR}).\ \nTry providing valid -DLLVM_DIR=/path/to/llvm/lib/cmake/llvm.") else() - find_package(LLVM ${LLVM_VERSION_MAJOR} REQUIRED CONFIG + find_package(LLVM ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} REQUIRED CONFIG HINTS ${LLVM_CONFIG_CMAKE_DIR} NO_DEFAULT_PATH) endif() endif() @@ -682,6 +710,7 @@ endif() if (CUDAQ_ENABLE_PYTHON) find_package(Python 3 COMPONENTS Interpreter Development) + find_package(Python3 COMPONENTS Interpreter Development) # Apply specific patch to pybind11 for our documentation. # Only apply the patch if not already applied. 
@@ -729,6 +758,12 @@ if(CUDAQ_BUILD_TESTS AND NOT CUDAQ_DISABLE_CPP_FRONTEND) umbrella_lit_testsuite_begin(check-all) set(INSTALL_GTEST OFF) add_subdirectory(tpls/googletest-src) + # Turn off character-conversion warning in gtest for clang compilers + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag(-Wno-character-conversion CUDAQ_HAS_WNO_CHARACTER_CONVERSION) + if (CUDAQ_HAS_WNO_CHARACTER_CONVERSION) + target_compile_options(gtest PUBLIC -Wno-character-conversion) + endif() # Bug in GCC 12 leads to spurious warnings (-Wrestrict) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105329 if (CMAKE_COMPILER_IS_GNUCXX diff --git a/cmake/caches/LLVM.cmake b/cmake/caches/LLVM.cmake index efce183a8dc..6b546d36a74 100644 --- a/cmake/caches/LLVM.cmake +++ b/cmake/caches/LLVM.cmake @@ -20,15 +20,17 @@ set(LLVM_ENABLE_ZSTD OFF CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") set(LLVM_BUILD_TESTS OFF CACHE BOOL "") +set(LLVM_INCLUDE_TESTS OFF CACHE BOOL "") set(LLVM_BUILD_EXAMPLES OFF CACHE BOOL "") set(LLVM_ENABLE_OCAMLDOC OFF CACHE BOOL "") if(DEFINED LLVM_ENABLE_RUNTIMES AND LLVM_ENABLE_RUNTIMES MATCHES "libcxx") message(STATUS "Setting defaults to use LLVM runtimes.") - # If we want to build dynamic libraries for the unwinder, - # we need to build support for exception handling. - set(LLVM_ENABLE_EH ON CACHE BOOL "") + # The runtimes (libcxx, libcxxabi, libunwind) control exception support + # independently via LIBCXX_ENABLE_EXCEPTIONS and LIBCXXABI_ENABLE_EXCEPTIONS. + # LLVM_ENABLE_EH must remain OFF when Flang is a project, since Flang + # rejects LLVM_ENABLE_EH=ON with a FATAL_ERROR. 
set(LLVM_ENABLE_RTTI ON CACHE BOOL "") set(LIBCXX_ENABLE_EXCEPTIONS ON CACHE BOOL "") set(LIBCXXABI_ENABLE_EXCEPTIONS ON CACHE BOOL "") diff --git a/cmake/modules/BuildHelpers.cmake b/cmake/modules/BuildHelpers.cmake index e52d9347a16..7ee5b77d6d0 100644 --- a/cmake/modules/BuildHelpers.cmake +++ b/cmake/modules/BuildHelpers.cmake @@ -8,29 +8,42 @@ include_guard() +function(_cudaq_check_openmp_usable RESULT_VAR) + find_package(OpenMP) + if(NOT OpenMP_CXX_FOUND) + set(${RESULT_VAR} FALSE PARENT_SCOPE) + return() + endif() + include(CheckCXXCompilerFlag) + set(CMAKE_REQUIRED_FLAGS "${OpenMP_CXX_FLAGS}") + check_cxx_compiler_flag("${OpenMP_CXX_FLAGS}" CUDAQ_HAS_OPENMP_FLAG) + unset(CMAKE_REQUIRED_FLAGS) + set(${RESULT_VAR} ${CUDAQ_HAS_OPENMP_FLAG} PARENT_SCOPE) +endfunction() + # If OpenMP is enabled and found, adds the necessary compile definitions to the # given target, and the necessary dependencies to the given list of dependencies. function(add_openmp_configurations TARGET_NAME DEPENDENCIES) - find_package(OpenMP) - if(OpenMP_CXX_FOUND) + _cudaq_check_openmp_usable(_openmp_usable) + if(_openmp_usable) message(STATUS "OpenMP Found. Adding build flags to target ${TARGET_NAME}: ${OpenMP_CXX_FLAGS}.") list(APPEND ${DEPENDENCIES} OpenMP::OpenMP_CXX) - set(${DEPENDENCIES} "${${DEPENDENCIES}}" PARENT_SCOPE) + set(${DEPENDENCIES} "${${DEPENDENCIES}}" PARENT_SCOPE) target_compile_definitions(${TARGET_NAME} PRIVATE HAS_OPENMP) elseif (CUDAQ_REQUIRE_OPENMP) - message(FATAL_ERROR "OpenMP not found.") + message(FATAL_ERROR "OpenMP not found or compiler rejects OpenMP flags.") endif() endfunction() # If OpenMP is enabled and found, adds the necessary compile definitions to the # interface dependencies of the given target. function(add_openmp_interface_definitions TARGET_NAME) - find_package(OpenMP) - if(OpenMP_CXX_FOUND) + _cudaq_check_openmp_usable(_openmp_usable) + if(_openmp_usable) message(STATUS "OpenMP Found. 
Adding interface definitions to target ${TARGET_NAME}.") target_compile_definitions(${TARGET_NAME} INTERFACE HAS_OPENMP) elseif (CUDAQ_REQUIRE_OPENMP) - message(FATAL_ERROR "OpenMP not found.") + message(FATAL_ERROR "OpenMP not found or compiler rejects OpenMP flags.") endif() endfunction() diff --git a/docker/build/assets.Dockerfile b/docker/build/assets.Dockerfile index 58c83cd9555..2d4f4d65ff3 100644 --- a/docker/build/assets.Dockerfile +++ b/docker/build/assets.Dockerfile @@ -58,6 +58,7 @@ ADD tpls/customizations/llvm /cuda-quantum/tpls/customizations/llvm ADD .gitmodules /cuda-quantum/.gitmodules ADD .git/modules/tpls/pybind11/HEAD /.git_modules/tpls/pybind11/HEAD ADD .git/modules/tpls/llvm/HEAD /.git_modules/tpls/llvm/HEAD +ADD .git/modules/tpls/nanobind/HEAD /.git_modules/tpls/nanobind/HEAD # This is a hack so that we do not need to rebuild the prerequisites # whenever we pick up a new CUDA-Q commit (which is always in CI). @@ -72,7 +73,7 @@ RUN cd /cuda-quantum && git init && \ fi; \ done && git submodule init && git submodule RUN cd /cuda-quantum && source scripts/configure_build.sh && \ - LLVM_PROJECTS='clang;flang;lld;mlir;openmp;runtimes' \ + LLVM_PROJECTS='clang;flang;lld;mlir;openmp;runtimes' BOOTSTRAP_LLVM=true \ bash scripts/install_prerequisites.sh -t llvm -e qrmi # Validate that the built toolchain and libraries have no GCC dependencies. @@ -250,7 +251,7 @@ RUN cd /cuda-quantum && \ bash scripts/install_prerequisites.sh -t llvm -e qrmi && \ CC="$LLVM_INSTALL_PREFIX/bin/clang" \ CXX="$LLVM_INSTALL_PREFIX/bin/clang++" \ - FC="$LLVM_INSTALL_PREFIX/bin/flang-new" \ + FC="$LLVM_INSTALL_PREFIX/bin/flang" \ python3 -m build --wheel && \ echo "=== ccache stats (python_build) ===" && (ccache -s 2>/dev/null || true) ## [ to inject a pre-populated cache, # while the devcontainer builds get the scratch as a noop. 
diff --git a/docker/build/devcontainer.Dockerfile b/docker/build/devcontainer.Dockerfile index 4100c90e340..5e6fdc10bdc 100644 --- a/docker/build/devcontainer.Dockerfile +++ b/docker/build/devcontainer.Dockerfile @@ -17,7 +17,7 @@ # docker build -t ghcr.io/nvidia/cuda-quantum-devdeps:ext -f docker/build/devdeps.ext.Dockerfile . ARG cuda_version=12.6 -ARG base_image=ghcr.io/nvidia/cuda-quantum-devdeps:gcc11-main +ARG base_image=ghcr.io/nvidia/cuda-quantum-devdeps:gcc12-main ARG ompidev_image=ghcr.io/nvidia/cuda-quantum-devdeps:cu12-ompi-main FROM $ompidev_image AS ompibuild ARG cuda_version diff --git a/docker/build/devdeps.Dockerfile b/docker/build/devdeps.Dockerfile index 82068b89b88..78f9f957f43 100644 --- a/docker/build/devdeps.Dockerfile +++ b/docker/build/devdeps.Dockerfile @@ -6,21 +6,20 @@ # the terms of the Apache License 2.0 which accompanies this distribution. # # ============================================================================ # -# This file builds the development environment that contains the necessary development -# dependencies for building and testing CUDA-Q. This does not include the CUDA, OpenMPI +# This file builds the development environment that contains the necessary development +# dependencies for building and testing CUDA-Q. This does not include the CUDA, OpenMPI # and other dependencies that some of the simulator backends require. These backends # will be omitted from the build if this environment is used. # # Usage: # Must be built from the repo root with: -# docker build -t ghcr.io/nvidia/cuda-quantum-devdeps:${toolchain}-latest -f docker/build/devdeps.Dockerfile --build-arg toolchain=$toolchain . +# docker build -t ghcr.io/nvidia/cuda-quantum-devdeps:llvm-latest -f docker/build/devdeps.Dockerfile . # -# The variable $toolchain indicates which compiler toolchain to build the LLVM libraries with. +# The variable $toolchain indicates which compiler toolchain to build the LLVM libraries with. 
# The toolchain used to build the LLVM binaries that CUDA-Q depends on must be used to build -# CUDA-Q. This image sets the CC and CXX environment variables to use that toolchain. -# Currently, clang16, clang15, gcc12, and gcc11 are supported. To use a different -# toolchain, add support for it to the install_toolchain.sh script. If the toolchain is set to llvm, -# then the toolchain will be built from source. +# CUDA-Q. Currently, the $toolchain argument is a no-op; the bootstrap always uses clang. +# Support for gcc12 (and potentially other toolchains) may be added back in the future. +# To use a different toolchain, add support for it to the install_toolchain.sh script. # [Operating System] ARG base_image=ubuntu:24.04 @@ -28,10 +27,10 @@ ARG base_image=ubuntu:24.04 # [CUDA-Q Dependencies] FROM ${base_image} AS prereqs SHELL ["/bin/bash", "-c"] -ARG toolchain=gcc11 +ARG toolchain=llvm # When a dialogue box would be needed during install, assume default configurations. -# Set here to avoid setting it for all install commands. +# Set here to avoid setting it for all install commands. # Given as arg to make sure that this value is only set during build but not in the launched container. 
ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && \ @@ -51,30 +50,29 @@ ENV ZLIB_INSTALL_PREFIX=/usr/local/zlib ENV OPENSSL_INSTALL_PREFIX=/usr/local/openssl ENV CURL_INSTALL_PREFIX=/usr/local/curl ENV AWS_INSTALL_PREFIX=/usr/local/aws +ENV NANOBIND_INSTALL_PREFIX=/usr/local/nanobind # TODO: eliminate the need for this ENV PIP_BREAK_SYSTEM_PACKAGES=1 ## [Build Dependencies] RUN apt-get update && apt-get install -y --no-install-recommends \ - wget git unzip \ + wget git unzip ccache \ + libstdc++-13-dev \ python3-dev python3-pip && \ python3 -m pip install --no-cache-dir numpy --break-system-packages && \ apt-get autoremove -y --purge && apt-get clean && rm -rf /var/lib/apt/lists/* +ADD scripts/configure_build.sh /cuda-quantum/scripts/configure_build.sh ADD scripts/install_toolchain.sh /cuda-quantum/scripts/install_toolchain.sh -RUN source /cuda-quantum/scripts/install_toolchain.sh \ - -e "$LLVM_INSTALL_PREFIX/bootstrap" -t ${toolchain} - -## [Source Dependencies] -ADD scripts/install_prerequisites.sh /cuda-quantum/scripts/install_prerequisites.sh ADD scripts/build_llvm.sh /cuda-quantum/scripts/build_llvm.sh ADD cmake/caches/LLVM.cmake /cuda-quantum/cmake/caches/LLVM.cmake ADD tpls/customizations/llvm /cuda-quantum/tpls/customizations/llvm ADD .gitmodules /cuda-quantum/.gitmodules ADD .git/modules/tpls/pybind11/HEAD /.git_modules/tpls/pybind11/HEAD ADD .git/modules/tpls/llvm/HEAD /.git_modules/tpls/llvm/HEAD +ADD .git/modules/tpls/nanobind/HEAD /.git_modules/tpls/nanobind/HEAD -# This is initializing the .git index sufficiently so that we can -# check out the correct commits based on the submodule commit. +# This is initializing the .git index sufficiently so that we can +# check out the correct commits based on the submodule commit. 
RUN cd /cuda-quantum && git init && \ git config -f .gitmodules --get-regexp '^submodule\..*\.path$' | \ while read path_key local_path; do \ @@ -85,9 +83,15 @@ RUN cd /cuda-quantum && git init && \ $(cat /.git_modules/$local_path/HEAD) $local_path; \ fi; \ done && git submodule init && git submodule -# Build compiler-rt (only) since it is needed for code coverage tools -RUN LLVM_PROJECTS='clang;lld;mlir;python-bindings;compiler-rt' \ - bash /cuda-quantum/scripts/install_prerequisites.sh -t ${toolchain} + +## [Source Dependencies] +ADD scripts/bootstrap_prerequisites.sh /cuda-quantum/scripts/bootstrap_prerequisites.sh +RUN apt-get update && apt-get install -y --no-install-recommends clang lld && \ + CC=clang CXX=clang++ \ + LLVM_PROJECTS='clang;flang;lld;mlir;python-bindings;compiler-rt' \ + bash /cuda-quantum/scripts/bootstrap_prerequisites.sh && \ + (apt-get remove -y clang lld || true) && apt-get autoremove -y --purge && \ + apt-get clean && rm -rf /var/lib/apt/lists/* ## [Dev Dependencies] RUN if [ "$(uname -m)" == "x86_64" ]; then \ @@ -120,18 +124,8 @@ COPY --from=prereqs /usr/local/llvm /usr/local/llvm ENV LLVM_INSTALL_PREFIX=/usr/local/llvm ENV PATH="$PATH:$LLVM_INSTALL_PREFIX/bin/" -# Install the C/C++ compiler toolchain with which the LLVM dependencies have -# been built. CUDA-Q needs to be built with that same toolchain. We use -# a wrapper script so that the path that we set CC and CXX to is independent -# on the installed toolchain. Unfortunately, a symbolic link won't work. -# Using update-alternatives for c++ and cc could maybe be a better option. 
-RUN source "$LLVM_INSTALL_PREFIX/bootstrap/init_command.sh" \ - && echo -e '#!/bin/bash\n"'$CC'" "$@"' > "$LLVM_INSTALL_PREFIX/bootstrap/cc" \ - && echo -e '#!/bin/bash\n"'$CXX'" "$@"' > "$LLVM_INSTALL_PREFIX/bootstrap/cxx" \ - && chmod +x "$LLVM_INSTALL_PREFIX/bootstrap/cc" \ - && chmod +x "$LLVM_INSTALL_PREFIX/bootstrap/cxx" -ENV CC="$LLVM_INSTALL_PREFIX/bootstrap/cc" -ENV CXX="$LLVM_INSTALL_PREFIX/bootstrap/cxx" +ENV CC="$LLVM_INSTALL_PREFIX/bin/clang" +ENV CXX="$LLVM_INSTALL_PREFIX/bin/clang++" # Copy over additional prerequisites. ENV BLAS_INSTALL_PREFIX=/usr/local/blas @@ -139,6 +133,8 @@ ENV ZLIB_INSTALL_PREFIX=/usr/local/zlib ENV OPENSSL_INSTALL_PREFIX=/usr/local/openssl ENV CURL_INSTALL_PREFIX=/usr/local/curl ENV AWS_INSTALL_PREFIX=/usr/local/aws +ENV NANOBIND_INSTALL_PREFIX=/usr/local/nanobind +COPY --from=prereqs /usr/local/nanobind "$NANOBIND_INSTALL_PREFIX" COPY --from=prereqs /usr/local/blas "$BLAS_INSTALL_PREFIX" COPY --from=prereqs /usr/local/zlib "$ZLIB_INSTALL_PREFIX" COPY --from=prereqs /usr/local/openssl "$OPENSSL_INSTALL_PREFIX" @@ -155,7 +151,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certifi ENV PATH="${PATH}:/usr/local/cmake-3.28/bin" COPY requirements-dev.txt /cuda-quantum/requirements-dev.txt RUN apt-get update && apt-get install -y --no-install-recommends \ - git gdb ninja-build file lldb ccache \ + git gdb ninja-build file lldb ccache libatomic1 \ + libstdc++-13-dev \ python3 python3-pip libpython3-dev \ && python3 -m pip install --no-cache-dir --break-system-packages \ -r /cuda-quantum/requirements-dev.txt \ diff --git a/docker/build/devdeps.manylinux.Dockerfile b/docker/build/devdeps.manylinux.Dockerfile index 4d5d321d80e..81b161ad515 100644 --- a/docker/build/devdeps.manylinux.Dockerfile +++ b/docker/build/devdeps.manylinux.Dockerfile @@ -16,7 +16,7 @@ # The variable $toolchain indicates which compiler toolchain to build the LLVM libraries with. 
# The toolchain used to build the LLVM binaries that CUDA-Q depends on must be used to build # CUDA-Q. This image sets the CC and CXX environment variables to use that toolchain. -# Currently, clang16 and gcc11, gcc12, and gcc13 are supported. +# Currently, gcc12 and gcc13 are supported. # There are currently no multi-platform manylinux images available. # See https://github.com/pypa/manylinux/issues/1306. @@ -26,7 +26,7 @@ FROM ${base_image} ARG distro=rhel8 ARG llvm_commit ARG pybind11_commit -ARG toolchain=gcc11 +ARG toolchain=gcc12 # When a dialogue box would be needed during install, assume default configurations. # Set here to avoid setting it for all install commands. @@ -53,9 +53,6 @@ RUN if [ "${toolchain#gcc}" != "$toolchain" ]; then \ enable_script=`find / -path '*gcc*' -path '*'$gcc_version'*' -name enable` && . "$enable_script"; \ fi && \ CC="$(which gcc)" && CXX="$(which g++)"; \ - elif [ "$toolchain" == 'clang16' ]; then \ - dnf install -y --nobest --setopt=install_weak_deps=False clang-16.0.6 && \ - CC="$(which clang-16)" && CXX="$(which clang++-16)"; \ else echo "Toolchain not supported." 
&& exit 1; \ fi && dnf clean all \ && mkdir -p "$LLVM_INSTALL_PREFIX/bootstrap" \ @@ -88,7 +85,7 @@ RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.28.4/cmake-3.2 ADD ./scripts/build_llvm.sh /scripts/build_llvm.sh ADD ./cmake/caches/LLVM.cmake /cmake/caches/LLVM.cmake ADD ./tpls/customizations/llvm/ /tpls/customizations/llvm/ -RUN LLVM_PROJECTS='clang;mlir' LLVM_SOURCE=/llvm-project \ +RUN LLVM_PROJECTS='clang;lld;mlir' LLVM_SOURCE=/llvm-project \ LLVM_CMAKE_CACHE=/cmake/caches/LLVM.cmake \ LLVM_CMAKE_PATCHES=/tpls/customizations/llvm \ bash /scripts/build_llvm.sh -c Release -v diff --git a/docker/release/cudaq.wheel.Dockerfile b/docker/release/cudaq.wheel.Dockerfile index b1d47ba7c09..d3e9956f375 100644 --- a/docker/release/cudaq.wheel.Dockerfile +++ b/docker/release/cudaq.wheel.Dockerfile @@ -18,7 +18,7 @@ # - https://github.com/numpy/numpy/blob/main/pyproject.toml, and # - https://github.com/numpy/numpy/blob/main/.github/workflows/wheels.yml -ARG base_image=ghcr.io/nvidia/cuda-quantum-devdeps:manylinux-amd64-cu12.6-gcc11-main +ARG base_image=ghcr.io/nvidia/cuda-quantum-devdeps:manylinux-amd64-cu12.6-gcc12-main # Default empty stage for ccache data. CI overrides this with # --build-context ccache-data= to inject a pre-populated cache, # while local/devcontainer builds get a harmless no-op (empty scratch). 
@@ -50,12 +50,12 @@ RUN --mount=from=ccache-data,target=/tmp/ccache-import,rw \ mkdir -p /root/.ccache; \ fi RUN echo "Building MLIR bindings for python${python_version}" && \ - CCACHE_DISABLE=1 python${python_version} -m pip install --no-cache-dir numpy && \ + CCACHE_DISABLE=1 python${python_version} -m pip install --no-cache-dir numpy "nanobind>=2.9.0" && \ rm -rf "$LLVM_INSTALL_PREFIX/src" "$LLVM_INSTALL_PREFIX/python_packages" && \ Python3_EXECUTABLE="$(which python${python_version})" \ - LLVM_PROJECTS='clang;mlir;python-bindings' \ + LLVM_PROJECTS='clang;lld;mlir;python-bindings' \ LLVM_CMAKE_CACHE=/cmake/caches/LLVM.cmake LLVM_SOURCE=/llvm-project \ - bash /scripts/build_llvm.sh -c Release -v + bash /scripts/build_llvm.sh -c Release -v # Build wheel using unified wheel build script RUN cd /cuda-quantum && \ diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index a9c1db77491..c2b9fd93f79 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -243,8 +243,14 @@ def setup(app): ('cpp:identifier', 'cudaq::detail::EigenSparseMatrix'), ('cpp:identifier', 'detail'), ('cpp:identifier', 'detail::NoisePoint'), + # nanobind generates RST function directives whose description text gets + # misinterpreted as py:class cross-references in nitpick mode + ('py:class', 'Convert spin_op to JSON string'), + ('py:class', 'Checks if all operators in the product are the identity. 
Note'), ] +suppress_warnings = ["myst.duplicate_def"] + napoleon_google_docstring = True napoleon_numpy_docstring = False autosectionlabel_prefix_document = True diff --git a/include/cudaq/Frontend/nvqpp/ASTBridge.h b/include/cudaq/Frontend/nvqpp/ASTBridge.h index 69294759fb0..a571d50dc94 100644 --- a/include/cudaq/Frontend/nvqpp/ASTBridge.h +++ b/include/cudaq/Frontend/nvqpp/ASTBridge.h @@ -15,6 +15,7 @@ #include "clang/AST/ASTConsumer.h" #include "clang/AST/GlobalDecl.h" #include "clang/AST/Mangle.h" +#include "clang/AST/RecursiveASTVisitor.h" #include "clang/Analysis/CallGraph.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" @@ -371,33 +372,39 @@ class QuakeBridgeVisitor // Type nodes to lower to Quake. //===--------------------------------------------------------------------===// - bool TraverseTypedefType(clang::TypedefType *t) { + bool TraverseTypedefType(clang::TypedefType *t, bool &visitChildren) { return TraverseType(t->desugar()); } - bool TraverseTypedefTypeLoc(clang::TypedefTypeLoc tl) { + bool TraverseTypedefTypeLoc(clang::TypedefTypeLoc tl, bool &visitChildren) { return TraverseType(tl.getType()); } - bool TraverseUsingType(clang::UsingType *t) { + bool TraverseUsingType(clang::UsingType *t, bool &visitChildren) { return TraverseType(t->desugar()); } - bool TraverseUsingTypeLoc(clang::UsingTypeLoc tl) { + bool TraverseUsingTypeLoc(clang::UsingTypeLoc tl, bool &visitChildren) { return TraverseType(tl.getType()); } - bool - TraverseTemplateSpecializationType(clang::TemplateSpecializationType *t) { + bool TraverseTemplateSpecializationType(clang::TemplateSpecializationType *t, + bool &visitChildren) { return TraverseType(t->desugar()); } - bool TraverseTypeOfExprType(clang::TypeOfExprType *t) { + bool TraverseTypeOfExprType(clang::TypeOfExprType *t, bool &visitChildren) { // Do not visit the expression as it is has no semantics other than for // inferring a type. 
return TraverseType(t->desugar()); } - bool TraverseNestedNameSpecifier(clang::NestedNameSpecifier *) { - return true; + bool TraverseNestedNameSpecifier(clang::NestedNameSpecifier) { return true; } + bool TraverseDecltypeType(clang::DecltypeType *t, bool &visitChildren) { + return TraverseType(t->desugar()); } - bool TraverseDecltypeType(clang::DecltypeType *t) { + bool TraversePredefinedSugarType(clang::PredefinedSugarType *t, + bool &visitChildren) { return TraverseType(t->desugar()); } + bool TraversePredefinedSugarTypeLoc(clang::PredefinedSugarTypeLoc tl, + bool &visitChildren) { + return TraverseType(tl.getType()); + } // When processing a record type, visit the type of all the field decls. This // will push 1 new type on the stack for each field. These types will be the @@ -413,7 +420,7 @@ class QuakeBridgeVisitor return Base::WalkUpFromFieldDecl(x); } - bool TraverseRecordType(clang::RecordType *t); + bool TraverseRecordType(clang::RecordType *t, bool &visitChildren); bool interceptRecordDecl(clang::RecordDecl *x); std::pair getWidthAndAlignment(clang::RecordDecl *x); bool VisitRecordDecl(clang::RecordDecl *x); @@ -468,9 +475,10 @@ class QuakeBridgeVisitor mlir::Value loadLValue(mlir::Value val) { auto valTy = val.getType(); if (isa(valTy)) - return builder.create(val.getLoc(), val); + return cudaq::cc::LoadOp::create(builder, val.getLoc(), val); if (isa(valTy)) - return builder.create(val.getLoc(), val); + return mlir::LLVM::LoadOp::create(builder, val.getLoc(), + builder.getI8Type(), val); return val; } @@ -789,7 +797,7 @@ inline bool isInNamespace(const clang::Decl *x, mlir::StringRef nsName) { do { if (const auto *nsd = dyn_cast(declCtx)) if (const auto *nsi = nsd->getIdentifier()) - if (nsi->getName().equals(nsName)) + if (nsi->getName() == nsName) return true; declCtx = declCtx->getParent(); } while (declCtx); @@ -804,7 +812,7 @@ inline bool isInClassInNamespace(const clang::Decl *x, assert(x && "decl is null"); if (const auto *cld = 
dyn_cast(x->getDeclContext())) if (const auto *cli = cld->getIdentifier()) - return cli->getName().equals(className) && isInNamespace(cld, nsName); + return (cli->getName() == className) && isInNamespace(cld, nsName); return false; } diff --git a/include/cudaq/Frontend/nvqpp/QisBuilder.h b/include/cudaq/Frontend/nvqpp/QisBuilder.h index 489dc39873f..078b853e869 100644 --- a/include/cudaq/Frontend/nvqpp/QisBuilder.h +++ b/include/cudaq/Frontend/nvqpp/QisBuilder.h @@ -8,7 +8,6 @@ #pragma once -#include "llvm/Support/Registry.h" #include "mlir/IR/Builders.h" namespace nvqpp { @@ -21,6 +20,4 @@ class QISBuilder { mlir::ValueRange general_operands) = 0; }; -using QISBuilderRegistry = llvm::Registry; - } // namespace nvqpp diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h index 7956aa68e21..73783252c78 100644 --- a/include/cudaq/Optimizer/Builder/Factory.h +++ b/include/cudaq/Optimizer/Builder/Factory.h @@ -19,6 +19,10 @@ #include #include +namespace llvm { +class DataLayout; +} + namespace quake { class StateType; } @@ -65,7 +69,7 @@ inline mlir::Type getCharType(mlir::MLIRContext *ctx) { /// Return the LLVM-IR dialect `ptr` type. inline mlir::Type getPointerType(mlir::MLIRContext *ctx) { - return mlir::LLVM::LLVMPointerType::get(getCharType(ctx)); + return mlir::LLVM::LLVMPointerType::get(ctx); } /// The type of a dynamic buffer as returned via the runtime. @@ -79,9 +83,9 @@ inline mlir::Type getOpaquePointerType(mlir::MLIRContext *ctx) { return mlir::LLVM::LLVMPointerType::get(ctx, /*addressSpace=*/0); } -/// Return the LLVM-IR dialect type: `ty*`. +/// Return the LLVM-IR dialect type: `ptr`. (changed for modern LLVM.) 
inline mlir::Type getPointerType(mlir::Type ty) { - return mlir::LLVM::LLVMPointerType::get(ty); + return factory::getPointerType(ty.getContext()); } cudaq::cc::PointerType getIndexedObjectType(mlir::Type eleTy); @@ -163,7 +167,7 @@ inline mlir::LLVM::ConstantOp genLlvmI32Constant(mlir::Location loc, std::int32_t val) { auto idx = builder.getI32IntegerAttr(val); auto i32Ty = builder.getI32Type(); - return builder.create(loc, i32Ty, idx); + return mlir::LLVM::ConstantOp::create(builder, loc, i32Ty, idx); } inline mlir::LLVM::ConstantOp genLlvmI64Constant(mlir::Location loc, @@ -171,14 +175,14 @@ inline mlir::LLVM::ConstantOp genLlvmI64Constant(mlir::Location loc, std::int64_t val) { auto idx = builder.getI64IntegerAttr(val); auto i64Ty = builder.getI64Type(); - return builder.create(loc, i64Ty, idx); + return mlir::LLVM::ConstantOp::create(builder, loc, i64Ty, idx); } inline mlir::Value createFloatConstant(mlir::Location loc, mlir::OpBuilder &builder, llvm::APFloat value, mlir::FloatType type) { - return builder.create(loc, value, type); + return mlir::arith::ConstantFloatOp::create(builder, loc, type, value); } inline mlir::Value createFloatConstant(mlir::Location loc, @@ -220,11 +224,16 @@ inline mlir::Block *addEntryBlock(mlir::LLVM::GlobalOp initVar) { /// Return an i64 array where element `k` is `N` if the /// operand `k` is `veq` and 0 otherwise. +/// \p originalControls contains the pre-conversion quake control values, +/// used to distinguish `veq` from ref types (necessary with opaque pointers +/// where both convert to the same !llvm.ptr type). 
mlir::Value packIsArrayAndLengthArray(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, mlir::ModuleOp parentModule, std::size_t numOperands, - mlir::ValueRange operands); + mlir::ValueRange operands, + mlir::ValueRange originalControls); + mlir::FlatSymbolRefAttr createLLVMFunctionSymbol(mlir::StringRef name, mlir::Type retType, mlir::ArrayRef inArgTypes, diff --git a/include/cudaq/Optimizer/Builder/Intrinsics.h b/include/cudaq/Optimizer/Builder/Intrinsics.h index e731e836c0d..20a3cadee98 100644 --- a/include/cudaq/Optimizer/Builder/Intrinsics.h +++ b/include/cudaq/Optimizer/Builder/Intrinsics.h @@ -19,9 +19,8 @@ class GlobalOp; /// calls will be erased before code gen. static constexpr const char stdMoveBuiltin[] = ".std::move"; -static constexpr const char llvmMemCopyIntrinsic[] = - "llvm.memcpy.p0i8.p0i8.i64"; -static constexpr const char llvmMemSetIntrinsic[] = "llvm.memset.p0i8.i64"; +static constexpr const char llvmMemCopyIntrinsic[] = "llvm.memcpy.p0.p0.i64"; +static constexpr const char llvmMemSetIntrinsic[] = "llvm.memset.p0.i64"; // cudaq::range(count); static constexpr const char setCudaqRangeVector[] = "__nvqpp_CudaqRangeInit"; diff --git a/include/cudaq/Optimizer/CAPI/Dialects.h b/include/cudaq/Optimizer/CAPI/Dialects.h index 251d805d638..9abb3df8f69 100644 --- a/include/cudaq/Optimizer/CAPI/Dialects.h +++ b/include/cudaq/Optimizer/CAPI/Dialects.h @@ -8,6 +8,7 @@ #pragma once +#include "mlir/CAPI/IR.h" #include "mlir/CAPI/Registration.h" #ifdef __cplusplus @@ -17,6 +18,9 @@ extern "C" { MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(Quake, quake); MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(CC, cc); +// Register Quake, CC, and all upstream MLIR dialects into `context`. 
+MLIR_CAPI_EXPORTED void cudaqRegisterAllDialects(MlirContext context); + #ifdef __cplusplus } #endif diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td index b94b977633d..fcefa08f754 100644 --- a/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td +++ b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td @@ -24,7 +24,6 @@ def CodeGenDialect : Dialect { let cppNamespace = "cudaq::codegen"; let useDefaultTypePrinterParser = 1; - let useFoldAPI = kEmitFoldAdaptorFolder; let extraClassDeclaration = [{ void registerTypes(); // register at least a bogo type. diff --git a/include/cudaq/Optimizer/CodeGen/Passes.h b/include/cudaq/Optimizer/CodeGen/Passes.h index e36c350711b..b6bd4a5e6ef 100644 --- a/include/cudaq/Optimizer/CodeGen/Passes.h +++ b/include/cudaq/Optimizer/CodeGen/Passes.h @@ -13,6 +13,10 @@ /// particular quantum target representation. There is a bevy of such targets /// that provide platforms on which the quantum code can be run. +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" @@ -60,10 +64,10 @@ mlir::LLVM::LLVMStructType lambdaAsPairOfPointers(mlir::MLIRContext *context); /// before conversion to the LLVM-IR dialect. void registerToQIRAPIPipeline(); -/// Add the convert to QIR API pipeline to \p pm. We don't use opaque pointers -/// yet, so provide a convenient overload. +/// Add the convert to QIR API pipeline to \p pm. With the move to LLVM 22, we +/// now use opaque pointers. void addConvertToQIRAPIPipeline(mlir::OpPassManager &pm, mlir::StringRef api, - bool opaquePtr = false); + bool opaquePtr = true); /// The pipeline for lowering Quake code to the execution manager API. This /// pipeline should be run before conversion to the LLVM-IR dialect. 
diff --git a/include/cudaq/Optimizer/CodeGen/Passes.td b/include/cudaq/Optimizer/CodeGen/Passes.td index 598e9d2c7c0..8d072551cc1 100644 --- a/include/cudaq/Optimizer/CodeGen/Passes.td +++ b/include/cudaq/Optimizer/CodeGen/Passes.td @@ -58,7 +58,9 @@ def ConvertToQIR : Pass<"quake-to-qir", "mlir::ModuleOp"> { }]; let dependentDialects = [ - "cudaq::codegen::CodeGenDialect", "mlir::LLVM::LLVMDialect" + "cudaq::codegen::CodeGenDialect", "mlir::LLVM::LLVMDialect", + "mlir::arith::ArithDialect", "mlir::complex::ComplexDialect", + "cudaq::cc::CCDialect" ]; } @@ -136,6 +138,7 @@ def QIRToQIRProfile : Pass<"convert-to-qir-profile"> { "Which QIR profile to convert to (default is 'qir-base')"> ]; + let dependentDialects = ["mlir::LLVM::LLVMDialect"]; let constructor = "cudaq::opt::createQIRToQIRProfilePass(\"qir-base\")"; } @@ -172,7 +175,8 @@ def WireSetToProfileQIR : Pass<"wireset-to-profile-qir", "mlir::func::FuncOp"> { the code to CC dialect with QIR calls, etc. }]; - let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; + let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect", + "mlir::func::FuncDialect", "mlir::arith::ArithDialect"]; let options = [ Option<"convertTo", "convert-to", "std::string", /*default=*/"\"qir-base\"", "Select the profile to convert wire sets to."> @@ -220,7 +224,7 @@ def QuakeToQIRAPI : Pass<"quake-to-qir-api"> { let options = [ Option<"api", "api", "std::string", /*default=*/"\"full\"", "Select the QIR API to use.">, - Option<"opaquePtr", "opaque-pointer", "bool", /*default=*/"false", + Option<"opaquePtr", "opaque-pointer", "bool", /*default=*/"true", "Use opaque pointers."> ]; } @@ -250,7 +254,7 @@ def QuakeToQIRAPIPrep : Pass<"quake-to-qir-api-prep", "mlir::ModuleOp"> { let options = [ Option<"api", "api", "std::string", /*default=*/"\"full\"", "Select the QIR API to use.">, - Option<"opaquePtr", "opaque-pointer", "bool", /*default=*/"false", + Option<"opaquePtr", "opaque-pointer", "bool", 
/*default=*/"true", "Use opaque pointers."> ]; } diff --git a/include/cudaq/Optimizer/CodeGen/Peephole.h b/include/cudaq/Optimizer/CodeGen/Peephole.h index e829cdb5f7a..f8fd0493127 100644 --- a/include/cudaq/Optimizer/CodeGen/Peephole.h +++ b/include/cudaq/Optimizer/CodeGen/Peephole.h @@ -16,9 +16,9 @@ #include "mlir/Support/LLVM.h" inline bool needsToBeRenamed(mlir::StringRef name) { - return name.startswith(cudaq::opt::QIRQISPrefix) && - !name.endswith("__body") && !name.endswith("__adj") && - !name.endswith("__ctl"); + return name.starts_with(cudaq::opt::QIRQISPrefix) && + !name.ends_with("__body") && !name.ends_with("__adj") && + !name.ends_with("__ctl"); } inline bool callToInvokeWithXCtrlOneTarget(mlir::StringRef callee, @@ -26,7 +26,7 @@ inline bool callToInvokeWithXCtrlOneTarget(mlir::StringRef callee, if ((args.size() == 4) && (callee == cudaq::opt::NVQIRInvokeWithControlBits)) if (auto addrOf = dyn_cast_or_null( args[1].getDefiningOp())) { - return addrOf.getGlobalName().startswith( + return addrOf.getGlobalName().starts_with( std::string(cudaq::opt::QIRQISPrefix) + "x__ctl"); } return false; @@ -41,14 +41,14 @@ static constexpr char resultIndexName[] = "result.index"; inline mlir::Value createMeasureCall(mlir::PatternRewriter &builder, mlir::Location loc, mlir::LLVM::CallOp op, mlir::ValueRange args) { - auto ptrTy = cudaq::opt::getResultType(builder.getContext()); + auto ptrTy = cudaq::cg::getLLVMResultType(builder.getContext()); if (auto intAttr = dyn_cast_or_null(op->getAttr(resultIndexName))) { - auto constOp = builder.create(loc, intAttr); - auto cast = builder.create(loc, ptrTy, constOp); - builder.create( - loc, mlir::TypeRange{}, cudaq::opt::QIRMeasureBody, - mlir::ArrayRef{args[0], cast}); + mlir::Value constOp = mlir::LLVM::ConstantOp::create(builder, loc, intAttr); + auto cast = mlir::LLVM::IntToPtrOp::create(builder, loc, ptrTy, constOp); + mlir::LLVM::CallOp::create(builder, loc, mlir::TypeRange{}, + cudaq::opt::QIRMeasureBody, + 
mlir::ArrayRef{args[0], cast}); return cast; } op.emitError("mz op must have an associated result index."); @@ -60,9 +60,8 @@ inline mlir::Value createReadResultCall(mlir::PatternRewriter &builder, mlir::Value result) { // NB: This code is only used from a deprecated pass. auto i1Ty = mlir::IntegerType::get(builder.getContext(), 1); - return builder - .create(loc, mlir::TypeRange{i1Ty}, - cudaq::opt::qir0_1::ReadResultBody, - mlir::ArrayRef{result}) + return mlir::LLVM::CallOp::create(builder, loc, mlir::TypeRange{i1Ty}, + cudaq::opt::qir0_1::ReadResultBody, + mlir::ArrayRef{result}) .getResult(); } diff --git a/include/cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h b/include/cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h index aca0cc5d2ba..4dfd71ce612 100644 --- a/include/cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h +++ b/include/cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h @@ -11,6 +11,7 @@ /// This file provides the opaque struct types to be used with the obsolete LLVM /// typed pointer type. +#include "cudaq/Optimizer/Builder/Factory.h" #include "cudaq/Optimizer/Dialect/CC/CCTypes.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" @@ -22,74 +23,65 @@ inline mlir::Type getQuantumTypeByName(mlir::StringRef type, } namespace opt { - -// The following type creators are deprecated and should only be used in the -// older codegen passes. Use the creators in the cg namespace immediately below -// instead. 
-inline mlir::Type getOpaquePointerType(mlir::MLIRContext *context) { - return mlir::LLVM::LLVMPointerType::get(context); -} - -inline mlir::Type getQubitType(mlir::MLIRContext *context) { - return mlir::LLVM::LLVMPointerType::get( - getQuantumTypeByName("Qubit", context)); -} - -inline mlir::Type getArrayType(mlir::MLIRContext *context) { - return mlir::LLVM::LLVMPointerType::get( - getQuantumTypeByName("Array", context)); -} - -inline mlir::Type getResultType(mlir::MLIRContext *context) { - return mlir::LLVM::LLVMPointerType::get( - getQuantumTypeByName("Result", context)); -} - -inline mlir::Type getCharPointerType(mlir::MLIRContext *context) { - return mlir::LLVM::LLVMPointerType::get(mlir::IntegerType::get(context, 8)); -} - void initializeTypeConversions(mlir::LLVMTypeConverter &typeConverter); - } // namespace opt namespace cg { -// The following type creators replace the ones above. They are configurable on -// the fly to either use opaque structs or opaque pointers. The default is to -// use pointers to opaque structs, which is no longer supported in modern LLVM. +// These type creators are configurable on the fly to either use opaque structs +// or opaque pointers. The default is to use opaque pointers, which are the +// default in any modern LLVM version. 
inline mlir::Type getOpaquePointerType(mlir::MLIRContext *context) { return cc::PointerType::get(mlir::NoneType::get(context)); } inline mlir::Type getQubitType(mlir::MLIRContext *context, - bool useOpaquePtr = false) { + bool useOpaquePtr = true) { if (useOpaquePtr) return getOpaquePointerType(context); return cc::PointerType::get(getQuantumTypeByName("Qubit", context)); } inline mlir::Type getArrayType(mlir::MLIRContext *context, - bool useOpaquePtr = false) { + bool useOpaquePtr = true) { if (useOpaquePtr) return getOpaquePointerType(context); return cc::PointerType::get(getQuantumTypeByName("Array", context)); } inline mlir::Type getResultType(mlir::MLIRContext *context, - bool useOpaquePtr = false) { + bool useOpaquePtr = true) { if (useOpaquePtr) return getOpaquePointerType(context); return cc::PointerType::get(getQuantumTypeByName("Result", context)); } inline mlir::Type getCharPointerType(mlir::MLIRContext *context, - bool useOpaquePtr = false) { + bool useOpaquePtr = true) { if (useOpaquePtr) return getOpaquePointerType(context); return cc::PointerType::get(mlir::IntegerType::get(context, 8)); } +// LLVM Types: +// The factory builder will build opaque pointers for modern MLIR. 
+ +inline mlir::Type getLLVMQubitType(mlir::MLIRContext *context) { + return opt::factory::getPointerType(getQuantumTypeByName("Qubit", context)); +} + +inline mlir::Type getLLVMArrayType(mlir::MLIRContext *context) { + return opt::factory::getPointerType(getQuantumTypeByName("Array", context)); +} + +inline mlir::Type getLLVMResultType(mlir::MLIRContext *context) { + return opt::factory::getPointerType(getQuantumTypeByName("Result", context)); +} + +inline mlir::Type getLLVMCharPointerType(mlir::MLIRContext *context) { + return opt::factory::getPointerType(mlir::IntegerType::get(context, 8)); +} + } // namespace cg } // namespace cudaq diff --git a/include/cudaq/Optimizer/Dialect/CC/CCDialect.td b/include/cudaq/Optimizer/Dialect/CC/CCDialect.td index 456235e2b7d..e6b2e0d9f40 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCDialect.td +++ b/include/cudaq/Optimizer/Dialect/CC/CCDialect.td @@ -32,7 +32,6 @@ def CCDialect : Dialect { let cppNamespace = "cudaq::cc"; let useDefaultTypePrinterParser = 1; - let useFoldAPI = kEmitFoldAdaptorFolder; let extraClassDeclaration = [{ /// Register all CC types. 
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td index f6f3e4fe711..bf7f13c8ba4 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td +++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td @@ -332,8 +332,8 @@ def cc_LoopOp : CCOp<"loop", mlir::Block::BlockArgListType{}; } - mlir::OperandRange - getSuccessorEntryOperands(std::optional index); + mlir::OperandRange getEntrySuccessorOperands(mlir::RegionBranchPoint point); + mlir::OperandRange getEntrySuccessorOperands(mlir::RegionSuccessor point); bool hasBreakInBody(); }]; @@ -345,8 +345,8 @@ def cc_LoopOp : CCOp<"loop", def cc_IfOp : CCOp<"if", [DeclareOpInterfaceMethods, + ["getNumRegionInvocations", "getRegionInvocationBounds", + "getEntrySuccessorRegions"]>, RecursiveMemoryEffects, LinearTypeArgsTrait]> { let summary = "if-then-else operation"; let description = [{ @@ -981,7 +981,6 @@ def cc_ExtractValueOp : CCOp<"extract_value", [Pure]> { $rawConstantIndices) `]` `:` functional-type(operands, results) attr-dict }]; - let hasFolder = 1; let hasVerifier = 1; let hasCanonicalizer = 1; @@ -1081,7 +1080,6 @@ def cc_ComputePtrOp : CCOp<"compute_ptr", [Pure]> { `]` `:` functional-type(operands, results) attr-dict }]; - let hasFolder = 1; let hasCanonicalizer = 1; let hasVerifier = 1; @@ -1411,7 +1409,6 @@ def cc_CastOp : CCOp<"cast", [Pure]> { ); let results = (outs AnyType:$result); - let hasFolder = 1; let hasCanonicalizer = 1; let hasVerifier = 1; @@ -1551,7 +1548,9 @@ def cc_CallCallableOp : CCOp<"call_callable", [CallOpInterface]> { let arguments = (ins AnyCallableType:$callee, - Variadic:$args + Variadic:$args, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic:$results); let hasVerifier = 1; @@ -1560,7 +1559,17 @@ def cc_CallCallableOp : CCOp<"call_callable", [CallOpInterface]> { $callee (`,` $args^)? 
`:` functional-type(operands, results) attr-dict }]; + let builders = [ + OpBuilder<(ins "mlir::TypeRange":$result, "mlir::Value":$callee, + "mlir::ValueRange":$args), [{ + return build($_builder, $_state, result, callee, args, mlir::ArrayAttr{}, + mlir::ArrayAttr{}); + }]> + ]; + let extraClassDeclaration = [{ + static constexpr mlir::StringRef getCalleeAttrNameStr() { return "callee"; } + /// Get the argument operands to the called function. operand_range getArgOperands() { return {arg_operand_begin(), arg_operand_end()}; @@ -1569,9 +1578,18 @@ def cc_CallCallableOp : CCOp<"call_callable", [CallOpInterface]> { operand_iterator arg_operand_begin() { return ++operand_begin(); } operand_iterator arg_operand_end() { return operand_end(); } + mlir::MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } + /// Return the callee of this operation. mlir::CallInterfaceCallable getCallableForCallee() { return getCallee(); } + /// Set the callee for this operation. + void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + setOperand(0, mlir::cast(callee)); + } + mlir::FunctionType getFunctionType() { return mlir::FunctionType::get(getContext(), getOperands().getType(), getResults().getTypes()); @@ -1593,7 +1611,9 @@ def cc_CallIndirectCallableOp : let arguments = (ins cc_IndirectCallableType:$callee, - Variadic:$args + Variadic:$args, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic:$results); let hasVerifier = 1; @@ -1603,6 +1623,14 @@ def cc_CallIndirectCallableOp : $callee (`,` $args^)? `:` functional-type(operands, results) attr-dict }]; + let builders = [ + OpBuilder<(ins "mlir::TypeRange":$result, "mlir::Value":$callee, + "mlir::ValueRange":$args), [{ + return build($_builder, $_state, result, callee, args, mlir::ArrayAttr{}, + mlir::ArrayAttr{}); + }]> + ]; + let extraClassDeclaration = [{ /// Get the argument operands to the called function. 
operand_range getArgOperands() { @@ -1612,9 +1640,18 @@ def cc_CallIndirectCallableOp : operand_iterator arg_operand_begin() { return ++operand_begin(); } operand_iterator arg_operand_end() { return operand_end(); } + mlir::MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } + /// Return the callee of this operation. mlir::CallInterfaceCallable getCallableForCallee() { return getCallee(); } + /// Set the callee for this operation. + void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + setOperand(0, mlir::cast(callee)); + } + mlir::FunctionType getFunctionType() { return mlir::FunctionType::get(getContext(), getOperands().getType(), getResults().getTypes()); @@ -1789,7 +1826,9 @@ def cc_NoInlineCallOp : CCOp<"noinline_call", let arguments = (ins FlatSymbolRefAttr:$callee, - Variadic:$args + Variadic:$args, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic); @@ -1805,6 +1844,15 @@ def cc_NoInlineCallOp : CCOp<"noinline_call", operand_iterator arg_operand_begin() { return operand_begin(); } operand_iterator arg_operand_end() { return operand_end(); } + mlir::MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } + + /// Set the callee for this operation. + void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + setOperand(0, mlir::cast(callee)); + } + /// DO NOT RETURN the callee of this operation. This fools the inliner into /// not knowing what is actually called. 
mlir::CallInterfaceCallable getCallableForCallee() { @@ -1830,7 +1878,9 @@ def cc_DeviceCallOp : CCOp<"device_call", Variadic:$numBlocks, Variadic:$numThreadsPerBlock, Optional:$device, - Variadic:$args + Variadic:$args, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic); let assemblyFormat = [{ @@ -1845,18 +1895,20 @@ def cc_DeviceCallOp : CCOp<"device_call", OpBuilder<(ins "mlir::TypeRange":$resTys, "mlir::StringRef":$callee, "mlir::ValueRange":$values), [{ return build($_builder, $_state, resTys, callee, mlir::ValueRange{}, - mlir::ValueRange{}, mlir::Value{}, values); + mlir::ValueRange{}, mlir::Value{}, values, mlir::ArrayAttr{}, + mlir::ArrayAttr{}); }]>, OpBuilder<(ins "mlir::TypeRange":$resTys, "mlir::StringRef":$callee, "mlir::Value":$device, "mlir::ValueRange":$values), [{ return build($_builder, $_state, resTys, callee, mlir::ValueRange{}, - mlir::ValueRange{}, device, values); + mlir::ValueRange{}, device, values, mlir::ArrayAttr{}, + mlir::ArrayAttr{}); }]>, OpBuilder<(ins "mlir::TypeRange":$resTys, "mlir::StringRef":$callee, "mlir::ValueRange":$blocks, "mlir::ValueRange":$threads, "mlir::ValueRange":$values), [{ return build($_builder, $_state, resTys, callee, blocks, threads, - mlir::Value{}, values); + mlir::Value{}, values, mlir::ArrayAttr{}, mlir::ArrayAttr{}); }]> ]; @@ -1868,6 +1920,15 @@ def cc_DeviceCallOp : CCOp<"device_call", operand_iterator arg_operand_begin() { return operand_begin(); } operand_iterator arg_operand_end() { return operand_end(); } + mlir::MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } + + /// Set the callee for this operation. + void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + setOperand(0, mlir::cast(callee)); + } + /// Return the callee of this operation. 
mlir::CallInterfaceCallable getCallableForCallee() { return getCalleeAttr(); @@ -1912,7 +1973,9 @@ def cc_VarargCallOp : CCOp<"call_vararg", let arguments = (ins FlatSymbolRefAttr:$callee, - Variadic:$args + Variadic:$args, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic); @@ -1920,6 +1983,20 @@ def cc_VarargCallOp : CCOp<"call_vararg", $callee `(` $args `)` `:` functional-type(operands, results) attr-dict }]; + let builders = [ + OpBuilder<(ins "mlir::TypeRange":$result, "mlir::FlatSymbolRefAttr":$callee, + "mlir::ValueRange":$args), [{ + return build($_builder, $_state, result, callee, args, mlir::ArrayAttr{}, + mlir::ArrayAttr{}); + }]>, + OpBuilder<(ins "mlir::TypeRange":$result, "mlir::StringRef":$callee, + "mlir::ValueRange":$args), [{ + return build($_builder, $_state, result, + mlir::FlatSymbolRefAttr::get($_builder.getContext(), callee), args, + mlir::ArrayAttr{}, mlir::ArrayAttr{}); + }]> + ]; + let extraClassDeclaration = [{ operand_range getArgOperands() { return {arg_operand_begin(), arg_operand_end()}; @@ -1928,11 +2005,21 @@ def cc_VarargCallOp : CCOp<"call_vararg", operand_iterator arg_operand_begin() { return operand_begin(); } operand_iterator arg_operand_end() { return operand_end(); } + mlir::MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } + /// Return the callee of this operation. mlir::CallInterfaceCallable getCallableForCallee() { return getCalleeAttr(); } + /// Set the callee for this operation. 
+ void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + (*this)->setAttr(getCalleeAttrName(), + llvm::cast(callee)); + } + mlir::LogicalResult verifySymbolUses(mlir::SymbolTableCollection &); }]; } diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td index 15b533a6a8a..12a58032c64 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.td +++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.td @@ -311,21 +311,21 @@ def AnyStateInitLike : TypeConstraint; def AnyStateInitType : Type; -def IsStdvecTypePred : CPred<"$_self.isa<::cudaq::cc::StdvecType>()">; +def IsStdvecTypePred : CPred<"::mlir::isa<::cudaq::cc::StdvecType>($_self)">; class StdvecOf allowedTypes> : Type< And<[IsStdvecTypePred, Concat<"[](::mlir::Type elementType) { return ", SubstLeaves<"$_self", "elementType", AnyTypeOf.predicate>, - "; }($_self.cast<::cudaq::cc::StdvecType>().getElementType())">]>, + "; }(::mlir::cast<::cudaq::cc::StdvecType>($_self).getElementType())">]>, "stdvec of " # AnyTypeOf.summary # " values", "::cudaq::cc::StdvecType">; -def IsPointerTypePred : CPred<"$_self.isa<::cudaq::cc::PointerType>()">; +def IsPointerTypePred : CPred<"::mlir::isa<::cudaq::cc::PointerType>($_self)">; class PointerOf allowedTypes> : Type< And<[IsPointerTypePred, Concat<"[](::mlir::Type elementType) { return ", SubstLeaves<"$_self", "elementType", AnyTypeOf.predicate>, - "; }($_self.cast<::cudaq::cc::PointerType>().getElementType())">]>, + "; }(::mlir::cast<::cudaq::cc::PointerType>($_self).getElementType())">]>, "pointer of " # AnyTypeOf.summary # " values", "::cudaq::cc::PointerType">; diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeDialect.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeDialect.td index af6c0ec803e..98f24840960 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeDialect.td +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeDialect.td @@ -30,7 +30,6 @@ def QuakeDialect : Dialect { /// Register all Quake types. 
void registerTypes(); }]; - let useFoldAPI = kEmitFoldAdaptorFolder; } #endif // CUDAQ_OPTIMIZER_DIALECT_QUAKE_IR_QUAKE diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.h b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.h index 52755a6befe..42096690e80 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.h +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.h @@ -34,17 +34,18 @@ void getResetEffectsImpl( mlir::SmallVectorImpl< mlir::SideEffects::EffectInstance> &effects, - mlir::ValueRange targets); + llvm::MutableArrayRef targets); void getMeasurementEffectsImpl( mlir::SmallVectorImpl< mlir::SideEffects::EffectInstance> &effects, - mlir::ValueRange targets); + llvm::MutableArrayRef targets); void getOperatorEffectsImpl( mlir::SmallVectorImpl< mlir::SideEffects::EffectInstance> &effects, - mlir::ValueRange controls, mlir::ValueRange targets); + llvm::MutableArrayRef controls, + llvm::MutableArrayRef targets); mlir::ParseResult genericOpParse(mlir::OpAsmParser &parser, mlir::OperationState &result); diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td index bfe252c497c..35c44db9ef3 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td @@ -397,7 +397,9 @@ def quake_ApplyOp : QuakeOp<"apply", Variadic:$indirect_callee, // must be 0 or 1 element UnitAttr:$is_adj, Variadic:$controls, - Variadic:$actuals + Variadic:$actuals, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic); @@ -411,7 +413,7 @@ def quake_ApplyOp : QuakeOp<"apply", "mlir::ValueRange":$controls, "mlir::ValueRange":$args), [{ return build($_builder, $_state, retTy, callee, {}, is_adj, controls, - args); + args, {}, {}); }]>, OpBuilder<(ins "mlir::TypeRange":$retTy, "mlir::SymbolRefAttr":$callee, @@ -419,7 +421,7 @@ def quake_ApplyOp : QuakeOp<"apply", "mlir::ValueRange":$controls, "mlir::ValueRange":$args), [{ return 
build($_builder, $_state, retTy, callee, {}, is_adj, controls, - args); + args, {}, {}); }]>, OpBuilder<(ins "mlir::TypeRange":$retTy, "mlir::Value":$callable, @@ -427,7 +429,7 @@ def quake_ApplyOp : QuakeOp<"apply", "mlir::ValueRange":$controls, "mlir::ValueRange":$args), [{ return build($_builder, $_state, retTy, mlir::SymbolRefAttr{}, - mlir::ValueRange{callable}, is_adj, controls, args); + mlir::ValueRange{callable}, is_adj, controls, args, {}, {}); }]>, OpBuilder<(ins "mlir::TypeRange":$retTy, "mlir::Value":$callable, @@ -435,7 +437,7 @@ def quake_ApplyOp : QuakeOp<"apply", "mlir::ValueRange":$controls, "mlir::ValueRange":$args), [{ return build($_builder, $_state, retTy, mlir::SymbolRefAttr{}, - mlir::ValueRange{callable}, is_adj, controls, args); + mlir::ValueRange{callable}, is_adj, controls, args, {}, {}); }]> ]; @@ -451,6 +453,14 @@ def quake_ApplyOp : QuakeOp<"apply", return {getActuals().begin(), getActuals().end()}; } + mlir::MutableOperandRange getArgOperandsMutable() { + auto range0 = getODSOperandIndexAndLength(0); + auto range2 = getODSOperandIndexAndLength(2); + auto mutableRange = ::mlir::MutableOperandRange(getOperation(), + range0.first, range2.second); + return mutableRange; + } + bool applyToVariant() { return getIsAdj() || !getControls().empty(); } @@ -461,6 +471,12 @@ def quake_ApplyOp : QuakeOp<"apply", return (*this)->getAttrOfType(getCalleeAttrName()); return getIndirectCallee().front(); } + + /// Set the callee for this operation. 
+ void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + (*this)->setAttr(getCalleeAttrName(), + llvm::cast(callee)); + } }]; } @@ -629,7 +645,9 @@ def quake_CallByRefOp : QuakeOp<"call_by_ref", [CallOpInterface]> { let arguments = (ins SymbolRefAttr:$callee, - Variadic:$args + Variadic:$args, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs ); let results = (outs Variadic); @@ -637,16 +655,33 @@ def quake_CallByRefOp : QuakeOp<"call_by_ref", [CallOpInterface]> { $callee `(` $args `)` `:` functional-type(operands, results) attr-dict }]; + let builders = [ + OpBuilder<(ins "mlir::SymbolRefAttr":$callee, + "mlir::TypeRange":$results, + "mlir::ValueRange":$args), [{ + return build($_builder, $_state, results, callee, args, {}, {}); + }]> + ]; + let hasVerifier = 1; let extraClassDeclaration = [{ operand_range getArgOperands() { return {operand_begin(), operand_end()}; } - + + mlir::MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } + mlir::CallInterfaceCallable getCallableForCallee() { return (*this)->getAttrOfType(getCalleeAttrName()); } + + void setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + (*this)->setAttr(getCalleeAttrName(), + llvm::cast(callee)); + } }]; } @@ -1050,7 +1085,7 @@ def quake_ResetOp : QuakeOp<"reset", [QuantumGate, let extraClassDeclaration = [{ void getEffectsImpl(mlir::SmallVectorImpl> &effects) { - quake::getResetEffectsImpl(effects, getTargets()); + quake::getResetEffectsImpl(effects, getTargetsMutable()); } }]; } @@ -1087,7 +1122,7 @@ class Measurement : QuakeOp> &effects) { - quake::getMeasurementEffectsImpl(effects, getTargets()); + quake::getMeasurementEffectsImpl(effects, getTargetsMutable()); } }]; @@ -1252,7 +1287,7 @@ class QuakeOperator traits = [], void getEffectsImpl(mlir::SmallVectorImpl> &effects) { - quake::getOperatorEffectsImpl(effects, getControls(), getTargets()); + quake::getOperatorEffectsImpl(effects, getControlsMutable(), getTargetsMutable()); } 
//===------------------------------------------------------------------===// @@ -1415,7 +1450,7 @@ def quake_ExpPauliOp : QuakeOp<"exp_pauli", void getEffectsImpl(mlir::SmallVectorImpl> &effects) { - quake::getOperatorEffectsImpl(effects, getControls(), getTargets()); + quake::getOperatorEffectsImpl(effects, getControlsMutable(), getTargetsMutable()); } //===------------------------------------------------------------------===// diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h index e9000d6421b..32ef9de4969 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.h +++ b/include/cudaq/Optimizer/Transforms/Passes.h @@ -12,6 +12,11 @@ // These transforms can generally be thought of as "optimizations" or "rewrites" // on the IR. +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" @@ -47,7 +52,7 @@ void createTargetFinalizePipeline(mlir::OpPassManager &pm); /// crashes. void addDecomposition(mlir::OpPassManager &pm, mlir::ArrayRef enabledPats, - mlir::ArrayRef disabledPats = std::nullopt); + mlir::ArrayRef disabledPats = {}); void registerAOTPipelines(); void registerJITPipelines(); @@ -76,9 +81,6 @@ createQuakeSynthesizer(std::string_view, const void *, std::unique_ptr createPySynthCallableBlockArgs(const llvm::SmallVector &, bool removeBlockArg = false); -inline std::unique_ptr createPySynthCallableBlockArgs() { - return createPySynthCallableBlockArgs({}, false); -} /// Helper function to build an argument synthesis pass. 
The names of the /// functions and the substitutions text can be built as an unzipped pair of diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 910eef5142a..df5e413e68e 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -154,6 +154,7 @@ def BasisConversion : Pass<"basis-conversion", "mlir::ModuleOp"> { If no `basis` is specified or the pass cannot decompose all operations to the specified basis, the pass application will fail. }]; + let dependentDialects = ["mlir::arith::ArithDialect"]; let options = [ ListOption<"basis", "basis", "std::string", "Set of basis operations">, ListOption<"disabledPatterns", "disable-patterns", "std::string", @@ -317,6 +318,7 @@ def Decomposition : Pass<"decomposition", "mlir::ModuleOp"> { means no decomposition will take place under the presence of controlled `quake.apply` operations in the module. }]; + let dependentDialects = ["mlir::arith::ArithDialect"]; let options = [ ListOption<"basis", "basis", "std::string", "Set of basis operations">, ListOption<"disabledPatterns", "disable-patterns", "std::string", @@ -543,7 +545,9 @@ def GenerateKernelExecution : Pass<"kernel-execution", "mlir::ModuleOp"> { constants) regardless of the kernel's (semantically correct) signature. }]; - let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; + let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect", + "mlir::func::FuncDialect", "mlir::cf::ControlFlowDialect", + "mlir::arith::ArithDialect"]; let options = [ Option<"outputFilename", "output-filename", "std::string", @@ -656,6 +660,9 @@ def LambdaLifting : Pass<"lambda-lifting", "mlir::ModuleOp"> { before the loop is analyzed. 
}]; + let dependentDialects = ["mlir::cf::ControlFlowDialect", + "mlir::func::FuncDialect"]; + let options = [ Option<"constantPropagation", "constant-prop", "bool", /*default=*/"false", "Enable specialization and constant propagation into lifted lambdas."> @@ -1060,6 +1067,10 @@ def QuakeSynthesize : Pass<"quake-synth", "mlir::ModuleOp"> { runtime values. }]; + let dependentDialects = ["mlir::arith::ArithDialect", + "mlir::complex::ComplexDialect", + "cudaq::cc::CCDialect", "mlir::math::MathDialect"]; + let constructor = "cudaq::opt::createQuakeSynthesizer()"; } @@ -1381,6 +1392,7 @@ def UnitarySynthesis : Pass<"unitary-synthesis", "mlir::ModuleOp"> { } ``` }]; + let dependentDialects = ["mlir::arith::ArithDialect", "mlir::func::FuncDialect"]; } def UnwindLowering : Pass<"unwind-lowering", "mlir::func::FuncOp"> { @@ -1406,6 +1418,9 @@ def UnwindLowering : Pass<"unwind-lowering", "mlir::func::FuncOp"> { The lower to CFG pass removed all structured operations from a function, lowering the body of the function completely to a primitive CFG. }]; + + let dependentDialects = ["mlir::cf::ControlFlowDialect", + "quake::QuakeDialect", "cudaq::cc::CCDialect"]; } def UpdateRegisterNames : Pass<"update-register-names"> { diff --git a/lib/Frontend/nvqpp/ASTBridge.cpp b/lib/Frontend/nvqpp/ASTBridge.cpp index 724b13e16e7..e6b5b46a64f 100644 --- a/lib/Frontend/nvqpp/ASTBridge.cpp +++ b/lib/Frontend/nvqpp/ASTBridge.cpp @@ -91,12 +91,6 @@ trimmedMangledTypeName(clang::QualType ty, return s; } -static std::string -trimmedMangledTypeName(const clang::Type *ty, - clang::ItaniumMangleContext *mangler) { - return trimmedMangledTypeName(clang::QualType(ty, /*Quals=*/0), mangler); -} - std::string cudaq::details::getTagNameOfFunctionDecl(const clang::FunctionDecl *func, clang::ItaniumMangleContext *mangler) { @@ -108,8 +102,10 @@ cudaq::details::getTagNameOfFunctionDecl(const clang::FunctionDecl *func, // template T operator()(args...) { ... 
} // }; // cudaq::get_class_kernel_name(); - auto name = "instance_" + - trimmedMangledTypeName(cxxCls->getTypeForDecl(), mangler); + auto name = + "instance_" + + trimmedMangledTypeName( + mangler->getASTContext().getCanonicalTagType(cxxCls), mangler); assert(cxxMethod->getTemplateSpecializationArgs()); for (auto &templArg : cxxMethod->getTemplateSpecializationArgs()->asArray()) @@ -120,7 +116,8 @@ cudaq::details::getTagNameOfFunctionDecl(const clang::FunctionDecl *func, } // Member function, but not a template function. // cudaq::get_class_kernel_name(); - auto name = trimmedMangledTypeName(cxxCls->getTypeForDecl(), mangler); + auto name = trimmedMangledTypeName( + mangler->getASTContext().getCanonicalTagType(cxxCls), mangler); LLVM_DEBUG(llvm::dbgs() << "member name is: " << name << '\n'); return name; } @@ -324,9 +321,8 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor { bool VisitVarDecl(clang::VarDecl *x) { if (isTupleReverseVar(x)) { - auto loc = x->getLocation(); - auto opt = x->getAnyInitializer()->getIntegerConstantExpr( - x->getASTContext(), &loc, false); + auto opt = + x->getAnyInitializer()->getIntegerConstantExpr(x->getASTContext()); if (opt) { LLVM_DEBUG(llvm::dbgs() << "tuples are reversed: " << *opt << '\n'); tuplesAreReversed = !opt->isZero(); @@ -335,9 +331,8 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor { if (cudaq::isInNamespace(x, "cudaq") && cudaq::isInNamespace(x, "details") && x->getName() == "_nvqpp_sizeof") { // This constexpr is the sizeof a pauli_word and a std::string. 
- auto loc = x->getLocation(); - auto opt = x->getAnyInitializer()->getIntegerConstantExpr( - x->getASTContext(), &loc, false); + auto opt = + x->getAnyInitializer()->getIntegerConstantExpr(x->getASTContext()); assert(opt && "must compute the sizeof a cudaq::pauli_word"); auto sizeofString = opt->getZExtValue(); auto sizeAttr = module->getAttr(cudaq::runtime::sizeofStringAttrName); @@ -359,8 +354,8 @@ class QPUCodeFinder : public clang::RecursiveASTVisitor { if (auto *id = decl->getIdentifier()) { auto name = id->getName(); if (name == "qubit" || name == "qudit" || name == "qspan" || - name.startswith("qreg") || name.startswith("qvector") || - name.startswith("qarray") || name.startswith("qview")) + name.starts_with("qreg") || name.starts_with("qvector") || + name.starts_with("qarray") || name.starts_with("qview")) cudaq::details::reportClangError( x, mangler, "may not use quantum types in non-kernel functions"); @@ -511,8 +506,8 @@ void ASTBridgeAction::ASTBridgeConsumer::addFunctionDecl( isa(funcDecl) && !funcDecl->isStatic(); FunctionType hostFuncTy = opt::factory::toHostSideFuncType(funcTy, addThisPtr, *module); - auto func = build.create(loc, funcName, hostFuncTy, - ArrayRef{}); + auto func = func::FuncOp::create(build, loc, funcName, hostFuncTy, + ArrayRef{}); if (!addThisPtr) func->setAttr("no_this", build.getUnitAttr()); @@ -527,8 +522,8 @@ void ASTBridgeAction::ASTBridgeConsumer::addFunctionDecl( build.setInsertionPointToStart(block); SmallVector results; for (auto resTy : hostFuncTy.getResults()) - results.push_back(build.create(loc, resTy)); - build.create(loc, results); + results.push_back(cc::UndefOp::create(build, loc, resTy)); + func::ReturnOp::create(build, loc, results); } // Walk the arguments and add byval attributes where needed. 
@@ -710,7 +705,7 @@ std::string getCxxMangledTypeName(clang::QualType ty, clang::ItaniumMangleContext *mangler) { std::string s; llvm::raw_string_ostream os(s); - mangler->mangleTypeName(ty, os); + mangler->mangleCanonicalTypeName(ty, os); os.flush(); LLVM_DEBUG(llvm::dbgs() << "type name mangled as '" << s << "'\n"); return s; diff --git a/lib/Frontend/nvqpp/ConvertDecl.cpp b/lib/Frontend/nvqpp/ConvertDecl.cpp index fd01e4b5ec4..d546229d2c4 100644 --- a/lib/Frontend/nvqpp/ConvertDecl.cpp +++ b/lib/Frontend/nvqpp/ConvertDecl.cpp @@ -97,9 +97,9 @@ void QuakeBridgeVisitor::addArgumentSymbols( quake::VeqType, quake::WireType>(parmTy)) { symbolTable.insert(name, entryBlock->getArgument(index)); } else { - auto stackSlot = builder.create(loc, parmTy); - builder.create(loc, entryBlock->getArgument(index), - stackSlot); + auto stackSlot = cc::AllocaOp::create(builder, loc, parmTy); + cc::StoreOp::create(builder, loc, entryBlock->getArgument(index), + stackSlot); symbolTable.insert(name, stackSlot); } } @@ -447,8 +447,10 @@ bool QuakeBridgeVisitor::TraverseFunctionDecl(clang::FunctionDecl *x) { skipCompoundScope = true; // Visit the trailing requires clause, if any. 
- if (auto *trailingRequiresClause = x->getTrailingRequiresClause()) - if (!TraverseStmt(trailingRequiresClause)) + if (const auto &trailingRequiresClause = x->getTrailingRequiresClause(); + trailingRequiresClause.ConstraintExpr) + if (!TraverseStmt( + const_cast(trailingRequiresClause.ConstraintExpr))) return false; if (auto *ctor = dyn_cast(x)) { @@ -499,8 +501,8 @@ bool QuakeBridgeVisitor::TraverseFunctionDecl(clang::FunctionDecl *x) { auto loc = toLocation(x); SmallVector dummyResults; for (auto ty : funcTy.getResults()) - dummyResults.push_back(builder.create(loc, ty)); - builder.create(loc, dummyResults); + dummyResults.push_back(cc::UndefOp::create(builder, loc, ty)); + func::ReturnOp::create(builder, loc, dummyResults); } builder.clearInsertionPoint(); return true; @@ -516,7 +518,7 @@ bool QuakeBridgeVisitor::VisitCXXScalarValueInitExpr( if (ptrTy.getElementType() == ty) { auto v = popValue(); auto loc = toLocation(x); - return pushValue(builder.create(loc, v)); + return pushValue(cc::LoadOp::create(builder, loc, v)); } return true; } @@ -558,13 +560,13 @@ bool QuakeBridgeVisitor::VisitFunctionDecl(clang::FunctionDecl *x) { return false; } } - return pushValue(builder.create(loc, fTy, fSym)); + return pushValue(func::ConstantOp::create(builder, loc, fTy, fSym)); } auto [funcOp, alreadyAdded] = getOrAddFunc(loc, kernName, typeFromStack); if (!alreadyAdded) funcOp.setPrivate(); - return pushValue(builder.create( - loc, funcOp.getFunctionType(), funcOp.getSymNameAttr())); + return pushValue(func::ConstantOp::create( + builder, loc, funcOp.getFunctionType(), funcOp.getSymNameAttr())); } bool QuakeBridgeVisitor::VisitNamedDecl(clang::NamedDecl *x) { @@ -692,12 +694,12 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { qreg = popValue(); } else { // this is a qreg q; - auto qregSizeVal = builder.create( - loc, qregSize, builder.getIntegerType(64)); + auto qregSizeVal = mlir::arith::ConstantIntOp::create( + builder, loc, builder.getIntegerType(64), 
qregSize); if (qregSize != 0) - qreg = builder.create(loc, qType); + qreg = quake::AllocaOp::create(builder, loc, qType); else - qreg = builder.create(loc, qType, qregSizeVal); + qreg = quake::AllocaOp::create(builder, loc, qType, qregSizeVal); } symbolTable.insert(name, qreg); // allocated_qreg_names.push_back(name); @@ -710,12 +712,12 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { symbolTable.insert(name, peekValue()); return true; } - auto zero = builder.create( - loc, 0, builder.getIntegerType(64)); - auto qregSizeOne = builder.create( - loc, quake::VeqType::get(builder.getContext(), 1)); + auto zero = mlir::arith::ConstantIntOp::create( + builder, loc, builder.getIntegerType(64), 0); + auto qregSizeOne = quake::AllocaOp::create( + builder, loc, quake::VeqType::get(builder.getContext(), 1)); Value addressTheQubit = - builder.create(loc, qregSizeOne, zero); + quake::ExtractRefOp::create(builder, loc, qregSizeOne, zero); symbolTable.insert(name, addressTheQubit); return pushValue(addressTheQubit); } @@ -807,7 +809,7 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { // slot in which to save the value. This stack slot is the variable in the // memory domain. if (!x->getInit() || x->isCXXForRangeDecl()) { - Value alloca = builder.create(loc, type); + Value alloca = cc::AllocaOp::create(builder, loc, type); symbolTable.insert(x->getName(), alloca); return pushValue(alloca); } @@ -826,16 +828,16 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { if (initValue.getType().getIntOrFloatBitWidth() < type.getIntOrFloatBitWidth()) { // FIXME: Use zero-extend if this is unsigned! 
- initValue = builder.create( - loc, type, initValue, cudaq::cc::CastOpMode::Signed); + initValue = cudaq::cc::CastOp::create(builder, loc, type, initValue, + cudaq::cc::CastOpMode::Signed); } else if (initValue.getType().getIntOrFloatBitWidth() > type.getIntOrFloatBitWidth()) { - initValue = builder.create(loc, type, initValue); + initValue = cudaq::cc::CastOp::create(builder, loc, type, initValue); } } else if (isa(initValue.getType()) && isa(type)) { // FIXME: Use UIToFP if this is unsigned! - initValue = builder.create( - loc, type, initValue, cudaq::cc::CastOpMode::Signed); + initValue = cudaq::cc::CastOp::create(builder, loc, type, initValue, + cudaq::cc::CastOpMode::Signed); } if (auto initObject = initValue.getDefiningOp()) { @@ -861,7 +863,7 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { if (isStdvecBoolReference(qualTy) || qualTy.getTypePtr()->isReferenceType()) { // A similar case is when the C++ variable is a reference to a subobject. assert(isa(type)); - Value cast = builder.create(loc, type, initValue); + Value cast = cc::CastOp::create(builder, loc, type, initValue); symbolTable.insert(x->getName(), cast); return pushValue(cast); } @@ -874,8 +876,8 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { // Initialization expression resulted in a value. Create a variable and save // that value to the variable's memory address. 
- Value alloca = builder.create(loc, type); - builder.create(loc, initValue, alloca); + Value alloca = cc::AllocaOp::create(builder, loc, type); + cc::StoreOp::create(builder, loc, initValue, alloca); symbolTable.insert(x->getName(), alloca); return pushValue(alloca); } diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp index 70aaf25f990..47bc24e0cad 100644 --- a/lib/Frontend/nvqpp/ConvertExpr.cpp +++ b/lib/Frontend/nvqpp/ConvertExpr.cpp @@ -13,6 +13,8 @@ #include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "llvm/Support/Debug.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Math/IR/Math.h" #define DEBUG_TYPE "lower-ast-expr" @@ -74,25 +76,27 @@ maybeUnpackOperands(OpBuilder &builder, Location loc, ValueRange operands, if (isa(last_target.getType())) { // Split the vector. Last `targetCount` are targets, front `N-targetCount` // are controls. - auto vecSize = builder.create( - loc, builder.getIntegerType(64), targets); - auto size = builder.create( - loc, builder.getI64Type(), vecSize, cudaq::cc::CastOpMode::Unsigned); - - auto numTargets = - builder.create(loc, targetCount, 64); - auto offset = builder.create(loc, size, numTargets); - auto zero = builder.create(loc, 0, 64); - auto last = builder.create(loc, offset, numTargets); + auto vecSize = quake::VeqSizeOp::create( + builder, loc, builder.getIntegerType(64), targets); + auto size = + cudaq::cc::CastOp::create(builder, loc, builder.getI64Type(), vecSize, + cudaq::cc::CastOpMode::Unsigned); + + auto numTargets = arith::ConstantIntOp::create( + builder, loc, builder.getI64Type(), targetCount); + auto offset = arith::SubIOp::create(builder, loc, size, numTargets); + auto zero = + arith::ConstantIntOp::create(builder, loc, builder.getI64Type(), 0); + auto last = arith::SubIOp::create(builder, loc, offset, numTargets); // The canonicalizer will compute a constant size, if possible. 
auto unsizedVeqTy = quake::VeqType::getUnsized(builder.getContext()); // Get the subvector of all targets - Value targetSubveq = builder.create( - loc, unsizedVeqTy, last_target, zero, offset); + Value targetSubveq = quake::SubVeqOp::create(builder, loc, unsizedVeqTy, + last_target, zero, offset); // Get the subvector of all qubits excluding the last one: controls. - Value ctrlSubveq = builder.create(loc, unsizedVeqTy, - last_target, zero, last); + Value ctrlSubveq = quake::SubVeqOp::create(builder, loc, unsizedVeqTy, + last_target, zero, last); return std::make_pair(SmallVector{targetSubveq}, SmallVector{ctrlSubveq}); } @@ -155,11 +159,11 @@ bool buildOp(OpBuilder &builder, Location loc, ValueRange operands, negatedControlsAttribute(builder.getContext(), ctrls, negations); if (ctrls.empty()) for (auto t : target) - builder.create(loc, isAdjoint, params, ctrls, t, negs); + A::create(builder, loc, isAdjoint, params, ctrls, t, negs); else { assert(target.size() == 1 && "can only have a single target with control qubits."); - builder.create(loc, isAdjoint, params, ctrls, target, negs); + A::create(builder, loc, isAdjoint, params, ctrls, target, negs); } } else { assert(operands.size() >= 1 && "must be at least 1 operand"); @@ -168,15 +172,15 @@ bool buildOp(OpBuilder &builder, Location loc, ValueRange operands, if (!negations.empty()) reportNegateError(); Type i64Ty = builder.getI64Type(); - auto size = builder.create( - loc, builder.getIntegerType(64), target); - Value rank = builder.create( - loc, i64Ty, size, cudaq::cc::CastOpMode::Unsigned); + auto size = quake::VeqSizeOp::create(builder, loc, + builder.getIntegerType(64), target); + Value rank = cudaq::cc::CastOp::create(builder, loc, i64Ty, size, + cudaq::cc::CastOpMode::Unsigned); auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value ref = builder.create(loc, target, - block.getArgument(0)); - builder.create(loc, ValueRange(), ref); + Value ref = 
quake::ExtractRefOp::create(builder, loc, target, + block.getArgument(0)); + A::create(builder, loc, ValueRange(), ref); }; cudaq::opt::factory::createInvariantLoop(builder, loc, rank, bodyBuilder); } else { @@ -190,12 +194,12 @@ bool buildOp(OpBuilder &builder, Location loc, ValueRange operands, if (ctrls.empty()) // May have multiple targets, but no controls, op(q, r, s, ...) for (auto t : target) - builder.create(loc, isAdjoint, ValueRange(), ValueRange(), t, - negs); + A::create(builder, loc, isAdjoint, ValueRange(), ValueRange(), t, + negs); else { assert(target.size() == 1 && "can only have a single target with control qubits."); - builder.create(loc, isAdjoint, ValueRange(), ctrls, target, negs); + A::create(builder, loc, isAdjoint, ValueRange(), ctrls, target, negs); } } } @@ -204,14 +208,14 @@ bool buildOp(OpBuilder &builder, Location loc, ValueRange operands, static Value getConstantInt(OpBuilder &builder, Location loc, const uint64_t value, const int bitwidth) { - return builder.create(loc, value, - builder.getIntegerType(bitwidth)); + return arith::ConstantIntOp::create(builder, loc, + builder.getIntegerType(bitwidth), value); } static Value getConstantInt(OpBuilder &builder, Location loc, const uint64_t value, Type intTy) { assert(isa(intTy)); - return builder.create(loc, value, intTy); + return arith::ConstantIntOp::create(builder, loc, intTy, value); } template isUnsignedIntegerOrEnumerationType()) ? cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; - lhs = builder.create(loc, rhs.getType(), lhs, mode); + lhs = cudaq::cc::CastOp::create(builder, loc, rhs.getType(), lhs, mode); return; } auto mode = (rhsType && rhsType->isUnsignedIntegerOrEnumerationType()) ? 
cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; - rhs = builder.create(loc, lhs.getType(), rhs, mode); + rhs = cudaq::cc::CastOp::create(builder, loc, lhs.getType(), rhs, mode); return; } if (isa(lhsTy) && isa(rhsTy)) { if (lhsTy.getIntOrFloatBitWidth() < rhsTy.getIntOrFloatBitWidth()) { - lhs = builder.create(loc, rhs.getType(), lhs); + lhs = cudaq::cc::CastOp::create(builder, loc, rhs.getType(), lhs); return; } - rhs = builder.create(loc, lhs.getType(), rhs); + rhs = cudaq::cc::CastOp::create(builder, loc, lhs.getType(), rhs); return; } if (isa(lhsTy) && isa(rhsTy)) { auto mode = (rhsType && rhsType->isUnsignedIntegerOrEnumerationType()) ? cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; - rhs = builder.create(loc, lhs.getType(), rhs, mode); + rhs = cudaq::cc::CastOp::create(builder, loc, lhs.getType(), rhs, mode); return; } if (isa(lhsTy) && isa(rhsTy)) { auto mode = (lhsType && lhsType->isUnsignedIntegerOrEnumerationType()) ? cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; - lhs = builder.create(loc, rhs.getType(), lhs, mode); + lhs = cudaq::cc::CastOp::create(builder, loc, rhs.getType(), lhs, mode); return; } TODO_loc(loc, "conversion of operands in binary expression"); @@ -346,7 +350,7 @@ bool QuakeBridgeVisitor::VisitArraySubscriptExpr(clang::ArraySubscriptExpr *x) { return eleTy; }(); auto elePtrTy = cc::PointerType::get(arrEleTy); - return pushValue(builder.create(loc, elePtrTy, lhs, rhs)); + return pushValue(cc::ComputePtrOp::create(builder, loc, elePtrTy, lhs, rhs)); } bool QuakeBridgeVisitor::VisitFloatingLiteral(clang::FloatingLiteral *x) { @@ -380,7 +384,7 @@ bool QuakeBridgeVisitor::VisitCharacterLiteral(clang::CharacterLiteral *x) { auto intTy = builtinTypeToType(cast(x->getType().getTypePtr())); auto intVal = x->getValue(); - return pushValue(builder.create(loc, intVal, intTy)); + return pushValue(arith::ConstantIntOp::create(builder, loc, intTy, intVal)); } bool 
QuakeBridgeVisitor::VisitUnaryOperator(clang::UnaryOperator *x) { @@ -388,63 +392,63 @@ bool QuakeBridgeVisitor::VisitUnaryOperator(clang::UnaryOperator *x) { switch (x->getOpcode()) { case clang::UnaryOperatorKind::UO_PostInc: { auto var = popValue(); - auto loaded = builder.create(loc, var); - auto incremented = builder.create( - loc, loaded, + auto loaded = cc::LoadOp::create(builder, loc, var); + auto incremented = arith::AddIOp::create( + builder, loc, loaded, getConstantInt(builder, loc, 1, loaded.getType().getIntOrFloatBitWidth())); - builder.create(loc, incremented, var); + cc::StoreOp::create(builder, loc, incremented, var); return pushValue(loaded); } case clang::UnaryOperatorKind::UO_PreInc: { auto var = popValue(); - auto loaded = builder.create(loc, var); - auto incremented = builder.create( - loc, loaded, + auto loaded = cc::LoadOp::create(builder, loc, var); + auto incremented = arith::AddIOp::create( + builder, loc, loaded, getConstantInt(builder, loc, 1, loaded.getType().getIntOrFloatBitWidth())); - builder.create(loc, incremented, var); + cc::StoreOp::create(builder, loc, incremented, var); return pushValue(incremented); } case clang::UnaryOperatorKind::UO_PostDec: { auto var = popValue(); - auto loaded = builder.create(loc, var); - auto decremented = builder.create( - loc, loaded, + auto loaded = cc::LoadOp::create(builder, loc, var); + auto decremented = arith::SubIOp::create( + builder, loc, loaded, getConstantInt(builder, loc, 1, loaded.getType().getIntOrFloatBitWidth())); - builder.create(loc, decremented, var); + cc::StoreOp::create(builder, loc, decremented, var); return pushValue(loaded); } case clang::UnaryOperatorKind::UO_PreDec: { auto var = popValue(); - auto loaded = builder.create(loc, var); - auto decremented = builder.create( - loc, loaded, + auto loaded = cc::LoadOp::create(builder, loc, var); + auto decremented = arith::SubIOp::create( + builder, loc, loaded, getConstantInt(builder, loc, 1, 
loaded.getType().getIntOrFloatBitWidth())); - builder.create(loc, decremented, var); + cc::StoreOp::create(builder, loc, decremented, var); return pushValue(decremented); } case clang::UnaryOperatorKind::UO_LNot: { auto var = popValue(); - auto zero = builder.create(loc, 0, var.getType()); - Value unaryNot = - builder.create(loc, arith::CmpIPredicate::eq, var, zero); + auto zero = arith::ConstantIntOp::create(builder, loc, var.getType(), 0); + Value unaryNot = arith::CmpIOp::create(builder, loc, + arith::CmpIPredicate::eq, var, zero); return pushValue(unaryNot); } case clang::UnaryOperatorKind::UO_Minus: { auto subExpr = popValue(); auto resTy = subExpr.getType(); if (isa(resTy)) - return pushValue(builder.create( - loc, subExpr, + return pushValue(arith::MulIOp::create( + builder, loc, subExpr, getConstantInt(builder, loc, -1, resTy.getIntOrFloatBitWidth()))); if (isa(resTy)) { auto neg_one = opt::factory::createFloatConstant(loc, builder, -1.0, cast(resTy)); - return pushValue(builder.create(loc, subExpr, neg_one)); + return pushValue(arith::MulFOp::create(builder, loc, subExpr, neg_one)); } TODO_x(loc, x, mangler, "unknown type for unary minus"); return false; @@ -452,7 +456,7 @@ bool QuakeBridgeVisitor::VisitUnaryOperator(clang::UnaryOperator *x) { case clang::UnaryOperatorKind::UO_Deref: { auto subExpr = popValue(); assert(isa(subExpr.getType())); - return pushValue(builder.create(loc, subExpr)); + return pushValue(cc::LoadOp::create(builder, loc, subExpr)); } case clang::UnaryOperatorKind::UO_AddrOf: { auto subExpr = peekValue(); @@ -478,7 +482,7 @@ Value QuakeBridgeVisitor::floatingPointCoercion(Location loc, Type toType, if (toType == fromType) return value; assert(isa(fromType) && isa(toType)); - return builder.create(loc, toType, value); + return cudaq::cc::CastOp::create(builder, loc, toType, value); } Value QuakeBridgeVisitor::integerCoercion(Location loc, @@ -493,10 +497,10 @@ Value QuakeBridgeVisitor::integerCoercion(Location loc, auto mode = 
(clangTy->isUnsignedIntegerOrEnumerationType()) ? cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; - return builder.create(loc, dstTy, srcVal, mode); + return cudaq::cc::CastOp::create(builder, loc, dstTy, srcVal, mode); } assert(fromTy.getIntOrFloatBitWidth() > dstTy.getIntOrFloatBitWidth()); - return builder.create(loc, dstTy, srcVal); + return cudaq::cc::CastOp::create(builder, loc, dstTy, srcVal); } /// Generalized kernel argument morphing. When traversing the AST, the calling @@ -520,17 +524,17 @@ SmallVector QuakeBridgeVisitor::convertKernelArgs( auto eleTy = ptrTy.getElementType(); if (eleTy == kTy) { // Promote pass-by-reference to pass-by-value. - auto load = builder.create(loc, v); + auto load = cudaq::cc::LoadOp::create(builder, loc, v); result.push_back(load); continue; } // We've passed clang++'s semantics checks but the types are distinct. if (isa(kTy)) { - result.push_back(builder.create(loc, kTy, v)); + result.push_back(cudaq::cc::CastOp::create(builder, loc, kTy, v)); continue; } - auto load = builder.create(loc, v); + auto load = cudaq::cc::LoadOp::create(builder, loc, v); auto loadTy = load.getType(); Value castTo; if (isa(loadTy) && isa(kTy)) { @@ -550,7 +554,7 @@ SmallVector QuakeBridgeVisitor::convertKernelArgs( // Both are Veq but the Veq are not identical. If the callee has a // dynamic size, we can relax the size from the calling context. if (vVecTy.hasSpecifiedSize() && !kVecTy.hasSpecifiedSize()) { - auto relax = builder.create(loc, kVecTy, v); + auto relax = quake::RelaxSizeOp::create(builder, loc, kVecTy, v); result.push_back(relax); continue; } @@ -590,7 +594,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { clang::QualType srcTy = x->getSubExpr()->getType(); // Check for and handle reference to integer cases. 
if (isa(mlirVal.getType())) - mlirVal = builder.create(loc, mlirVal); + mlirVal = cudaq::cc::LoadOp::create(builder, loc, mlirVal); return pushValue(integerCoercion(locSub, srcTy, castToTy, mlirVal)); }; @@ -601,7 +605,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { } case clang::CastKind::CK_BitCast: { auto value = popValue(); - return pushValue(builder.create(loc, castToTy, value)); + return pushValue(cudaq::cc::CastOp::create(builder, loc, castToTy, value)); } case clang::CastKind::CK_FloatingCast: { [[maybe_unused]] auto dstType = x->getType(); @@ -613,7 +617,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { assert(toType && fromType); if (toType == fromType) return pushValue(value); - return pushValue(builder.create(loc, toType, value)); + return pushValue(cudaq::cc::CastOp::create(builder, loc, toType, value)); } case clang::CastKind::CK_IntegralCast: { auto locSub = toLocation(x->getSubExpr()); @@ -633,7 +637,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { ? cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; return pushValue( - builder.create(loc, castToTy, popValue(), mode)); + cudaq::cc::CastOp::create(builder, loc, castToTy, popValue(), mode)); } case clang::CastKind::CK_IntegralToFloating: { auto mode = @@ -641,20 +645,20 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { ? 
cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed; return pushValue( - builder.create(loc, castToTy, popValue(), mode)); + cudaq::cc::CastOp::create(builder, loc, castToTy, popValue(), mode)); } case clang::CastKind::CK_IntegralToBoolean: { auto last = popValue(); - Value zero = builder.create(loc, 0, last.getType()); - return pushValue(builder.create( - loc, arith::CmpIPredicate::ne, last, zero)); + Value zero = arith::ConstantIntOp::create(builder, loc, last.getType(), 0); + return pushValue(arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::ne, last, zero)); } case clang::CastKind::CK_FloatingToBoolean: { auto last = popValue(); Value zero = opt::factory::createFloatConstant( loc, builder, 0.0, cast(last.getType())); - return pushValue(builder.create( - loc, arith::CmpFPredicate::UNE, last, zero)); + return pushValue(arith::CmpFOp::create( + builder, loc, arith::CmpFPredicate::UNE, last, zero)); } case clang::CastKind::CK_UserDefinedConversion: { auto sub = popValue(); @@ -695,7 +699,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { if (isa(castToTy)) if (auto ptrTy = dyn_cast(peekValue().getType())) if (isa(ptrTy.getElementType())) - return pushValue(builder.create(loc, popValue())); + return pushValue(cudaq::cc::LoadOp::create(builder, loc, popValue())); if (auto funcTy = peelPointerFromFunction(castToTy)) if (auto fromTy = dyn_cast(peekValue().getType())) { auto inputs = funcTy.getInputs(); @@ -715,9 +719,10 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { return false; if (x->getCastKind() == clang::CastKind::CK_IntegralToBoolean) { auto last = popValue(); - Value zero = builder.create(loc, 0, last.getType()); - return pushValue(builder.create( - loc, arith::CmpIPredicate::ne, last, zero)); + Value zero = + arith::ConstantIntOp::create(builder, loc, last.getType(), 0); + return pushValue(arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::ne, last, zero)); } } TODO_loc(loc, "unhandled implicit 
cast expression"); @@ -739,15 +744,15 @@ bool QuakeBridgeVisitor::TraverseBinaryOperator(clang::BinaryOperator *x, return false; auto lhsVal = popValue(); auto loc = toLocation(x->getSourceRange()); - auto zero = builder.create(loc, 0, lhsVal.getType()); - Value cond = builder.create(loc, - shortCircuitWhenTrue - ? arith::CmpIPredicate::ne + auto zero = arith::ConstantIntOp::create(builder, loc, lhsVal.getType(), 0); + Value cond = + arith::CmpIOp::create(builder, loc, + shortCircuitWhenTrue ? arith::CmpIPredicate::ne : arith::CmpIPredicate::eq, - lhsVal, zero); + lhsVal, zero); bool result = true; - auto ifOp = builder.create( - loc, TypeRange{cond.getType()}, cond, + auto ifOp = cc::IfOp::create( + builder, loc, TypeRange{cond.getType()}, cond, // Value if `cond` is true // For `BO_LAnd`, that means Value if lhs is zero (i.e. false) // For `BO_LOr`, that means Value if lhs is non-zero (i.e. true) @@ -760,13 +765,12 @@ bool QuakeBridgeVisitor::TraverseBinaryOperator(clang::BinaryOperator *x, builder.setInsertionPointToStart(&bodyBlock); if (x->getOpcode() == clang::BinaryOperatorKind::BO_LAnd) { // Return false out of this block in order to avoid evaluating rhs - auto constantFalse = - builder - .create(loc, builder.getBoolAttr(false)) - .getResult(); - builder.create(loc, TypeRange{}, constantFalse); + auto constantFalse = arith::ConstantOp::create( + builder, loc, builder.getBoolAttr(false)) + .getResult(); + cc::ContinueOp::create(builder, loc, TypeRange{}, constantFalse); } else { - builder.create(loc, TypeRange{}, cond); + cc::ContinueOp::create(builder, loc, TypeRange{}, cond); } }, // Value if `cond` is false @@ -784,7 +788,7 @@ bool QuakeBridgeVisitor::TraverseBinaryOperator(clang::BinaryOperator *x, return; } auto rhsVal = popValue(); - builder.create(loc, TypeRange{}, rhsVal); + cc::ContinueOp::create(builder, loc, TypeRange{}, rhsVal); }); if (!result) return result; @@ -802,7 +806,7 @@ bool QuakeBridgeVisitor::VisitBinaryOperator(clang::BinaryOperator 
*x) { auto loc = toLocation(x->getSourceRange()); auto maybeLoadValue = [&](Value v) -> Value { if (isa(v.getType())) - return builder.create(loc, v); + return cc::LoadOp::create(builder, loc, v); return v; }; @@ -836,7 +840,7 @@ bool QuakeBridgeVisitor::VisitBinaryOperator(clang::BinaryOperator *x) { default: TODO_loc(loc, "floating-point comparison"); } - return pushValue(builder.create(loc, pred, lhs, rhs)); + return pushValue(arith::CmpFOp::create(builder, loc, pred, lhs, rhs)); } arith::CmpIPredicate pred; auto lhsTy = x->getLHS()->getType(); @@ -866,12 +870,12 @@ bool QuakeBridgeVisitor::VisitBinaryOperator(clang::BinaryOperator *x) { default: TODO_loc(loc, "integer comparison"); } - return pushValue(builder.create(loc, pred, lhs, rhs)); + return pushValue(arith::CmpIOp::create(builder, loc, pred, lhs, rhs)); } switch (x->getOpcode()) { case clang::BinaryOperatorKind::BO_Assign: { - builder.create(loc, rhs, lhs); + cc::StoreOp::create(builder, loc, rhs, lhs); return pushValue(lhs); } case clang::BinaryOperatorKind::BO_AddAssign: @@ -894,60 +898,60 @@ bool QuakeBridgeVisitor::VisitBinaryOperator(clang::BinaryOperator *x) { switch (x->getOpcode()) { case clang::BinaryOperatorKind::BO_Add: { if (x->getType()->isIntegerType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::AddIOp::create(builder, loc, lhs, rhs)); if (x->getType()->isFloatingType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::AddFOp::create(builder, loc, lhs, rhs)); TODO_loc(loc, "error in bo_add binary op"); } case clang::BinaryOperatorKind::BO_Rem: { if (x->getType()->isIntegerType()) { if (x->getType()->isUnsignedIntegerOrEnumerationType()) - return pushValue(builder.create(loc, lhs, rhs)); - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::RemUIOp::create(builder, loc, lhs, rhs)); + return pushValue(arith::RemSIOp::create(builder, loc, lhs, rhs)); } if (x->getType()->isFloatingType()) - return 
pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::AddFOp::create(builder, loc, lhs, rhs)); TODO_loc(loc, "error in bo_add binary op"); } case clang::BinaryOperatorKind::BO_Sub: { if (x->getType()->isIntegerType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::SubIOp::create(builder, loc, lhs, rhs)); if (x->getType()->isFloatingType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::SubFOp::create(builder, loc, lhs, rhs)); TODO_loc(loc, "error in bo_add binary op"); } case clang::BinaryOperatorKind::BO_Mul: { if (x->getType()->isIntegerType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::MulIOp::create(builder, loc, lhs, rhs)); if (x->getType()->isFloatingType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::MulFOp::create(builder, loc, lhs, rhs)); TODO_loc(loc, "error in bo_mul binary op"); } case clang::BinaryOperatorKind::BO_Div: { if (x->getType()->isIntegerType()) { if (x->getType()->isUnsignedIntegerOrEnumerationType()) - return pushValue(builder.create(loc, lhs, rhs)); - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::DivUIOp::create(builder, loc, lhs, rhs)); + return pushValue(arith::DivSIOp::create(builder, loc, lhs, rhs)); } if (x->getType()->isFloatingType()) - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::DivFOp::create(builder, loc, lhs, rhs)); TODO_loc(loc, "error in bo_div binary op"); } case clang::BinaryOperatorKind::BO_Shl: - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::ShLIOp::create(builder, loc, lhs, rhs)); case clang::BinaryOperatorKind::BO_Shr: if (x->getLHS()->getType()->isUnsignedIntegerOrEnumerationType()) - return pushValue(builder.create(loc, lhs, rhs)); - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(mlir::arith::ShRUIOp::create(builder, loc, lhs, rhs)); + return 
pushValue(mlir::arith::ShRSIOp::create(builder, loc, lhs, rhs)); case clang::BinaryOperatorKind::BO_Or: - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::OrIOp::create(builder, loc, lhs, rhs)); case clang::BinaryOperatorKind::BO_Xor: - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::XOrIOp::create(builder, loc, lhs, rhs)); case clang::BinaryOperatorKind::BO_And: - return pushValue(builder.create(loc, lhs, rhs)); + return pushValue(arith::AndIOp::create(builder, loc, lhs, rhs)); case clang::BinaryOperatorKind::BO_LAnd: case clang::BinaryOperatorKind::BO_LOr: emitFatalError(loc, "&& and || ops are handled elsewhere."); @@ -991,14 +995,14 @@ bool QuakeBridgeVisitor::TraverseConditionalOperator( return; } Value resultVal = popValue(); - builder.create(loc, TypeRange{}, resultVal); + cc::ContinueOp::create(builder, loc, TypeRange{}, resultVal); resultTy = resultVal.getType(); }; }; - auto ifOp = builder.create(loc, TypeRange{resultTy}, condVal, - thenElseLambda(x->getTrueExpr()), - thenElseLambda(x->getFalseExpr())); + auto ifOp = cc::IfOp::create(builder, loc, TypeRange{resultTy}, condVal, + thenElseLambda(x->getTrueExpr()), + thenElseLambda(x->getFalseExpr())); if (!result) return result; @@ -1026,8 +1030,8 @@ bool QuakeBridgeVisitor::VisitMaterializeTemporaryExpr( return true; // Materialize the value into a glvalue location in memory. 
- auto materialize = builder.create(loc, ty); - builder.create(loc, popValue(), materialize); + auto materialize = cc::AllocaOp::create(builder, loc, ty); + cc::StoreOp::create(builder, loc, popValue(), materialize); return pushValue(materialize); } @@ -1042,8 +1046,8 @@ bool QuakeBridgeVisitor::TraverseLambdaExpr(clang::LambdaExpr *x, if (!TraverseType(x->getType())) return false; auto callableTy = cast(popType()); - auto lambdaInstance = builder.create( - loc, callableTy, [&](OpBuilder &builder, Location loc) { + auto lambdaInstance = cc::CreateLambdaOp::create( + builder, loc, callableTy, [&](OpBuilder &builder, Location loc) { // FIXME: the capture list, etc. should be visited in an appropriate // context here, not as part of lowering the body of the lambda. auto *entryBlock = builder.getInsertionBlock(); @@ -1053,7 +1057,7 @@ bool QuakeBridgeVisitor::TraverseLambdaExpr(clang::LambdaExpr *x, result = false; return; } - builder.create(loc); + cc::ReturnOp::create(builder, loc); }); pushValue(lambdaInstance); return result; @@ -1086,7 +1090,7 @@ bool QuakeBridgeVisitor::VisitMemberExpr(clang::MemberExpr *x) { std::int32_t offset = field->getFieldIndex(); if (isa(object.getType())) { return pushValue( - builder.create(loc, ty, object, offset)); + quake::GetMemberOp::create(builder, loc, ty, object, offset)); } if (!isa(object.getType())) { reportClangError(x, mangler, @@ -1099,8 +1103,8 @@ bool QuakeBridgeVisitor::VisitMemberExpr(clang::MemberExpr *x) { if (arrTy.isUnknownSize()) offsets.push_back(0); offsets.push_back(offset); - return pushValue(builder.create( - loc, cc::PointerType::get(ty), object, offsets)); + return pushValue(cc::ComputePtrOp::create( + builder, loc, cc::PointerType::get(ty), object, offsets)); } return true; } @@ -1112,7 +1116,7 @@ bool QuakeBridgeVisitor::VisitUnaryExprOrTypeTraitExpr( switch (x->getKind()) { case clang::UnaryExprOrTypeTrait::UETT_SizeOf: return pushValue( - builder.create(loc, i64Ty, popType())); + 
cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, popType())); default: break; } @@ -1147,16 +1151,16 @@ bool QuakeBridgeVisitor::visitMathLibFunc(clang::CallExpr *x, auto resTy = calleeTy.getResult(0); castToSameType(builder, loc, x->getArg(0)->getType().getTypePtrOrNull(), base, x->getArg(1)->getType().getTypePtrOrNull(), power); - auto ipow = builder.create(loc, base, power); + auto ipow = math::IPowIOp::create(builder, loc, base, power); if (isa(resTy)) - return pushValue(builder.create( - loc, resTy, ipow, cudaq::cc::CastOpMode::Signed)); + return pushValue(cudaq::cc::CastOp::create( + builder, loc, resTy, ipow, cudaq::cc::CastOpMode::Signed)); assert(resTy == ipow.getType()); return pushValue(ipow); } - return pushValue(builder.create(loc, base, power)); + return pushValue(math::FPowIOp::create(builder, loc, base, power)); } - return pushValue(builder.create(loc, base, power)); + return pushValue(math::PowFOp::create(builder, loc, base, power)); } auto floatOperator = [&](Op, const char *dblName) -> bool { @@ -1164,14 +1168,14 @@ bool QuakeBridgeVisitor::visitMathLibFunc(clang::CallExpr *x, Value arg = popValue(); [[maybe_unused]] auto funcConst = popValue(); if (isa(arg.getType())) - arg = builder.create( - loc, + arg = cudaq::cc::CastOp::create( + builder, loc, funcName == dblName ? builder.getF64Type() : builder.getF32Type(), arg, x->getArg(0)->getType()->isUnsignedIntegerOrEnumerationType() ? 
cudaq::cc::CastOpMode::Unsigned : cudaq::cc::CastOpMode::Signed); - return pushValue(builder.create(loc, arg)); + return pushValue(Op::create(builder, loc, arg)); }; // Handle std::sqrt @@ -1191,8 +1195,8 @@ bool QuakeBridgeVisitor::visitMathLibFunc(clang::CallExpr *x, Value arg = popValue(); [[maybe_unused]] auto funcConst = popValue(); if (isa(arg.getType())) - return pushValue(builder.create(loc, arg)); - return pushValue(builder.create(loc, arg)); + return pushValue(math::AbsIOp::create(builder, loc, arg)); + return pushValue(math::AbsFOp::create(builder, loc, arg)); } // Handle std::sin @@ -1253,13 +1257,13 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (isInClassInNamespace(func, "complex", "std")) { auto value = popValue(); if (isa(value.getType())) - value = builder.create(loc, value); + value = cc::LoadOp::create(builder, loc, value); if (funcName == "real") { if (auto memberCall = dyn_cast(x)) if (memberCall->getImplicitObjectArgument()) { [[maybe_unused]] auto calleeTy = popType(); assert(isa(calleeTy)); - return pushValue(builder.create(loc, value)); + return pushValue(complex::ReOp::create(builder, loc, value)); } } if (funcName == "imag") { @@ -1267,7 +1271,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (memberCall->getImplicitObjectArgument()) { [[maybe_unused]] auto calleeTy = popType(); assert(isa(calleeTy)); - return pushValue(builder.create(loc, value)); + return pushValue(complex::ImOp::create(builder, loc, value)); } } } @@ -1280,9 +1284,9 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { // Get the size of the std::vector. 
auto svec = popValue(); if (isa(svec.getType())) - svec = builder.create(loc, svec); + svec = cc::LoadOp::create(builder, loc, svec); auto ext = - builder.create(loc, builder.getI64Type(), svec); + cc::StdvecSizeOp::create(builder, loc, builder.getI64Type(), svec); if (funcName == "size") if (auto memberCall = dyn_cast(x)) if (memberCall->getImplicitObjectArgument()) { @@ -1295,9 +1299,9 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (memberCall->getImplicitObjectArgument()) { [[maybe_unused]] auto calleeTy = popType(); assert(isa(calleeTy)); - return pushValue(builder.create( - ext->getLoc(), arith::CmpIPredicate(arith::CmpIPredicate::eq), - ext.getResult(), + return pushValue(mlir::arith::CmpIOp::create( + builder, ext->getLoc(), + arith::CmpIPredicate(arith::CmpIPredicate::eq), ext.getResult(), getConstantInt( builder, ext->getLoc(), 0, ext.getResult().getType().getIntOrFloatBitWidth()))); @@ -1310,7 +1314,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto eleTy = cast(svec.getType()).getElementType(); auto elePtrTy = cc::PointerType::get(eleTy); return pushValue( - builder.create(loc, elePtrTy, svec)); + cc::StdvecDataOp::create(builder, loc, elePtrTy, svec)); } if (funcName == "back" || funcName == "rbegin") if (auto memberCall = dyn_cast(x)) @@ -1323,12 +1327,12 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto elePtrTy = cc::PointerType::get(eleTy); auto *ctx = eleTy.getContext(); auto i64Ty = mlir::IntegerType::get(ctx, 64); - auto vecPtr = builder.create(loc, eleArrTy, svec); - auto vecLen = builder.create(loc, i64Ty, svec); + auto vecPtr = cc::StdvecDataOp::create(builder, loc, eleArrTy, svec); + auto vecLen = cc::StdvecSizeOp::create(builder, loc, i64Ty, svec); Value vecLenMinusOne = - builder.create(loc, vecLen, negativeOneIndex); - return pushValue(builder.create( - loc, elePtrTy, vecPtr, ValueRange{vecLenMinusOne})); + arith::AddIOp::create(builder, loc, vecLen, negativeOneIndex); + return 
pushValue(cc::ComputePtrOp::create( + builder, loc, elePtrTy, vecPtr, ValueRange{vecLenMinusOne})); } if (funcName == "end") if (auto memberCall = dyn_cast(x)) @@ -1340,24 +1344,24 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy)); auto *ctx = eleTy.getContext(); auto i64Ty = mlir::IntegerType::get(ctx, 64); - auto vecPtr = builder.create(loc, eleArrTy, svec); - Value vecLen = builder.create(loc, i64Ty, svec); - return pushValue(builder.create( - loc, elePtrTy, vecPtr, ValueRange{vecLen})); + auto vecPtr = cc::StdvecDataOp::create(builder, loc, eleArrTy, svec); + Value vecLen = cc::StdvecSizeOp::create(builder, loc, i64Ty, svec); + return pushValue(cc::ComputePtrOp::create( + builder, loc, elePtrTy, vecPtr, ValueRange{vecLen})); } if (funcName == "rend") if (auto memberCall = dyn_cast(x)) if (memberCall->getImplicitObjectArgument()) { [[maybe_unused]] auto calleeTy = popType(); assert(isa(calleeTy)); - Value negativeOneIndex = - builder.create(loc, -1, 64); + Value negativeOneIndex = arith::ConstantIntOp::create( + builder, loc, builder.getI64Type(), -1); auto eleTy = cast(svec.getType()).getElementType(); auto elePtrTy = cc::PointerType::get(eleTy); auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy)); - auto vecPtr = builder.create(loc, eleArrTy, svec); - return pushValue(builder.create( - loc, elePtrTy, vecPtr, ValueRange{negativeOneIndex})); + auto vecPtr = cc::StdvecDataOp::create(builder, loc, eleArrTy, svec); + return pushValue(cc::ComputePtrOp::create( + builder, loc, elePtrTy, vecPtr, ValueRange{negativeOneIndex})); } if (funcName == "data") if (auto memberCall = dyn_cast(x)) @@ -1368,7 +1372,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto eleTy = cast(svec.getType()).getElementType(); auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy)); return pushValue( - builder.create(loc, eleArrTy, svec)); + cc::StdvecDataOp::create(builder, loc, 
eleArrTy, svec)); } TODO_loc(loc, "unhandled std::vector member function, " + funcName); @@ -1380,9 +1384,9 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { // Calling std::_Bit_reference::method(). auto loadFromReference = [&](mlir::Value ref) -> Value { if (auto mrTy = dyn_cast(ref.getType())) { - auto loadVal = builder.create(loc, ref); + auto loadVal = cc::LoadOp::create(builder, loc, ref); if (mrTy.getElementType() == builder.getI8Type()) - return builder.create(loc, builder.getI1Type(), loadVal); + return cc::CastOp::create(builder, loc, builder.getI1Type(), loadVal); return loadVal; } assert(ref.getType() == builder.getI1Type()); @@ -1390,7 +1394,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { }; if (isa(func)) { assert(isa(peekValue().getType())); - return pushValue(builder.create(loc, popValue())); + return pushValue(cc::LoadOp::create(builder, loc, popValue())); } if (func->isOverloadedOperator()) { auto overloadedOperator = func->getOverloadedOperator(); @@ -1398,8 +1402,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto rhs = loadFromReference(popValue()); auto lhs = loadFromReference(popValue()); popValue(); // The compare equal operator address. 
- return pushValue(builder.create( - loc, arith::CmpIPredicate::eq, lhs, rhs)); + return pushValue(arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::eq, lhs, rhs)); } if (isAssignmentOperator(overloadedOperator)) { auto rhs = loadFromReference(popValue()); @@ -1412,10 +1416,10 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (auto arrTy = dyn_cast(eleTy)) eleTy = arrTy.getElementType(); if (eleTy != rhs.getType()) - rhs = builder.create(loc, eleTy, rhs, - cc::CastOpMode::Unsigned); + rhs = cc::CastOp::create(builder, loc, eleTy, rhs, + cc::CastOpMode::Unsigned); } - builder.create(loc, rhs, lhs); + cc::StoreOp::create(builder, loc, rhs, lhs); return pushValue(loadFromReference(lhs)); } if (isSubscriptOperator(overloadedOperator)) { @@ -1424,7 +1428,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { popValue(); // The subscript operator address. auto bytePtrTy = cc::PointerType::get(builder.getI8Type()); return pushValue( - builder.create(loc, bytePtrTy, lhs, rhs)); + cc::ComputePtrOp::create(builder, loc, bytePtrTy, lhs, rhs)); } } TODO_loc(loc, "unhandled std::vector member function, " + funcName); @@ -1442,8 +1446,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { [[maybe_unused]] auto calleeTy = popType(); assert(isa(calleeTy)); auto qregArg = popValue(); - auto qrSize = builder.create( - loc, builder.getI64Type(), qregArg); + auto qrSize = quake::VeqSizeOp::create(builder, loc, + builder.getI64Type(), qregArg); return pushValue(qrSize); } @@ -1459,15 +1463,15 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { // Handle `r.front(n)` case. 
auto qrSize = actArgs.front(); auto one = getConstantInt(builder, loc, 1, 64); - auto offset = builder.create(loc, qrSize, one); + auto offset = arith::SubIOp::create(builder, loc, qrSize, one); auto unsizedVecTy = quake::VeqType::getUnsized(builder.getContext()); - return pushValue(builder.create( - loc, unsizedVecTy, qregArg, zero, offset)); + return pushValue(quake::SubVeqOp::create(builder, loc, unsizedVecTy, + qregArg, zero, offset)); } assert(actArgs.size() == 0); return pushValue( - builder.create(loc, qregArg, zero)); + quake::ExtractRefOp::create(builder, loc, qregArg, zero)); } if (funcName == "back") @@ -1477,22 +1481,22 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { assert(isa(calleeTy)); auto actArgs = lastValues(x->getNumArgs()); auto qregArg = popValue(); - auto qrSize = builder.create( - loc, builder.getI64Type(), qregArg); + auto qrSize = quake::VeqSizeOp::create(builder, loc, + builder.getI64Type(), qregArg); auto one = getConstantInt(builder, loc, 1, 64); - auto endOff = builder.create(loc, qrSize, one); + auto endOff = arith::SubIOp::create(builder, loc, qrSize, one); if (actArgs.size() == 1) { // Handle `r.back(n)` case. 
auto startOff = - builder.create(loc, qrSize, actArgs.front()); + arith::SubIOp::create(builder, loc, qrSize, actArgs.front()); auto unsizedVecTy = quake::VeqType::getUnsized(builder.getContext()); - return pushValue(builder.create( - loc, unsizedVecTy, qregArg, startOff, endOff)); + return pushValue(quake::SubVeqOp::create( + builder, loc, unsizedVecTy, qregArg, startOff, endOff)); } assert(actArgs.size() == 0); return pushValue( - builder.create(loc, qregArg, endOff)); + quake::ExtractRefOp::create(builder, loc, qregArg, endOff)); } if (funcName == "slice") { @@ -1506,11 +1510,11 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto count = actArgs[1]; auto one = getConstantInt(builder, loc, 1, 64); - Value offset = builder.create(loc, start, count); - offset = builder.create(loc, offset, one); + Value offset = arith::AddIOp::create(builder, loc, start, count); + offset = arith::SubIOp::create(builder, loc, offset, one); auto unsizedVecTy = quake::VeqType::getUnsized(builder.getContext()); - return pushValue(builder.create( - loc, unsizedVecTy, qregArg, start, offset)); + return pushValue(quake::SubVeqOp::create(builder, loc, unsizedVecTy, + qregArg, start, offset)); } } @@ -1556,7 +1560,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { pauliWord = load.getPtrvalue(); } else if (isCharspanPointerType(v.getType())) { // Load the char span, which is a char* - auto span = builder.create(loc, v); + auto span = cc::LoadOp::create(builder, loc, v); pauliWord = span; } else if (isa(v.getType())) { pauliWord = v; @@ -1580,14 +1584,15 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { SmallVector quantumArgs; for (std::size_t i = 2; i < args.size(); i++) quantumArgs.push_back(args[i]); - targets.push_back(builder.create( - loc, quake::VeqType::get(builder.getContext(), quantumArgs.size()), + targets.push_back(quake::ConcatOp::create( + builder, loc, + quake::VeqType::get(builder.getContext(), quantumArgs.size()), quantumArgs)); 
addTheString(args[1]); } - builder.create(loc, parameters, ValueRange{}, targets, - pauliWord); + quake::ExpPauliOp::create(builder, loc, parameters, ValueRange{}, targets, + pauliWord); return true; } @@ -1627,7 +1632,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (auto callee = calleeOp.getDefiningOp()) { StringRef calleeName = callee.getValue(); - builder.create(loc, calleeName, params, qubits); + quake::ApplyNoiseOp::create(builder, loc, calleeName, params, qubits); // Add the declaration of the function to the module. SmallVector argTys; @@ -1655,16 +1660,16 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (useStdvec) measTy = cc::StdvecType::get(measTy); if (funcName == "mx") - return builder.create(loc, measTy, args).getMeasOut(); + return quake::MxOp::create(builder, loc, measTy, args).getMeasOut(); if (funcName == "my") - return builder.create(loc, measTy, args).getMeasOut(); - return builder.create(loc, measTy, args).getMeasOut(); + return quake::MyOp::create(builder, loc, measTy, args).getMeasOut(); + return quake::MzOp::create(builder, loc, measTy, args).getMeasOut(); }(); Type resTy = builder.getI1Type(); if (useStdvec) resTy = cc::StdvecType::get(resTy); return pushValue( - builder.create(loc, resTy, measure)); + quake::DiscriminateOp::create(builder, loc, resTy, measure)); } // Handle the quantum gate set. 
@@ -1729,7 +1734,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (funcName == "reset") { if (!negations.empty()) reportNegateError(); - return builder.create(loc, TypeRange{}, args[0]); + return quake::ResetOp::create(builder, loc, TypeRange{}, args[0]); } if (funcName == "swap") { const auto size = args.size(); @@ -1741,7 +1746,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { SmallVector ctrls(args.begin(), args.begin() + size - 2); auto negs = negatedControlsAttribute(builder.getContext(), ctrls, negations); - auto swap = builder.create(loc, ctrls, targets); + auto swap = quake::SwapOp::create(builder, loc, ctrls, targets); if (negs) swap->setAttr("negated_qubit_controls", negs); return true; @@ -1823,16 +1828,16 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (!negations.empty()) reportNegateError(); Type i64Ty = builder.getI64Type(); - auto size = builder.create( - loc, builder.getIntegerType(64), target); - Value rank = builder.create( - loc, i64Ty, size, cudaq::cc::CastOpMode::Unsigned); + auto size = quake::VeqSizeOp::create( + builder, loc, builder.getIntegerType(64), target); + Value rank = cudaq::cc::CastOp::create(builder, loc, i64Ty, size, + cudaq::cc::CastOpMode::Unsigned); auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value ref = builder.create(loc, target, - block.getArgument(0)); - builder.create(loc, srefAttr, - ValueRange(), ref); + Value ref = quake::ExtractRefOp::create(builder, loc, target, + block.getArgument(0)); + quake::CustomUnitarySymbolOp::create(builder, loc, srefAttr, + ValueRange(), ref); }; cudaq::opt::factory::createInvariantLoop(builder, loc, rank, bodyBuilder); @@ -1849,9 +1854,9 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { SmallVector params; for (auto p : operands.take_front(paramCount)) if (isa(p.getType())) - params.push_back(builder.create(loc, p)); - builder.create( - loc, srefAttr, isAdjoint, params, ctrls, 
targets, negs); + params.push_back(cudaq::cc::LoadOp::create(builder, loc, p)); + quake::CustomUnitarySymbolOp::create(builder, loc, srefAttr, isAdjoint, + params, ctrls, targets, negs); } return true; } @@ -1942,9 +1947,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto kernelArgs = convertKernelArgs(loc, 2, args, kernelTy.getInputs(), x); inlinedStartControlNegations(); - builder.create(loc, TypeRange{}, calleeSymbol, - /*isAdjoint=*/false, ctrlValues, - kernelArgs); + quake::ApplyOp::create(builder, loc, TypeRange{}, calleeSymbol, + /*isAdjoint=*/false, ctrlValues, kernelArgs); return inlinedFinishControlNegations(); } if (auto func = calleeValue.getDefiningOp()) { @@ -1953,9 +1957,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { inlinedStartControlNegations(); auto kernelArgs = convertKernelArgs(loc, 2, args, funcTy.getInputs(), x); - builder.create(loc, funcTy.getResults(), callableSym, - /*isAdjoint=*/false, ctrlValues, - kernelArgs); + quake::ApplyOp::create(builder, loc, funcTy.getResults(), callableSym, + /*isAdjoint=*/false, ctrlValues, kernelArgs); return inlinedFinishControlNegations(); } if (auto ty = dyn_cast(calleeValue.getType())) { @@ -1994,13 +1997,13 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto kernelArgs = convertKernelArgs(loc, 2, args, funcTy.getInputs(), x); if (isKernelEntryPoint(callOperDecl)) { - builder.create( - loc, funcTy.getResults(), calleeSymbol, - /*isAdjoint=*/false, ctrlValues, kernelArgs); + quake::ApplyOp::create(builder, loc, funcTy.getResults(), + calleeSymbol, + /*isAdjoint=*/false, ctrlValues, kernelArgs); } else { - builder.create( - loc, funcTy.getResults(), calleeValue, - /*isAdjoint=*/false, ctrlValues, kernelArgs); + quake::ApplyOp::create(builder, loc, funcTy.getResults(), + calleeValue, + /*isAdjoint=*/false, ctrlValues, kernelArgs); } return inlinedFinishControlNegations(); } @@ -2054,17 +2057,17 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr 
*x) { assert(kernFunc && "kernel call operator must be present"); auto kernTy = kernFunc.getFunctionType(); auto kernArgs = convertKernelArgs(loc, 1, args, kernTy.getInputs(), x); - return builder.create(loc, TypeRange{}, kernelSymbol, - /*isAdjoint=*/true, ValueRange{}, - kernArgs); + return quake::ApplyOp::create(builder, loc, TypeRange{}, kernelSymbol, + /*isAdjoint=*/true, ValueRange{}, + kernArgs); } if (auto func = kernelValue.getDefiningOp()) { auto kernSym = func.getValueAttr(); auto funcTy = cast(func.getType()); auto kernArgs = convertKernelArgs(loc, 1, args, funcTy.getInputs(), x); - return builder.create(loc, funcTy.getResults(), kernSym, - /*isAdjoint=*/true, ValueRange{}, - kernArgs); + return quake::ApplyOp::create( + builder, loc, funcTy.getResults(), kernSym, + /*isAdjoint=*/true, ValueRange{}, kernArgs); } if (auto ty = dyn_cast(kernelTy)) { // In order to autogenerate the control form of the called kernel, we @@ -2100,12 +2103,12 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto kernelArgs = convertKernelArgs(loc, 1, args, funcTy.getInputs(), x); if (isKernelEntryPoint(callOperDecl)) { - return builder.create( - loc, funcTy.getResults(), kernelSymbol, + return quake::ApplyOp::create( + builder, loc, funcTy.getResults(), kernelSymbol, /*isAdjoint=*/true, ValueRange{}, kernelArgs); } - return builder.create( - loc, funcTy.getResults(), kernelValue, + return quake::ApplyOp::create( + builder, loc, funcTy.getResults(), kernelValue, /*isAdjoint=*/true, ValueRange{}, kernelArgs); } TODO_loc(loc, "value has !cc.lambda type but decl isn't a lambda"); @@ -2114,13 +2117,13 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { } if (funcName == "compute_action") { - builder.create(loc, /*is_dagger=*/false, args[0], - args[1]); + quake::ComputeActionOp::create(builder, loc, /*is_dagger=*/false, args[0], + args[1]); return true; } if (funcName == "compute_dag_action") { - builder.create(loc, /*is_dagger=*/true, args[0], - 
args[1]); + quake::ComputeActionOp::create(builder, loc, /*is_dagger=*/true, args[0], + args[1]); return true; } @@ -2132,7 +2135,7 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { } auto i64Ty = builder.getI64Type(); return pushValue( - builder.create(loc, i64Ty, cudaqConvertToInteger, args) + func::CallOp::create(builder, loc, i64Ty, cudaqConvertToInteger, args) .getResult(0)); } @@ -2146,21 +2149,21 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (eleTy == builder.getI1Type()) { eleTy = cc::ArrayType::get(builder.getI8Type()); ptrTy = cc::PointerType::get(eleTy); - vecPtr = builder.create(loc, ptrTy, args[0]); + vecPtr = cc::StdvecDataOp::create(builder, loc, ptrTy, args[0]); auto bits = svecTy.getElementType().getIntOrFloatBitWidth(); assert(bits > 0); - auto scale = builder.create(loc, (bits + 7) / 8, - args[1].getType()); - offset = builder.create(loc, scale, args[1]); + auto scale = arith::ConstantIntOp::create( + builder, loc, args[1].getType(), (bits + 7) / 8); + offset = arith::MulIOp::create(builder, loc, scale, args[1]); } else { ptrTy = cc::PointerType::get(eleTy); auto arrTy = cc::PointerType::get(cc::ArrayType::get(eleTy)); - vecPtr = builder.create(loc, arrTy, args[0]); + vecPtr = cc::StdvecDataOp::create(builder, loc, arrTy, args[0]); } - auto ptr = builder.create(loc, ptrTy, vecPtr, - ArrayRef{offset}); + auto ptr = cc::ComputePtrOp::create(builder, loc, ptrTy, vecPtr, + ArrayRef{offset}); return pushValue( - builder.create(loc, svecTy, ptr, args[2])); + cc::StdvecInitOp::create(builder, loc, svecTy, ptr, args[2])); } if (funcName == "range") { @@ -2171,31 +2174,33 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { irBuilder.loadIntrinsic(module, setCudaqRangeVector); assert(succeeded(result) && "loading intrinsic should never fail"); auto upVal = args[0]; - auto upper = builder.create(loc, i64Ty, upVal, - cc::CastOpMode::Unsigned); - auto buffer = builder.create(loc, i64Ty, upper); + auto upper = 
cc::CastOp::create(builder, loc, i64Ty, upVal, + cc::CastOpMode::Unsigned); + auto buffer = cc::AllocaOp::create(builder, loc, i64Ty, upper); auto stdvecTy = cc::StdvecType::get(i64Ty); - auto call = builder.create( - loc, stdvecTy, setCudaqRangeVector, ValueRange{buffer, upper}); + auto call = + func::CallOp::create(builder, loc, stdvecTy, setCudaqRangeVector, + ValueRange{buffer, upper}); return pushValue(call.getResult(0)); } assert(funcArity == 3); [[maybe_unused]] auto result = irBuilder.loadIntrinsic(module, setCudaqRangeVectorTriple); assert(succeeded(result) && "loading intrinsic should never fail"); - Value start = builder.create(loc, i64Ty, args[0], - cc::CastOpMode::Signed); - Value stop = builder.create(loc, i64Ty, args[1], - cc::CastOpMode::Signed); - Value step = builder.create(loc, i64Ty, args[2], - cc::CastOpMode::Signed); - auto lengthCall = builder.create( - loc, i64Ty, getCudaqSizeFromTriple, ValueRange{start, stop, step}); + Value start = cc::CastOp::create(builder, loc, i64Ty, args[0], + cc::CastOpMode::Signed); + Value stop = cc::CastOp::create(builder, loc, i64Ty, args[1], + cc::CastOpMode::Signed); + Value step = cc::CastOp::create(builder, loc, i64Ty, args[2], + cc::CastOpMode::Signed); + auto lengthCall = + func::CallOp::create(builder, loc, i64Ty, getCudaqSizeFromTriple, + ValueRange{start, stop, step}); Value length = lengthCall.getResult(0); - auto buffer = builder.create(loc, i64Ty, length); + auto buffer = cc::AllocaOp::create(builder, loc, i64Ty, length); auto stdvecTy = cc::StdvecType::get(i64Ty); - auto call = - builder.create(loc, stdvecTy, setCudaqRangeVectorTriple, + auto call = func::CallOp::create(builder, loc, stdvecTy, + setCudaqRangeVectorTriple, ValueRange{buffer, start, stop, step}); return pushValue(call.getResult(0)); } @@ -2252,8 +2257,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { continue; } // Resolve the raw pointer from this device_ptr handle. 
- Value result = builder.create( - loc, devFuncTy.getInputs()[i - argsOffset], args[i]); + Value result = cc::ResolveDevicePtrOp::create( + builder, loc, devFuncTy.getInputs()[i - argsOffset], args[i]); processedArgs.push_back(result); } @@ -2263,16 +2268,18 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto devCall = [&]() { if (maybeGPULaunchParams) { auto [numBlocks, numThreads] = maybeGPULaunchParams.value(); - Value blocks = - builder.create(loc, numBlocks, 64); - Value threadsPerBlock = - builder.create(loc, numThreads, 64); - return builder.create( - loc, devFuncTy.getResults(), symbol, ValueRange{blocks}, - ValueRange{threadsPerBlock}, deviceId, callArgs); + Value blocks = arith::ConstantIntOp::create( + builder, loc, builder.getI64Type(), numBlocks); + Value threadsPerBlock = arith::ConstantIntOp::create( + builder, loc, builder.getI64Type(), numThreads); + return cc::DeviceCallOp::create(builder, loc, devFuncTy.getResults(), + symbol, ValueRange{blocks}, + ValueRange{threadsPerBlock}, deviceId, + callArgs, ArrayAttr{}, ArrayAttr{}); } - return builder.create(loc, devFuncTy.getResults(), - symbol, deviceId, callArgs); + return cc::DeviceCallOp::create( + builder, loc, devFuncTy.getResults(), symbol, ValueRange{}, + ValueRange{}, deviceId, callArgs, ArrayAttr{}, ArrayAttr{}); }(); if (devFuncTy.getResults().empty()) return true; @@ -2292,8 +2299,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { // buffer. Create a loop that interchanges pairs as $(a_0, a_1-1)$, // $(a_0+1, a_1-2)$, ... until $a_0 + n \ge a_1 - n - 1$. 
auto i64Ty = builder.getI64Type(); - auto hiInt = builder.create(loc, i64Ty, args[1]); - auto loInt = builder.create(loc, i64Ty, args[0]); + auto hiInt = cc::CastOp::create(builder, loc, i64Ty, args[1]); + auto loInt = cc::CastOp::create(builder, loc, i64Ty, args[0]); auto ptrTy = cast(args[0].getType()); auto eleTy = ptrTy.getElementType(); auto arrTy = dyn_cast(eleTy); @@ -2305,30 +2312,30 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { } auto eleSize = eleTy.getIntOrFloatBitWidth(); auto adjust = getConstantInt(builder, loc, eleSize / 4, i64Ty); - auto dist = builder.create(loc, hiInt, loInt); - Value iters = builder.create(loc, dist, adjust); + auto dist = arith::SubIOp::create(builder, loc, hiInt, loInt); + Value iters = arith::DivSIOp::create(builder, loc, dist, adjust); auto ptrArrTy = cc::PointerType::get(arrTy); - Value basePtr = builder.create(loc, ptrArrTy, args[0]); + Value basePtr = cc::CastOp::create(builder, loc, ptrArrTy, args[0]); auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region &, Block &block) { OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(&block); auto iterIdx = block.getArgument(0); auto ptrA = - builder.create(loc, ptrTy, basePtr, iterIdx); - auto one = builder.create(loc, 1, i64Ty); - auto iters1 = builder.create(loc, iters, one); - Value hiIdx = builder.create(loc, iters1, iterIdx); + cc::ComputePtrOp::create(builder, loc, ptrTy, basePtr, iterIdx); + auto one = arith::ConstantIntOp::create(builder, loc, i64Ty, 1); + auto iters1 = arith::SubIOp::create(builder, loc, iters, one); + Value hiIdx = arith::SubIOp::create(builder, loc, iters1, iterIdx); auto ptrB = - builder.create(loc, ptrTy, basePtr, hiIdx); - Value loadA = builder.create(loc, ptrA); - Value loadB = builder.create(loc, ptrB); - builder.create(loc, loadA, ptrB); - builder.create(loc, loadB, ptrA); + cc::ComputePtrOp::create(builder, loc, ptrTy, basePtr, hiIdx); + Value loadA = cc::LoadOp::create(builder, loc, ptrA); + 
Value loadB = cc::LoadOp::create(builder, loc, ptrB); + cc::StoreOp::create(builder, loc, loadA, ptrB); + cc::StoreOp::create(builder, loc, loadB, ptrA); }; auto idxTy = builder.getI64Type(); - auto idxIters = builder.create( - loc, idxTy, iters, cudaq::cc::CastOpMode::Unsigned); + auto idxIters = cudaq::cc::CastOp::create( + builder, loc, idxTy, iters, cudaq::cc::CastOpMode::Unsigned); opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); return true; } @@ -2350,9 +2357,10 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { if (specArgs[0].getKind() == clang::TemplateArgument::ArgKind::Integral) { std::int32_t offset = specArgs[0].getAsIntegral().getExtValue(); fixIfTuple(offset); - auto ptr = builder.create( - loc, resultTy, args[0], ArrayRef{offset}); - return pushValue(builder.create(loc, ptr)); + auto ptr = + cc::ComputePtrOp::create(builder, loc, resultTy, args[0], + ArrayRef{offset}); + return pushValue(cc::LoadOp::create(builder, loc, ptr)); } auto *selectTy = specArgs[0].getAsType().getTypePtr(); assert(specArgs[1].getKind() == clang::TemplateArgument::ArgKind::Pack); @@ -2360,9 +2368,10 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { for (auto &templateArg : specArgs[1].pack_elements()) { if (templateArg.getAsType().getTypePtr() == selectTy) { fixIfTuple(offset); - auto ptr = builder.create( - loc, resultTy, args[0], ArrayRef{offset}); - return pushValue(builder.create(loc, ptr)); + auto ptr = + cc::ComputePtrOp::create(builder, loc, resultTy, args[0], + ArrayRef{offset}); + return pushValue(cc::LoadOp::create(builder, loc, ptr)); } ++offset; } @@ -2381,8 +2390,9 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto funcResults = mlirFuncTy.getResults(); auto convertedArgs = convertKernelArgs(loc, 0, args, mlirFuncTy.getInputs(), x); - auto call = builder.create(loc, funcResults, calleeOp, - convertedArgs); + auto call = + func::CallIndirectOp::create(builder, loc, funcResults, calleeOp, + 
convertedArgs, ArrayAttr{}, ArrayAttr{}); if (call.getNumResults() > 0) { if (call.getNumResults() != 1) { reportClangError(x, mangler, "expect exactly one return value"); @@ -2399,22 +2409,22 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { return builder.getI8Type(); return et; }(); - auto data = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), call.getResult(0)); + auto data = cudaq::cc::StdvecDataOp::create( + builder, loc, cudaq::cc::PointerType::get(eleTy), call.getResult(0)); auto i64Ty = builder.getI64Type(); - auto len = builder.create(loc, i64Ty, - call.getResult(0)); - auto eleSize = builder.create(loc, i64Ty, eleTy); - auto size = builder.create(loc, len, eleSize); - auto buffer = builder.create(loc, eleTy, size); + auto len = cudaq::cc::StdvecSizeOp::create(builder, loc, i64Ty, + call.getResult(0)); + auto eleSize = cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, eleTy); + auto size = arith::MulIOp::create(builder, loc, len, eleSize); + auto buffer = cudaq::cc::AllocaOp::create(builder, loc, eleTy, size); auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - auto cbuffer = builder.create(loc, i8PtrTy, buffer); - auto cdata = builder.create(loc, i8PtrTy, data); - builder.create(loc, TypeRange{}, - "__nvqpp_vectorCopyToStack", - ValueRange{cbuffer, cdata, size}); + auto cbuffer = cudaq::cc::CastOp::create(builder, loc, i8PtrTy, buffer); + auto cdata = cudaq::cc::CastOp::create(builder, loc, i8PtrTy, data); + func::CallOp::create(builder, loc, TypeRange{}, + "__nvqpp_vectorCopyToStack", + ValueRange{cbuffer, cdata, size}); Value newSpan = - builder.create(loc, vecTy, buffer, len); + cudaq::cc::StdvecInitOp::create(builder, loc, vecTy, buffer, len); return pushValue(newSpan); } return pushValue(call.getResult(0)); @@ -2495,7 +2505,7 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( // extract `Op` to the symbol table, but always generate a new // `quake.extract_ref` `Op` to get the exact qubit (reference) value. 
auto address_qubit = - builder.create(loc, qreg_var, idx_var); + quake::ExtractRefOp::create(builder, loc, qreg_var, idx_var); return replaceTOSValue(address_qubit); } // Get name of the qreg, e.g. qr, and use it to construct a name for the @@ -2515,7 +2525,7 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( // in the symbol table, and return the AddressQubit operation's // resulting value. auto address_qubit = - builder.create(loc, qreg_var, idx_var); + quake::ExtractRefOp::create(builder, loc, qreg_var, idx_var); // NB: varName is built from the variable name *and* the index value. This // front-end optimization is likely unnecessary as the compiler can always @@ -2529,7 +2539,7 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( auto indexVar = popValue(); auto svec = popValue(); if (isa(svec.getType())) - svec = builder.create(loc, svec); + svec = cc::LoadOp::create(builder, loc, svec); if (!isa(svec.getType())) { TODO_x(loc, x, mangler, "vector dereference"); return false; @@ -2539,9 +2549,9 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( eleTy = builder.getI8Type(); auto elePtrTy = cc::PointerType::get(eleTy); auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(eleTy)); - auto vecPtr = builder.create(loc, eleArrTy, svec); - auto eleAddr = builder.create(loc, elePtrTy, vecPtr, - ValueRange{indexVar}); + auto vecPtr = cc::StdvecDataOp::create(builder, loc, eleArrTy, svec); + auto eleAddr = cc::ComputePtrOp::create(builder, loc, elePtrTy, vecPtr, + ValueRange{indexVar}); return replaceTOSValue(eleAddr); } if (typeName == "_Bit_reference" || typeName == "__bit_reference" || @@ -2555,11 +2565,11 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( auto i8Ty = builder.getI8Type(); auto elePtrTy = cc::PointerType::get(i8Ty); auto eleArrTy = cc::PointerType::get(cc::ArrayType::get(i8Ty)); - auto vecPtr = builder.create(loc, eleArrTy, svec); - auto eleAddr = builder.create(loc, elePtrTy, vecPtr, - ValueRange{indexVar}); + auto vecPtr = 
cc::StdvecDataOp::create(builder, loc, eleArrTy, svec); + auto eleAddr = cc::ComputePtrOp::create(builder, loc, elePtrTy, vecPtr, + ValueRange{indexVar}); auto i1PtrTy = cc::PointerType::get(builder.getI1Type()); - auto i1Cast = builder.create(loc, i1PtrTy, eleAddr); + auto i1Cast = cudaq::cc::CastOp::create(builder, loc, i1PtrTy, eleAddr); return replaceTOSValue(i1Cast); } TODO_loc(loc, "unhandled operator call for quake conversion"); @@ -2582,8 +2592,9 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( auto funcTy = cast(indirect.getType()); auto convertedArgs = convertKernelArgs(loc, 0, args, funcTy.getInputs(), x); - auto call = builder.create( - loc, funcTy.getResults(), indirect, convertedArgs); + auto call = func::CallIndirectOp::create( + builder, loc, funcTy.getResults(), indirect, convertedArgs, + ArrayAttr{}, ArrayAttr{}); if (call.getResults().empty()) return true; return pushValue(call.getResult(0)); @@ -2599,15 +2610,15 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( if (indCallTy) { [[maybe_unused]] auto discardedCallOp = popValue(); auto funcTy = cast(indCallTy.getSignature()); - auto call = builder.create( - loc, funcTy.getResults(), tos, args); + auto call = cc::CallIndirectCallableOp::create( + builder, loc, funcTy.getResults(), tos, args); if (call.getResults().empty()) return true; return pushValue(call.getResult(0)); } auto callableTy = cast(tosTy); - auto callInd = builder.create( - loc, callableTy.getSignature().getResults(), tos, args); + auto callInd = cc::CallCallableOp::create( + builder, loc, callableTy.getSignature().getResults(), tos, args); if (callInd.getResults().empty()) { popValue(); return true; @@ -2695,7 +2706,7 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) { auto initListTy = popType(); if (size == 0) { // Nothing in the list. Just allocate the type. 
- return pushValue(builder.create(loc, initListTy)); + return pushValue(cc::AllocaOp::create(builder, loc, initListTy)); } // List has 1 or more members. @@ -2709,7 +2720,8 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) { return isa(v.getType()); }); if (allRef && isa(initListTy)) - return pushValue(builder.create(loc, initListTy, last)); + return pushValue( + quake::MakeStruqOp::create(builder, loc, initListTy, last)); if (allRef && !isa(initListTy)) { // Initializer list contains all quantum reference types. In this case we @@ -2728,7 +2740,7 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) { } return quake::VeqType::get(builder.getContext(), size); }(); - return pushValue(builder.create(loc, veqTy, last)); + return pushValue(quake::ConcatOp::create(builder, loc, veqTy, last)); } // Pass initialization list with one member as a Ref. return pushValue(last[0]); @@ -2740,7 +2752,8 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) { std::int32_t structMems = structTy ? structTy.getMembers().size() : 0; std::int32_t numEles = structMems ? size / structMems : size; // Generate the array size value. - Value arrSize = builder.create(loc, numEles, 64); + Value arrSize = + arith::ConstantIntOp::create(builder, loc, builder.getI64Type(), numEles); // Allocate the required memory chunk. 
Type eleTy = [&]() { @@ -2775,23 +2788,22 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) { { OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToEnd(module.getBody()); - builder - .create(loc, globalTy, name, f64Attr, - /*constant=*/true, /*external=*/false) + cc::GlobalOp::create(builder, loc, globalTy, name, f64Attr, + /*constant=*/true, /*external=*/false) .setPrivate(); } auto ptrTy = cc::PointerType::get(globalTy); - auto globalInit = builder.create(loc, ptrTy, name); + auto globalInit = cc::AddressOfOp::create(builder, loc, ptrTy, name); return pushValue(globalInit); } // If quantum, use value semantics with cc insert / extract value. if (isa(eleTy)) - return pushValue(builder.create(loc, eleTy, last)); + return pushValue(quake::MakeStruqOp::create(builder, loc, eleTy, last)); Value alloca = (numEles > 1) - ? builder.create(loc, eleTy, arrSize) - : builder.create(loc, eleTy); + ? cc::AllocaOp::create(builder, loc, eleTy, arrSize) + : cc::AllocaOp::create(builder, loc, eleTy); // Store the values in the allocated memory for (auto iter : llvm::enumerate(last)) { @@ -2802,32 +2814,32 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) { if (numEles > 1) { auto ptrTy = cc::PointerType::get(structTy.getMembers()[i % structMems]); - ptr = builder.create( - loc, ptrTy, alloca, + ptr = cc::ComputePtrOp::create( + builder, loc, ptrTy, alloca, ArrayRef{i / structMems, i % structMems}); } else { auto ptrTy = cc::PointerType::get(structTy.getMembers()[i]); - ptr = builder.create(loc, ptrTy, alloca, - ArrayRef{i}); + ptr = cc::ComputePtrOp::create(builder, loc, ptrTy, alloca, + ArrayRef{i}); } } else { if (numEles > 1) { auto ptrTy = cc::PointerType::get(eleTy); - ptr = builder.create(loc, ptrTy, alloca, - ArrayRef{i}); + ptr = cc::ComputePtrOp::create(builder, loc, ptrTy, alloca, + ArrayRef{i}); } else { auto arrTy = cc::PointerType::get(cc::ArrayType::get(eleTy)); - auto cast = builder.create(loc, arrTy, alloca); 
+ auto cast = cc::CastOp::create(builder, loc, arrTy, alloca); auto ptrTy = cc::PointerType::get(eleTy); - ptr = builder.create(loc, ptrTy, cast, - ArrayRef{i}); + ptr = cc::ComputePtrOp::create(builder, loc, ptrTy, cast, + ArrayRef{i}); } } assert(ptr && (v.getType() == cast(ptr.getType()).getElementType()) && "value type must match pointer element type"); - builder.create(loc, v, ptr); + cc::StoreOp::create(builder, loc, v, ptr); } return pushValue(alloca); @@ -2889,7 +2901,7 @@ bool QuakeBridgeVisitor::VisitCXXParenListInitExpr( return true; auto loc = toLocation(x); auto last = lastValues(structTy.getMembers().size()); - return pushValue(builder.create(loc, structTy, last)); + return pushValue(quake::MakeStruqOp::create(builder, loc, structTy, last)); } bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { @@ -2907,28 +2919,29 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (ctorName == "qudit") { // This is a single qubit. assert(isa(ctorTy)); - return pushValue(builder.create(loc)); + return pushValue(quake::AllocaOp::create(builder, loc)); } // These classes have template arguments that may give a compile-time // constant size. qarray is the only one that requires it, however. if (ctorName == "qreg" || ctorName == "qarray" || ctorName == "qspan") { [[maybe_unused]] auto veqTy = cast(ctorTy); assert(veqTy.hasSpecifiedSize()); - return pushValue(builder.create(loc, ctorTy)); + return pushValue(quake::AllocaOp::create(builder, loc, ctorTy)); } if (ctorName == "qvector") { // The default qvector ctor creates a veq of size 1. assert(isa(ctorTy)); auto veq1Ty = quake::VeqType::get(builder.getContext(), 1); - return pushValue(builder.create(loc, veq1Ty)); + return pushValue(quake::AllocaOp::create(builder, loc, veq1Ty)); } } else if (x->getNumArgs() == 1) { if (ctorName == "qreg") { // This is a cudaq::qreg(std::size_t). 
auto sizeVal = popValue(); assert(isa(sizeVal.getType())); - return pushValue(builder.create( - loc, quake::VeqType::getUnsized(builder.getContext()), sizeVal)); + return pushValue(quake::AllocaOp::create( + builder, loc, quake::VeqType::getUnsized(builder.getContext()), + sizeVal)); } if (ctorName == "state") { @@ -2941,17 +2954,17 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (auto stdvecTy = dyn_cast(stdvec.getType())) { auto dataTy = cudaq::cc::PointerType::get(stdvecTy.getElementType()); Value data = - builder.create(loc, dataTy, stdvec); + cudaq::cc::StdvecDataOp::create(builder, loc, dataTy, stdvec); auto i64Ty = builder.getI64Type(); Value size = - builder.create(loc, i64Ty, stdvec); - return pushValue(builder.create( - loc, stateTy, ValueRange{data, size})); + cudaq::cc::StdvecSizeOp::create(builder, loc, i64Ty, stdvec); + return pushValue(quake::CreateStateOp::create( + builder, loc, stateTy, ValueRange{data, size})); } if (auto alloc = stdvec.getDefiningOp()) { Value size = alloc.getSeqSize(); - return pushValue(builder.create( - loc, stateTy, ValueRange{alloc, size})); + return pushValue(quake::CreateStateOp::create( + builder, loc, stateTy, ValueRange{alloc, size})); } TODO_loc(loc, "unhandled state constructor"); return false; @@ -2970,13 +2983,14 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (auto load = initials.getDefiningOp()) initials = load.getPtrvalue(); if (isStateType(initials.getType())) { - Value alloca = builder.create(loc); + Value alloca = quake::AllocaOp::create(builder, loc); auto veq1Ty = quake::VeqType::get(builder.getContext(), 1); - Value initSt = builder.create( - loc, veq1Ty, ValueRange{alloca, initials}); + Value initSt = quake::InitializeStateOp::create( + builder, loc, veq1Ty, ValueRange{alloca, initials}); if (auto initOp = initials.getDefiningOp()) - builder.create(loc, initOp); - return pushValue(builder.create(loc, initSt, 0)); + 
quake::DeleteStateOp::create(builder, loc, initOp); + return pushValue( + quake::ExtractRefOp::create(builder, loc, initSt, 0)); } bool ok = false; if (auto ptrTy = dyn_cast(initials.getType())) @@ -2985,22 +2999,22 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (!ok) { // Invalid initializer ignored, but emit an error. reportClangError(x, mangler, "invalid qudit initial value"); - return pushValue(builder.create(loc)); + return pushValue(quake::AllocaOp::create(builder, loc)); } auto *ctx = builder.getContext(); auto veqTy = quake::VeqType::get(ctx, 1); - auto alloc = builder.create(loc, veqTy); - auto init = builder.create(loc, veqTy, alloc, - initials); - return pushValue(builder.create(loc, init, 0)); + auto alloc = quake::AllocaOp::create(builder, loc, veqTy); + auto init = quake::InitializeStateOp::create(builder, loc, veqTy, alloc, + initials); + return pushValue(quake::ExtractRefOp::create(builder, loc, init, 0)); } if (ctorName == "qvector") { auto initials = popValue(); auto *ctx = builder.getContext(); if (isa(initials.getType())) { // This is the cudaq::qvector(std::size_t) ctor. 
- return pushValue(builder.create( - loc, quake::VeqType::getUnsized(ctx), initials)); + return pushValue(quake::AllocaOp::create( + builder, loc, quake::VeqType::getUnsized(ctx), initials)); } if (isa(initials.getType())) if (auto load = initials.getDefiningOp()) @@ -3009,13 +3023,13 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { Value state = initials; auto i64Ty = builder.getI64Type(); auto numQubits = - builder.create(loc, i64Ty, state); + quake::GetNumberOfQubitsOp::create(builder, loc, i64Ty, state); auto veqTy = quake::VeqType::getUnsized(ctx); - Value alloc = builder.create(loc, veqTy, numQubits); - Value initSt = builder.create(loc, veqTy, - alloc, state); + Value alloc = quake::AllocaOp::create(builder, loc, veqTy, numQubits); + Value initSt = quake::InitializeStateOp::create(builder, loc, veqTy, + alloc, state); if (auto initOp = initials.getDefiningOp()) - builder.create(loc, initOp); + quake::DeleteStateOp::create(builder, loc, initOp); return pushValue(initSt); } @@ -3028,23 +3042,24 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (auto allocOp = initials.getDefiningOp()) if (auto size = allocOp.getSeqSize()) numQubits = - builder.create(loc, size); + math::CountTrailingZerosOp::create(builder, loc, size); } else { std::size_t arraySize = arrTy.getSize(); if (!std::has_single_bit(arraySize)) { reportClangError(x, mangler, "state vector must be a power of 2 in length"); } - numQubits = builder.create( - loc, std::countr_zero(arraySize), 64); + numQubits = arith::ConstantIntOp::create( + builder, loc, builder.getI64Type(), + std::countr_zero(arraySize)); } } } else if (auto stdvecTy = dyn_cast(initialsTy)) { - Value vecLen = builder.create( - loc, builder.getI64Type(), initials); - numQubits = builder.create(loc, vecLen); + Value vecLen = cc::StdvecSizeOp::create( + builder, loc, builder.getI64Type(), initials); + numQubits = math::CountTrailingZerosOp::create(builder, loc, vecLen); 
auto ptrTy = cc::PointerType::get(stdvecTy.getElementType()); - initials = builder.create(loc, ptrTy, initials); + initials = cc::StdvecDataOp::create(builder, loc, ptrTy, initials); } if (!numQubits) { reportClangError( @@ -3053,9 +3068,9 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { return false; } auto veqTy = quake::VeqType::getUnsized(ctx); - auto alloc = builder.create(loc, veqTy, numQubits); - return pushValue(builder.create( - loc, veqTy, alloc, initials)); + auto alloc = quake::AllocaOp::create(builder, loc, veqTy, numQubits); + return pushValue(quake::InitializeStateOp::create(builder, loc, veqTy, + alloc, initials)); } if ((ctorName == "qspan" || ctorName == "qview") && isa(peekValue().getType())) { @@ -3082,8 +3097,8 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (ctorName == "complex") { Value imag = popValue(); Value real = popValue(); - return pushValue(builder.create( - loc, ComplexType::get(real.getType()), real, imag)); + return pushValue(mlir::complex::CreateOp::create( + builder, loc, ComplexType::get(real.getType()), real, imag)); } if (ctorName == "function") { // Are we converting a lambda expr to a std::function? @@ -3101,10 +3116,8 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { TODO_loc(loc, "callable class with data members"); } // Constructor generated as degenerate reference to call operator. - auto *fromTy = x->getArg(0)->getType().getTypePtr(); - // FIXME: May need to peel off more than one layer of sugar? 
- if (auto *elabTy = dyn_cast(fromTy)) - fromTy = elabTy->desugar().getTypePtr(); + auto *fromTy = + x->getArg(0)->getType().getTypePtr()->getUnqualifiedDesugaredType(); auto *fromDecl = dyn_cast_or_null(fromTy)->getDecl(); if (!fromDecl) TODO_loc(loc, "recovering record type for a callable"); @@ -3123,13 +3136,13 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { auto kernelCallTy = cast(ctorTy); auto kernelName = generateCudaqKernelName(callOperDecl); popValue(); // replace value at TOS. - return pushValue(builder.create( - loc, kernelCallTy, [&](OpBuilder &builder, Location loc) { + return pushValue(cc::CreateLambdaOp::create( + builder, loc, kernelCallTy, [&](OpBuilder &builder, Location loc) { auto args = builder.getBlock()->getArguments(); - auto call = builder.create( - loc, kernelCallTy.getSignature().getResults(), kernelName, - args); - builder.create(loc, call.getResults()); + auto call = func::CallOp::create( + builder, loc, kernelCallTy.getSignature().getResults(), + kernelName, args); + cc::ReturnOp::create(builder, loc, call.getResults()); })); } } @@ -3154,8 +3167,8 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (auto ptrTy = dyn_cast(allocation.getType())) if (auto arrayTy = dyn_cast(ptrTy.getElementType())) if (auto definingOp = allocation.getDefiningOp()) - return pushValue(builder.create( - loc, cc::StdvecType::get(arrayTy.getElementType()), + return pushValue(cc::StdvecInitOp::create( + builder, loc, cc::StdvecType::get(arrayTy.getElementType()), allocation, definingOp.getSeqSize())); } @@ -3175,11 +3188,11 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { // memory chunk. Type ty = (eleTy == builder.getI1Type()) ? 
builder.getI8Type() : eleTy; - Value alloca = builder.create(loc, ty, arrSize); + Value alloca = cc::AllocaOp::create(builder, loc, ty, arrSize); // Create the stdvec_init op - return pushValue(builder.create( - loc, cc::StdvecType::get(eleTy), alloca, arrSize)); + return pushValue(cc::StdvecInitOp::create( + builder, loc, cc::StdvecType::get(eleTy), alloca, arrSize)); } return false; }; @@ -3217,7 +3230,7 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { if (isa(ctorTy)) { if (quake::isConstantQuantumRefType(ctorTy)) - return pushValue(builder.create(loc, ctorTy)); + return pushValue(quake::AllocaOp::create(builder, loc, ctorTy)); return true; } @@ -3238,17 +3251,17 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { // contain the object to load the value from. auto fromStruct = popValue(); assert(isa(ctorTy) && "POD must be a struct type"); - return pushValue(builder.create(loc, fromStruct)); + return pushValue(cc::LoadOp::create(builder, loc, fromStruct)); } } if (ctor->isCopyConstructor() && ctor->isTrivial() && isa(ctorTy)) { - auto copyObj = builder.create(loc, ctorTy); + auto copyObj = cc::AllocaOp::create(builder, loc, ctorTy); auto fromStruct = popValue(); - auto fromVal = builder.create(loc, fromStruct); - builder.create(loc, fromVal, copyObj); - return pushValue(builder.create(loc, copyObj)); + auto fromVal = cc::LoadOp::create(builder, loc, fromStruct); + cc::StoreOp::create(builder, loc, fromVal, copyObj); + return pushValue(cc::LoadOp::create(builder, loc, copyObj)); } // TODO: remove this when we can handle ctors more generally. @@ -3264,7 +3277,7 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { // 2) Allocate a new object. // 3) If not POD, call the constructor passing the address of the allocation // as `this`. 
- auto mem = builder.create(loc, ctorTy); + auto mem = cc::AllocaOp::create(builder, loc, ctorTy); // No constructor call needed for POD types if (parent->isPOD()) @@ -3278,7 +3291,7 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { FunctionType::get(builder.getContext(), TypeRange{mem.getType()}, {}); auto func = getOrAddFunc(loc, mangledName, funcTy).first; // FIXME: The ctor may not be the default ctor. Get all the args. - builder.create(loc, func, ValueRange{mem}); + func::CallOp::create(builder, loc, func, ValueRange{mem}); return pushValue(mem); } @@ -3330,8 +3343,8 @@ bool QuakeBridgeVisitor::VisitDeclRefExpr(clang::DeclRefExpr *x) { bool QuakeBridgeVisitor::VisitStringLiteral(clang::StringLiteral *x) { auto strLitTy = cc::PointerType::get(cc::ArrayType::get( builder.getContext(), builder.getI8Type(), x->getString().size() + 1)); - return pushValue(builder.create( - toLocation(x), strLitTy, builder.getStringAttr(x->getString()))); + return pushValue(cc::CreateStringLiteralOp::create( + builder, toLocation(x), strLitTy, builder.getStringAttr(x->getString()))); } } // namespace cudaq::details diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp index 54bd9ca50ec..8512cca0b83 100644 --- a/lib/Frontend/nvqpp/ConvertStmt.cpp +++ b/lib/Frontend/nvqpp/ConvertStmt.cpp @@ -28,7 +28,7 @@ bool QuakeBridgeVisitor::VisitBreakStmt(clang::BreakStmt *x) { // statement. The bridge does not currently support switch statements. LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump()); if (builder.getBlock()) - builder.create(toLocation(x)); + cc::UnwindBreakOp::create(builder, toLocation(x)); return true; } @@ -36,7 +36,7 @@ bool QuakeBridgeVisitor::VisitContinueStmt(clang::ContinueStmt *x) { // It is a C++ syntax error if a continue statement is not in a loop. 
LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump()); if (builder.getBlock()) - builder.create(toLocation(x)); + cc::UnwindContinueOp::create(builder, toLocation(x)); return true; } @@ -69,53 +69,53 @@ bool QuakeBridgeVisitor::VisitCompoundAssignOperator( switch (x->getOpcode()) { case clang::BinaryOperatorKind::BO_AddAssign: { if (x->getType()->isIntegerType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::AddIOp::create(builder, loc, lhs, rhs); if (x->getType()->isFloatingType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::AddFOp::create(builder, loc, lhs, rhs); TODO_loc(loc, "Unknown type in assignment operator"); } case clang::BinaryOperatorKind::BO_SubAssign: { if (x->getType()->isIntegerType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::SubIOp::create(builder, loc, lhs, rhs); if (x->getType()->isFloatingType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::SubFOp::create(builder, loc, lhs, rhs); TODO_loc(loc, "Unknown type in assignment operator"); } case clang::BinaryOperatorKind::BO_MulAssign: { if (x->getType()->isIntegerType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::MulIOp::create(builder, loc, lhs, rhs); if (x->getType()->isFloatingType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::MulFOp::create(builder, loc, lhs, rhs); TODO_loc(loc, "Unknown type in assignment operator"); } case clang::BinaryOperatorKind::BO_DivAssign: { if (x->getType()->isIntegerType()) if (x->getType()->isUnsignedIntegerOrEnumerationType()) - return builder.create(loc, lhs, rhs); - return builder.create(loc, lhs, rhs); + return mlir::arith::DivUIOp::create(builder, loc, lhs, rhs); + return mlir::arith::DivSIOp::create(builder, loc, lhs, rhs); if (x->getType()->isFloatingType()) - return builder.create(loc, lhs, rhs); + return mlir::arith::DivFOp::create(builder, loc, lhs, rhs); TODO_loc(loc, "Unknown type in assignment operator"); } case clang::BinaryOperatorKind::BO_ShlAssign: - 
return builder.create(loc, lhs, rhs); + return mlir::arith::ShLIOp::create(builder, loc, lhs, rhs); case clang::BinaryOperatorKind::BO_ShrAssign: if (x->getType()->isUnsignedIntegerOrEnumerationType()) - return builder.create(loc, lhs, rhs); - return builder.create(loc, lhs, rhs); + return mlir::arith::ShRUIOp::create(builder, loc, lhs, rhs); + return mlir::arith::ShRSIOp::create(builder, loc, lhs, rhs); case clang::BinaryOperatorKind::BO_OrAssign: - return builder.create(loc, lhs, rhs); + return mlir::arith::OrIOp::create(builder, loc, lhs, rhs); case clang::BinaryOperatorKind::BO_XorAssign: - return builder.create(loc, lhs, rhs); + return mlir::arith::XOrIOp::create(builder, loc, lhs, rhs); case clang::BinaryOperatorKind::BO_AndAssign: - return builder.create(loc, lhs, rhs); + return mlir::arith::AndIOp::create(builder, loc, lhs, rhs); default: break; } TODO_loc(loc, "assignment operator"); }(); - builder.create(loc, result, lhsPtr); + cudaq::cc::StoreOp::create(builder, loc, result, lhsPtr); return pushValue(lhsPtr); } @@ -151,7 +151,7 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, auto [iters, ptr, initial, stepBy] = [&]() -> std::tuple { if (auto call = buffer.getDefiningOp()) { - if (call.getCallee().equals(setCudaqRangeVector)) { + if (call.getCallee() == setCudaqRangeVector) { // The std::vector was produced by cudaq::range(). Optimize this // special case to use the loop control directly. Erase the transient // buffer and call here since neither is required. @@ -164,7 +164,7 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, call->erase(); } return {i, {}, {}, {}}; - } else if (call.getCallee().equals(setCudaqRangeVectorTriple)) { + } else if (call.getCallee() == setCudaqRangeVectorTriple) { // Save operands before erasing the call. 
Value initial = call.getOperand(1); Value i = call.getOperand(2); @@ -173,7 +173,7 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, Operation *callGetSizeOp = nullptr; if (auto seqSize = alloc.getSeqSize()) { if (auto callSize = seqSize.getDefiningOp()) - if (callSize.getCallee().equals(getCudaqSizeFromTriple)) + if (callSize.getCallee() == getCudaqSizeFromTriple) callGetSizeOp = callSize.getOperation(); } call->erase(); // erase call must be first @@ -187,8 +187,8 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, return {i, {}, initial, stepBy}; } } - Value i = builder.create(loc, i64Ty, buffer); - Value p = builder.create(loc, dataArrPtrTy, buffer); + Value i = cc::StdvecSizeOp::create(builder, loc, i64Ty, buffer); + Value p = cc::StdvecDataOp::create(builder, loc, dataArrPtrTy, buffer); return {i, p, {}, {}}; }(); @@ -206,7 +206,7 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, symbolTable.insert(loopVar->getName(), index); } else { Value addr = - builder.create(loc, dataPtrTy, ptr, index); + cc::ComputePtrOp::create(builder, loc, dataPtrTy, ptr, index); if (loopVar->getType().isConstQualified()) { // Read-only binding, so omit copy. 
symbolTable.insert(loopVar->getName(), addr); @@ -220,48 +220,48 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, return; } auto iterVar = popValue(); - Value atOffset = builder.create(loc, addr); + Value atOffset = cc::LoadOp::create(builder, loc, addr); if (isBool) - atOffset = builder.create(loc, builder.getI1Type(), - atOffset); - builder.create(loc, atOffset, iterVar); + atOffset = cc::CastOp::create(builder, loc, builder.getI1Type(), + atOffset); + cc::StoreOp::create(builder, loc, atOffset, iterVar); } } if (!TraverseStmt(static_cast(body))) { result = false; return; } - builder.create(loc); + cc::ContinueOp::create(builder, loc); }; - builder.create(loc, scopeBuilder); + cc::ScopeOp::create(builder, loc, scopeBuilder); }; if (!initial) { - auto idxIters = builder.create( - loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); + auto idxIters = cudaq::cc::CastOp::create( + builder, loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); } else { - auto idxIters = builder.create( - loc, i64Ty, iters, cudaq::cc::CastOpMode::Signed); + auto idxIters = cudaq::cc::CastOp::create(builder, loc, i64Ty, iters, + cudaq::cc::CastOpMode::Signed); opt::factory::createMonotonicLoop(builder, loc, initial, idxIters, stepBy, bodyBuilder); } } else if (auto veqTy = dyn_cast(buffer.getType()); veqTy && veqTy.hasSpecifiedSize()) { - Value iters = - builder.create(loc, veqTy.getSize(), i64Ty); + Value iters = arith::ConstantIntOp::create( + builder, loc, i64Ty, static_cast(veqTy.getSize())); auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region ®ion, Block &block) { OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(&block); Value index = block.getArgument(0); - Value ref = builder.create(loc, buffer, index); + Value ref = quake::ExtractRefOp::create(builder, loc, buffer, index); symbolTable.insert(loopVar->getName(), ref); if 
(!TraverseStmt(static_cast(body))) result = false; }; - auto idxIters = builder.create( - loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); + auto idxIters = cudaq::cc::CastOp::create(builder, loc, i64Ty, iters, + cudaq::cc::CastOpMode::Unsigned); opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); } else { TODO_x(toLocation(x), x, mangler, "ranged for statement"); @@ -327,13 +327,13 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) { if (isa(resTy)) { // Promote reference (T&) to value (T) on a return. (There is not // necessarily an explicit cast or promotion node in the AST.) - auto load = builder.create(loc, result); + auto load = cc::LoadOp::create(builder, loc, result); result = load.getResult(); if (load.getType() == builder.getI8Type()) { auto fnTy = load->getParentOfType().getFunctionType(); auto i1Ty = builder.getI1Type(); if (fnTy.getNumResults() == 1 && fnTy.getResult(0) == i1Ty) - result = builder.create(loc, i1Ty, result); + result = cc::CastOp::create(builder, loc, i1Ty, result); } } if (auto vecTy = dyn_cast(resTy)) { @@ -346,16 +346,15 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) { auto eleTy = vecTy.getElementType(); auto createVectorInit = [&](Value eleSize) { auto ptrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - Value resBuff = builder.create(loc, ptrTy, result); - Value dynSize = - builder.create(loc, builder.getI64Type(), result); + Value resBuff = cc::StdvecDataOp::create(builder, loc, ptrTy, result); + Value dynSize = cc::StdvecSizeOp::create(builder, loc, + builder.getI64Type(), result); Value heapCopy = - builder - .create(loc, ptrTy, "__nvqpp_vectorCopyCtor", - ValueRange{resBuff, dynSize, eleSize}) + func::CallOp::create(builder, loc, ptrTy, "__nvqpp_vectorCopyCtor", + ValueRange{resBuff, dynSize, eleSize}) .getResult(0); - return builder.create(loc, resTy, - ValueRange{heapCopy, dynSize}); + return cc::StdvecInitOp::create(builder, loc, resTy, + ValueRange{heapCopy, 
dynSize}); }; IRBuilder irb(builder); Value tySize; @@ -371,15 +370,15 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) { result = createVectorInit(tySize); } if (isFuncScope) - builder.create(loc, result); + cc::ReturnOp::create(builder, loc, result); else - builder.create(loc, result); + cc::UnwindReturnOp::create(builder, loc, result); return true; } if (isFuncScope) - builder.create(loc); + cc::ReturnOp::create(builder, loc); else - builder.create(loc); + cc::UnwindReturnOp::create(builder, loc); return true; } @@ -407,10 +406,10 @@ bool QuakeBridgeVisitor::TraverseCompoundStmt(clang::CompoundStmt *stmt, traverseAndCheck(static_cast(cs)); return true; } - builder.create(loc, [&](OpBuilder &builder, Location loc) { + cc::ScopeOp::create(builder, loc, [&](OpBuilder &builder, Location loc) { for (auto *cs : stmt->body()) traverseAndCheck(static_cast(cs)); - builder.create(loc); + cc::ContinueOp::create(builder, loc); }); return true; } @@ -433,7 +432,7 @@ bool QuakeBridgeVisitor::traverseDoOrWhileStmt(S *x) { return; } auto val = popValue(); - builder.create(loc, val, ValueRange{}); + cc::ConditionOp::create(builder, loc, val, ValueRange{}); }; auto *body = x->getBody(); auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region ®ion) { @@ -448,11 +447,11 @@ bool QuakeBridgeVisitor::traverseDoOrWhileStmt(S *x) { return; } if (!hasTerminator(region.back())) - builder.create(loc); + cc::ContinueOp::create(builder, loc); }; LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump()); - builder.create(loc, ValueRange{}, postCondition, whileBuilder, - bodyBuilder); + cc::LoopOp::create(builder, loc, ValueRange{}, postCondition, whileBuilder, + bodyBuilder); return result; } @@ -483,27 +482,26 @@ bool QuakeBridgeVisitor::TraverseIfStmt(clang::IfStmt *x, return; } if (!hasTerminator(region.back())) - builder.create(loc); + cc::ContinueOp::create(builder, loc); }; }; auto *cond = x->getCond(); assert(cond && "if statement should have a condition"); 
LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump()); if (auto *init = x->getInit()) { - builder.create(loc, [&](OpBuilder &builder, Location loc) { + cc::ScopeOp::create(builder, loc, [&](OpBuilder &builder, Location loc) { SymbolTableScope varScope(symbolTable); if (!TraverseStmt(init) || !TraverseStmt(cond)) { result = false; return; } if (x->getElse()) - builder.create(loc, TypeRange{}, popValue(), - stmtBuilder(x->getThen()), - stmtBuilder(x->getElse())); + cc::IfOp::create(builder, loc, TypeRange{}, popValue(), + stmtBuilder(x->getThen()), stmtBuilder(x->getElse())); else - builder.create(loc, TypeRange{}, popValue(), - stmtBuilder(x->getThen())); - builder.create(loc); + cc::IfOp::create(builder, loc, TypeRange{}, popValue(), + stmtBuilder(x->getThen())); + cc::ContinueOp::create(builder, loc); }); } else { // If there is no initialization expression, skip creating an `if` scope. @@ -516,19 +514,18 @@ bool QuakeBridgeVisitor::TraverseIfStmt(clang::IfStmt *x, // and add the required a load and cast. 
if (auto ptrTy = dyn_cast(peekValue().getType())) { Value v = popValue(); - pushValue(builder.create(loc, v)); + pushValue(cc::LoadOp::create(builder, loc, v)); if (ptrTy != builder.getI1Type()) { reportClangError(x, mangler, "expression in condition not yet supported"); } } if (x->getElse()) - builder.create(loc, TypeRange{}, popValue(), - stmtBuilder(x->getThen()), - stmtBuilder(x->getElse())); + cc::IfOp::create(builder, loc, TypeRange{}, popValue(), + stmtBuilder(x->getThen()), stmtBuilder(x->getElse())); else - builder.create(loc, TypeRange{}, popValue(), - stmtBuilder(x->getThen())); + cc::IfOp::create(builder, loc, TypeRange{}, popValue(), + stmtBuilder(x->getThen())); } return result; } @@ -551,7 +548,7 @@ bool QuakeBridgeVisitor::TraverseForStmt(clang::ForStmt *x, return; } auto val = popValue(); - builder.create(loc, val, ValueRange{}); + cc::ConditionOp::create(builder, loc, val, ValueRange{}); }; auto *body = x->getBody(); auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region ®ion) { @@ -566,7 +563,7 @@ bool QuakeBridgeVisitor::TraverseForStmt(clang::ForStmt *x, return; } if (!hasTerminator(region.back())) - builder.create(loc); + cc::ContinueOp::create(builder, loc); }; auto *incr = x->getInc(); auto stepBuilder = [&](OpBuilder &builder, Location loc, Region ®ion) { @@ -584,19 +581,19 @@ bool QuakeBridgeVisitor::TraverseForStmt(clang::ForStmt *x, LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump()); if (auto *init = x->getInit()) { SymbolTableScope var_scope(symbolTable); - builder.create(loc, [&](OpBuilder &builder, Location loc) { + cc::ScopeOp::create(builder, loc, [&](OpBuilder &builder, Location loc) { if (!TraverseStmt(static_cast(init))) { result = false; return; } - builder.create(loc, ValueRange{}, postCondition, whileBuilder, - bodyBuilder, stepBuilder); - builder.create(loc); + cc::LoopOp::create(builder, loc, ValueRange{}, postCondition, + whileBuilder, bodyBuilder, stepBuilder); + cc::ContinueOp::create(builder, loc); }); } else { // If 
there is no initialization expression, skip creating a `for` scope. - builder.create(loc, ValueRange{}, postCondition, whileBuilder, - bodyBuilder); + cc::LoopOp::create(builder, loc, ValueRange{}, postCondition, whileBuilder, + bodyBuilder); } const auto finalValueDepth = valueStack.size(); if (finalValueDepth > initialValueDepth) { diff --git a/lib/Frontend/nvqpp/ConvertType.cpp b/lib/Frontend/nvqpp/ConvertType.cpp index c21ef8d56a9..e38b6d47329 100644 --- a/lib/Frontend/nvqpp/ConvertType.cpp +++ b/lib/Frontend/nvqpp/ConvertType.cpp @@ -175,7 +175,8 @@ QuakeBridgeVisitor::findCallOperator(const clang::CXXRecordDecl *decl) { return nullptr; } -bool QuakeBridgeVisitor::TraverseRecordType(clang::RecordType *t) { +bool QuakeBridgeVisitor::TraverseRecordType(clang::RecordType *t, + bool &visitChildren) { auto *recDecl = t->getDecl(); if (ignoredClass(recDecl)) @@ -222,10 +223,10 @@ std::pair QuakeBridgeVisitor::getWidthAndAlignment(clang::RecordDecl *x) { auto *defn = x->getDefinition(); assert(defn && "struct must be defined here"); - auto *ty = defn->getTypeForDecl(); - if (ty->isDependentType()) + auto qualTy = getContext()->getCanonicalTagType(defn); + if (qualTy->isDependentType()) return {0, 0}; - auto ti = getContext()->getTypeInfo(ty); + auto ti = getContext()->getTypeInfo(qualTy); return {ti.Width, llvm::PowerOf2Ceil(ti.Align) / 8}; } diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp index 12ab0feb5ca..e804a169c3f 100644 --- a/lib/Optimizer/Builder/Factory.cpp +++ b/lib/Optimizer/Builder/Factory.cpp @@ -91,39 +91,43 @@ factory::buildInvokeStructType(FunctionType funcTy, return cudaq::cc::StructType::get(ctx, eleTys, /*packed=*/false); } -Value factory::packIsArrayAndLengthArray(Location loc, - ConversionPatternRewriter &rewriter, - ModuleOp parentModule, - std::size_t numOperands, - ValueRange operands) { +Value factory::packIsArrayAndLengthArray( + Location loc, ConversionPatternRewriter &rewriter, ModuleOp parentModule, + 
std::size_t numOperands, ValueRange operands, ValueRange originalControls) { // Create an integer array where the kth element is N if the kth control // operand is a veq, and 0 otherwise. auto i64Type = rewriter.getI64Type(); - auto context = rewriter.getContext(); - Value isArrayAndLengthArr = createLLVMTemporary( - loc, rewriter, LLVM::LLVMPointerType::get(i64Type), numOperands); - auto intPtrTy = LLVM::LLVMPointerType::get(i64Type); - Value zero = rewriter.create(loc, 0, 64); + auto *context = rewriter.getContext(); + auto alignment = IntegerAttr::get(i64Type, 8); + auto ptrTy = LLVM::LLVMPointerType::get(context); + Value numOpnds = arith::ConstantIntOp::create(rewriter, loc, numOperands, 64); + Value isArrayAndLengthArr = LLVM::AllocaOp::create( + rewriter, loc, ptrTy, numOpnds, alignment, TypeAttr::get(i64Type)); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); auto getSizeSymbolRef = opt::factory::createLLVMFunctionSymbol( - opt::QIRArrayGetSize, i64Type, {opt::getArrayType(context)}, + opt::QIRArrayGetSize, i64Type, {cg::getLLVMArrayType(context)}, parentModule); for (auto iter : llvm::enumerate(operands)) { auto operand = iter.value(); auto i = iter.index(); - Value idx = rewriter.create(loc, i, 64); - Value ptr = rewriter.create(loc, intPtrTy, isArrayAndLengthArr, - ValueRange{idx}); + Value idx = arith::ConstantIntOp::create(rewriter, loc, i, 64); + Value ptr = LLVM::GEPOp::create(rewriter, loc, ptrTy, i64Type, + isArrayAndLengthArr, ValueRange{idx}); Value element; - if (operand.getType() == opt::getQubitType(context)) + // With opaque pointers, both qubit (RefType) and array (VeqType) convert + // to the same !llvm.ptr type, so we must check the original quake types + // to distinguish them. 
+ bool isQubit = isa(originalControls[i].getType()); + if (isQubit) { element = zero; - else + } else { // get array size with the runtime function - element = rewriter - .create(loc, rewriter.getI64Type(), - getSizeSymbolRef, ValueRange{operand}) + element = LLVM::CallOp::create(rewriter, loc, i64Type, getSizeSymbolRef, + ValueRange{operand}) .getResult(); + } - rewriter.create(loc, element, ptr); + LLVM::StoreOp::create(rewriter, loc, element, ptr); } return isArrayAndLengthArr; } @@ -145,7 +149,7 @@ FlatSymbolRefAttr factory::createLLVMFunctionSymbol(StringRef name, // Insert the function since it hasn't been seen yet auto insPt = rewriter.saveInsertionPoint(); rewriter.setInsertionPointToStart(module.getBody()); - rewriter.create(module->getLoc(), name, fType); + LLVM::LLVMFuncOp::create(rewriter, module->getLoc(), name, fType); symbolRef = SymbolRefAttr::get(context, name); rewriter.restoreInsertionPoint(insPt); } @@ -166,7 +170,7 @@ func::FuncOp factory::createFunction(StringRef name, ArrayRef retTypes, // Insert the function since it hasn't been seen yet auto insPt = rewriter.saveInsertionPoint(); rewriter.setInsertionPointToStart(module.getBody()); - auto func = rewriter.create(module->getLoc(), name, fType); + auto func = func::FuncOp::create(rewriter, module->getLoc(), name, fType); rewriter.restoreInsertionPoint(insPt); return func; } @@ -199,40 +203,43 @@ void factory::createGlobalCtorCall(ModuleOp mod, FlatSymbolRefAttr ctor) { auto i32Ty = builder.getI32Type(); constexpr int prio = 17; auto prioAttr = ArrayAttr::get(ctx, {IntegerAttr::get(i32Ty, prio)}); - builder.create(loc, ctorAttr, prioAttr); + llvm::SmallVector data; + data.push_back(mlir::LLVM::ZeroAttr::get(mod.getContext())); + LLVM::GlobalCtorsOp::create(builder, loc, ctorAttr, prioAttr, + ArrayAttr::get(ctx, data)); } cc::LoopOp factory::createInvariantLoop( OpBuilder &builder, Location loc, Value totalIterations, llvm::function_ref bodyBuilder) { - Value zero = builder.create(loc, 0, 64); - 
Value one = builder.create(loc, 1, 64); + Value zero = arith::ConstantIntOp::create(builder, loc, 0, 64); + Value one = arith::ConstantIntOp::create(builder, loc, 1, 64); Type i64Ty = builder.getI64Type(); SmallVector inputs = {zero}; SmallVector resultTys = {i64Ty}; - auto loop = builder.create( - loc, resultTys, inputs, /*postCondition=*/false, + auto loop = cc::LoopOp::create( + builder, loc, resultTys, inputs, /*postCondition=*/false, [&](OpBuilder &builder, Location loc, Region ®ion) { cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty}); auto &block = *builder.getBlock(); - Value cmpi = builder.create( - loc, arith::CmpIPredicate::slt, block.getArgument(0), - totalIterations); - builder.create(loc, cmpi, block.getArguments()); + Value cmpi = + arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::slt, + block.getArgument(0), totalIterations); + cc::ConditionOp::create(builder, loc, cmpi, block.getArguments()); }, [&](OpBuilder &builder, Location loc, Region ®ion) { cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty}); auto &block = *builder.getBlock(); bodyBuilder(builder, loc, region, block); - builder.create(loc, block.getArguments()); + cc::ContinueOp::create(builder, loc, block.getArguments()); }, [&](OpBuilder &builder, Location loc, Region ®ion) { cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty}); auto &block = *builder.getBlock(); auto incr = - builder.create(loc, block.getArgument(0), one); - builder.create(loc, ValueRange{incr}); + arith::AddIOp::create(builder, loc, block.getArgument(0), one); + cc::ContinueOp::create(builder, loc, ValueRange{incr}); }); loop->setAttr("invariant", builder.getUnitAttr()); return loop; @@ -252,7 +259,9 @@ Value factory::createLLVMTemporary(Location loc, OpBuilder &builder, Type type, OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(entryBlock); Value len = genLlvmI64Constant(loc, builder, size); - return builder.create(loc, type, 
ArrayRef{len}); + return LLVM::AllocaOp::create( + builder, loc, LLVM::LLVMPointerType::get(builder.getContext()), type, + len); } Value factory::createTemporary(Location loc, OpBuilder &builder, Type type, @@ -266,8 +275,8 @@ Value factory::createTemporary(Location loc, OpBuilder &builder, Type type, assert(entryBlock && "function must have an entry block"); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(entryBlock); - Value len = builder.create(loc, size, 64); - return builder.create(loc, type, len); + Value len = arith::ConstantIntOp::create(builder, loc, size, 64); + return cudaq::cc::AllocaOp::create(builder, loc, type, len); } // This builder will transform the monotonic loop into an invariant loop during @@ -284,44 +293,45 @@ cc::LoopOp factory::createMonotonicLoop( assert(succeeded(loadedIntrinsic) && "loading intrinsic should never fail"); auto i64Ty = builder.getI64Type(); Value begin = - builder.create(loc, i64Ty, start, cc::CastOpMode::Signed); + cc::CastOp::create(builder, loc, i64Ty, start, cc::CastOpMode::Signed); Value stepBy = - builder.create(loc, i64Ty, step, cc::CastOpMode::Signed); + cc::CastOp::create(builder, loc, i64Ty, step, cc::CastOpMode::Signed); Value end = - builder.create(loc, i64Ty, stop, cc::CastOpMode::Signed); - Value zero = builder.create(loc, 0, 64); + cc::CastOp::create(builder, loc, i64Ty, stop, cc::CastOpMode::Signed); + Value zero = arith::ConstantIntOp::create(builder, loc, 0, 64); SmallVector inputs = {zero, begin}; SmallVector resultTys = {i64Ty, i64Ty}; - auto totalIters = builder.create( - loc, i64Ty, getCudaqSizeFromTriple, ValueRange{begin, end, stepBy}); - auto loop = builder.create( - loc, resultTys, inputs, /*postCondition=*/false, + auto totalIters = + func::CallOp::create(builder, loc, i64Ty, getCudaqSizeFromTriple, + ValueRange{begin, end, stepBy}); + auto loop = cc::LoopOp::create( + builder, loc, resultTys, inputs, /*postCondition=*/false, [&](OpBuilder &builder, Location loc, 
Region ®ion) { cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty, i64Ty}); auto &block = *builder.getBlock(); - Value cmpi = builder.create( - loc, arith::CmpIPredicate::slt, block.getArgument(0), + Value cmpi = arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::slt, block.getArgument(0), totalIters.getResult(0)); - builder.create(loc, cmpi, block.getArguments()); + cc::ConditionOp::create(builder, loc, cmpi, block.getArguments()); }, [&](OpBuilder &builder, Location loc, Region ®ion) { cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty, i64Ty}); auto &block = *builder.getBlock(); bodyBuilder(builder, loc, region, block); - builder.create(loc, block.getArguments()); + cc::ContinueOp::create(builder, loc, block.getArguments()); }, [&](OpBuilder &builder, Location loc, Region ®ion) { cc::RegionBuilderGuard guard(builder, loc, region, TypeRange{i64Ty, i64Ty}); auto &block = *builder.getBlock(); - auto one = builder.create(loc, 1, 64); + auto one = arith::ConstantIntOp::create(builder, loc, 1, 64); Value count = - builder.create(loc, block.getArgument(0), one); + arith::AddIOp::create(builder, loc, block.getArgument(0), one); Value incr = - builder.create(loc, block.getArgument(1), stepBy); - builder.create(loc, ValueRange{count, incr}); + arith::AddIOp::create(builder, loc, block.getArgument(1), stepBy); + cc::ContinueOp::create(builder, loc, ValueRange{count, incr}); }); loop->setAttr("invariant", builder.getUnitAttr()); return loop; @@ -508,7 +518,7 @@ static bool shouldExpand(SmallVectorImpl &packedTys, } else if (theSet.size() == 1) { packedTys[packIdx] = theSet[0]; } else { - assert(theSet[0] == FloatType::getF32(ctx) && "must be float"); + assert(theSet[0] == Float32Type::get(ctx) && "must be float"); packedTys[packIdx] = VectorType::get(ArrayRef{2}, theSet[0]); } @@ -743,9 +753,9 @@ Value factory::createCast(OpBuilder &builder, Location loc, Type toType, return fromValue; auto unit = 
UnitAttr::get(builder.getContext()); UnitAttr none; - return builder.create(loc, toType, fromValue, - signExtend ? unit : none, - zeroExtend ? unit : none); + return cudaq::cc::CastOp::create(builder, loc, toType, fromValue, + signExtend ? unit : none, + zeroExtend ? unit : none); } std::vector> @@ -796,7 +806,7 @@ factory::getOrAddFunc(mlir::Location loc, mlir::StringRef funcName, OpBuilder::InsertionGuard guard(build); build.setInsertionPointToEnd(module.getBody()); SmallVector attrs; - func = build.create(loc, funcName, funcTy, attrs); + func = func::FuncOp::create(build, loc, funcName, funcTy, attrs); func.setPrivate(); return {func, /*defined=*/false}; } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index c611b15a1f5..fd3857fe53d 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -75,7 +75,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { )#"}, {cudaq::runtime::deviceCodeHolderAdd, {}, R"#( - llvm.func @__cudaq_deviceCodeHolderAdd(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} + llvm.func @__cudaq_deviceCodeHolderAdd(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} )#"}, {cudaq::runtime::getLinkableKernelKey, {}, R"#( @@ -220,7 +220,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { %false = arith.constant false %to0 = cc.cast %dest : (!cc.ptr) -> !cc.ptr %from0 = cc.cast %src : (!cc.ptr>) -> !cc.ptr - call @llvm.memcpy.p0i8.p0i8.i64(%to0, %from0, %len, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + call @llvm.memcpy.p0.p0.i64(%to0, %from0, %len, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () return } )#"}, @@ -272,11 +272,11 @@ static constexpr IntrinsicCode intrinsicTable[] = { %3 = call @malloc(%2) : (i64) -> !cc.ptr %10 = cc.cast %3 : (!cc.ptr) -> !cc.ptr> %false = arith.constant false - call @llvm.memcpy.p0i8.p0i8.i64(%3, %arg0, %arg1, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + call @llvm.memcpy.p0.p0.i64(%3, %arg0, %arg1, 
%false) : (!cc.ptr, !cc.ptr, i64, i1) -> () %4 = cc.compute_ptr %arg2[0] : (!cc.ptr, i64}>>) -> !cc.ptr> %5 = cc.load %4 : !cc.ptr> %6 = cc.compute_ptr %10[%arg1] : (!cc.ptr>, i64) -> !cc.ptr - call @llvm.memcpy.p0i8.p0i8.i64(%6, %5, %1, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + call @llvm.memcpy.p0.p0.i64(%6, %5, %1, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () %7 = cc.undef !cc.struct<{!cc.ptr, i64}> %8 = cc.insert_value %7[0], %3 : (!cc.struct<{!cc.ptr, i64}>, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> %9 = cc.insert_value %8[1], %2 : (!cc.struct<{!cc.ptr, i64}>, i64) -> !cc.struct<{!cc.ptr, i64}> @@ -401,7 +401,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { %size = arith.muli %arg1, %arg2 : i64 %0 = call @malloc(%size) : (i64) -> !cc.ptr %false = arith.constant false - call @llvm.memcpy.p0i8.p0i8.i64(%0, %arg0, %size, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + call @llvm.memcpy.p0.p0.i64(%0, %arg0, %size, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () return %0 : !cc.ptr } )#"}, @@ -412,7 +412,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { {"__nvqpp_vectorCopyToStack", {cudaq::llvmMemCopyIntrinsic, "free"}, R"#( func.func private @__nvqpp_vectorCopyToStack(%to: !cc.ptr, %from: !cc.ptr, %size: i64) { %false = arith.constant false - call @llvm.memcpy.p0i8.p0i8.i64(%to, %from, %size, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + call @llvm.memcpy.p0.p0.i64(%to, %from, %size, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () call @free(%from) : (!cc.ptr) -> () return })#"}, @@ -502,7 +502,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { "func.func private @cudaqRegisterKernelName(!cc.ptr) -> ()"}, {cudaq::runtime::CudaqRegisterLambdaName, {}, R"#( - llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} + llvm.func @cudaqRegisterLambdaName(!llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"} )#"}, {"free", {}, "func.func private @free(!cc.ptr) -> ()"}, @@ -513,15 +513,15 @@ static constexpr 
IntrinsicCode intrinsicTable[] = { func.func private @hybridLaunchKernel(!cc.ptr, !cc.ptr, !cc.ptr, i64, i64, !cc.ptr) -> !cc.struct<{!cc.ptr, i64}> )#"}, - // llvm.memcpy.p0i8.p0i8.i64 + // llvm.memcpy.p0.p0.i64 {cudaq::llvmMemCopyIntrinsic, {}, R"#( - func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) -> () + func.func private @llvm.memcpy.p0.p0.i64(!cc.ptr, !cc.ptr, i64, i1) -> () )#"}, - {cudaq::llvmMemSetIntrinsic, // llvm.memset.p0i8.i64 + {cudaq::llvmMemSetIntrinsic, // llvm.memset.p0.i64 {}, R"#( - func.func private @llvm.memset.p0i8.i64(!cc.ptr, i8, i64, i1) -> ())#"}, + func.func private @llvm.memset.p0.i64(!cc.ptr, i8, i64, i1) -> ())#"}, // NB: load llvmStackSave to get both. {cudaq::llvmStackRestore, @@ -647,8 +647,8 @@ static constexpr IntrinsicCode intrinsicTable[] = { !qir_array = !cc.ptr !qir_qubit = !cc.ptr !qir_result = !cc.ptr - !qir_charptr = !cc.ptr - !qir_llvmptr = !llvm.ptr + !qir_charptr = !cc.ptr + !qir_llvmptr = !llvm.ptr )#"}, // Use the obsolete LLVM opaque struct type. 
{"qir_opaque_struct", {}, R"#( @@ -656,7 +656,7 @@ static constexpr IntrinsicCode intrinsicTable[] = { !qir_qubit = !cc.ptr> !qir_result = !cc.ptr> !qir_charptr = !cc.ptr - !qir_llvmptr = !llvm.ptr + !qir_llvmptr = !llvm.ptr )#"}, // streamlinedLaunchKernel(kernelName, vectorArgPtrs) @@ -697,7 +697,7 @@ LLVM::GlobalOp IRBuilder::genCStringLiteral(Location loc, ModuleOp module, auto stringAttr = getStringAttr(cstring); OpBuilder::InsertionGuard guard(*this); setInsertionPointToEnd(module.getBody()); - return create(loc, cstringTy, /*isConstant=*/true, + return LLVM::GlobalOp::create(*this, loc, cstringTy, /*isConstant=*/true, LLVM::Linkage::Private, uniqName, stringAttr, /*alignment=*/0); } @@ -825,9 +825,9 @@ static cc::GlobalOp buildVectorOfConstantElements(Location loc, ModuleOp module, builder.setInsertionPointToEnd(module.getBody()); auto globalTy = cc::ArrayType::get(ctx, eleTy, arrayAttr.size()); auto global = - builder.create(loc, globalTy, name, arrayAttr, - /*constant=*/true, - /*external=*/false); + cudaq::cc::GlobalOp::create(builder, loc, globalTy, name, arrayAttr, + /*constant=*/true, + /*external=*/false); global.setPrivate(); return global; } diff --git a/lib/Optimizer/Builder/Marshal.cpp b/lib/Optimizer/Builder/Marshal.cpp index 7c272eb3f12..9cf7a481322 100644 --- a/lib/Optimizer/Builder/Marshal.cpp +++ b/lib/Optimizer/Builder/Marshal.cpp @@ -28,21 +28,22 @@ Value genStringLength(Location loc, OpBuilder &builder, Value stringArg, if constexpr (FromQPU) { Type stringTy = stringArg.getType(); assert(isa(stringTy)); - return builder.create(loc, builder.getI64Type(), - stringArg); + return cudaq::cc::StdvecSizeOp::create(builder, loc, builder.getI64Type(), + stringArg); } else /*constexpr */ { Type stringTy = stringArg.getType(); assert(isa(stringTy) && isa( cast(stringTy).getElementType()) && "host side string expected"); - auto callArg = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), stringArg); + auto callArg = 
cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(builder.getI8Type()), + stringArg); StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) ? cudaq::runtime::getPauliWordSize : cudaq::runtime::bindingGetStringSize; - auto lenRes = builder.create(loc, builder.getI64Type(), - helperName, ValueRange{callArg}); + auto lenRes = func::CallOp::create(builder, loc, builder.getI64Type(), + helperName, ValueRange{callArg}); return lenRes.getResult(0); } } @@ -70,8 +71,8 @@ Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { if constexpr (FromQPU) { Type vecArgTy = vecArg.getType(); assert(isa(vecArgTy)); - return builder.create(loc, builder.getI64Type(), - vecArg); + return cudaq::cc::StdvecSizeOp::create(builder, loc, builder.getI64Type(), + vecArg); } else /* constexpr */ { auto vecTy = cast(vecArg.getType()); auto vecStructTy = cast(vecTy.getElementType()); @@ -82,24 +83,26 @@ Value genVectorSize(Location loc, OpBuilder &builder, Value vecArg) { auto vecElePtrTy = cudaq::cc::PointerType::get(vecStructTy.getMember(0)); // Get the pointer to the pointer of the end of the array - Value endPtr = builder.create( - loc, vecElePtrTy, vecArg, ArrayRef{1}); + Value endPtr = + cudaq::cc::ComputePtrOp::create(builder, loc, vecElePtrTy, vecArg, + ArrayRef{1}); // Get the pointer to the pointer of the beginning of the array - Value beginPtr = builder.create( - loc, vecElePtrTy, vecArg, ArrayRef{0}); + Value beginPtr = + cudaq::cc::ComputePtrOp::create(builder, loc, vecElePtrTy, vecArg, + ArrayRef{0}); // Load to a T* - endPtr = builder.create(loc, endPtr); - beginPtr = builder.create(loc, beginPtr); + endPtr = cudaq::cc::LoadOp::create(builder, loc, endPtr); + beginPtr = cudaq::cc::LoadOp::create(builder, loc, beginPtr); // Map those pointers to integers Type i64Ty = builder.getI64Type(); - Value endInt = builder.create(loc, i64Ty, endPtr); - Value beginInt = builder.create(loc, i64Ty, beginPtr); + Value endInt = 
cudaq::cc::CastOp::create(builder, loc, i64Ty, endPtr); + Value beginInt = cudaq::cc::CastOp::create(builder, loc, i64Ty, beginPtr); // Subtracting these will give us the size in bytes. - return builder.create(loc, endInt, beginInt); + return arith::SubIOp::create(builder, loc, endInt, beginInt); } } @@ -107,11 +110,11 @@ Value cudaq::opt::marshal::genComputeReturnOffset( Location loc, OpBuilder &builder, FunctionType funcTy, cudaq::cc::StructType msgStructTy) { if (funcTy.getNumResults() == 0) - return builder.create(loc, NoResultOffset, 64); + return arith::ConstantIntOp::create(builder, loc, NoResultOffset, 64); std::int32_t numKernelArgs = funcTy.getNumInputs(); auto i64Ty = builder.getI64Type(); - return builder.create(loc, i64Ty, msgStructTy, - ArrayRef{numKernelArgs}); + return cc::OffsetOfOp::create(builder, loc, i64Ty, msgStructTy, + ArrayRef{numKernelArgs}); } void cudaq::opt::marshal::genReturnOffsetFunction( @@ -120,13 +123,13 @@ void cudaq::opt::marshal::genReturnOffsetFunction( auto *ctx = builder.getContext(); auto i64Ty = builder.getI64Type(); auto funcTy = FunctionType::get(ctx, {}, {i64Ty}); - auto returnOffsetFunc = - builder.create(loc, classNameStr + ".returnOffset", funcTy); + auto returnOffsetFunc = func::FuncOp::create( + builder, loc, classNameStr + ".returnOffset", funcTy); OpBuilder::InsertionGuard guard(builder); auto *entry = returnOffsetFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); auto result = genComputeReturnOffset(loc, builder, devKernelTy, msgStructTy); - builder.create(loc, result); + func::ReturnOp::create(builder, loc, result); } static cudaq::cc::PointerType getByteAddressableType(OpBuilder &builder) { @@ -159,10 +162,10 @@ genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module, auto fTy = cast(eTy).getMember(0); auto tTy = cast(fTy).getElementType(); auto i64Ty = builder.getI64Type(); - auto eleSize = builder.create(loc, i64Ty, tTy); - Value count = builder.create(loc, size, eleSize); - 
auto ate = builder.create(loc, 8, 64); - size = builder.create(loc, count, ate); + auto eleSize = cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, tTy); + Value count = arith::DivSIOp::create(builder, loc, size, eleSize); + auto ate = arith::ConstantIntOp::create(builder, loc, 8, 64); + size = arith::MulIOp::create(builder, loc, count, ate); return {size, count}; } @@ -171,10 +174,10 @@ genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module, if (isa(eleTy)) { auto arrTy = cudaq::opt::factory::genHostStringType(module); auto words = - builder.create(loc, arrTy.getSize() / 8, 64); - size = builder.create(loc, size, words); - auto ate = builder.create(loc, 8, 64); - Value count = builder.create(loc, size, ate); + arith::ConstantIntOp::create(builder, loc, arrTy.getSize() / 8, 64); + size = arith::DivSIOp::create(builder, loc, size, words); + auto ate = arith::ConstantIntOp::create(builder, loc, 8, 64); + Value count = arith::DivSIOp::create(builder, loc, size, ate); return {size, count}; } @@ -186,11 +189,11 @@ genByteSizeAndElementCount(Location loc, OpBuilder &builder, ModuleOp module, auto vecEleTy = cast(vecEleRefTy).getElementType(); auto i64Ty = builder.getI64Type(); auto hostStrSize = - builder.create(loc, i64Ty, vecEleTy); - Value count = builder.create(loc, size, hostStrSize); + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, vecEleTy); + Value count = arith::DivSIOp::create(builder, loc, size, hostStrSize); Type packedTy = cudaq::opt::factory::genArgumentBufferType(eleTy); - auto packSize = builder.create(loc, i64Ty, packedTy); - size = builder.create(loc, count, packSize); + auto packSize = cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, packedTy); + size = arith::MulIOp::create(builder, loc, count, packSize); return {size, count}; } return {}; @@ -255,10 +258,10 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, cudaq::opt::factory::stlVectorType(stdvecTy.getElementType()); Value tmp = 
preallocated.has_value() ? *preallocated - : builder.create(loc, stdvecHostTy); - builder.create(loc, std::nullopt, - cudaq::stdvecBoolUnpackToInitList, - ArrayRef{tmp, arg, heapTracker}); + : cudaq::cc::AllocaOp::create(builder, loc, stdvecHostTy); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::stdvecBoolUnpackToInitList, + ArrayRef{tmp, arg, heapTracker}); return {tmp, true}; } @@ -271,20 +274,21 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, auto argVecTy = cast(ptrArgTy.getElementType()); auto subVecPtrTy = cudaq::cc::PointerType::get(argVecTy.getMember(0)); // Compute the pointer to the pointer to the first T element. - auto inputRef = builder.create( - loc, subVecPtrTy, arg, ArrayRef{0}); - auto startInput = builder.create(loc, inputRef); + auto inputRef = cudaq::cc::ComputePtrOp::create( + builder, loc, subVecPtrTy, arg, ArrayRef{0}); + auto startInput = cudaq::cc::LoadOp::create(builder, loc, inputRef); auto startTy = startInput.getType(); auto subArrTy = cudaq::cc::ArrayType::get( cast(startTy).getElementType()); - auto input = builder.create( - loc, cudaq::cc::PointerType::get(subArrTy), startInput); + auto input = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(subArrTy), startInput); auto transientTy = convertToTransientType(sty, module); auto tmp = [&]() -> Value { if (preallocated) - return builder.create( - loc, cudaq::cc::PointerType::get(transientTy), *preallocated); - return builder.create(loc, transientTy); + return cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(transientTy), + *preallocated); + return cudaq::cc::AllocaOp::create(builder, loc, transientTy); }(); Value sizeDelta = genVectorSize(loc, builder, arg); auto count = [&]() -> Value { @@ -293,39 +297,39 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, sizeDelta, arg, sty); return p.second; } - auto sizeEle = builder.create( - loc, builder.getI64Type(), seleTy); - return 
builder.create(loc, sizeDelta, sizeEle); + auto sizeEle = cudaq::cc::SizeOfOp::create(builder, loc, + builder.getI64Type(), seleTy); + return arith::DivSIOp::create(builder, loc, sizeDelta, sizeEle); }(); auto transEleTy = cast(transientTy).getMember(0); auto dataTy = cast(transEleTy).getElementType(); auto sizeTransientTy = - builder.create(loc, builder.getI64Type(), dataTy); + cudaq::cc::SizeOfOp::create(builder, loc, builder.getI64Type(), dataTy); Value sizeInBytes = - builder.create(loc, count, sizeTransientTy); + arith::MulIOp::create(builder, loc, count, sizeTransientTy); // Create a new vector that we'll store the converted data into. - Value byteBuffer = builder.create( - loc, builder.getI8Type(), sizeInBytes); + Value byteBuffer = cudaq::cc::AllocaOp::create( + builder, loc, builder.getI8Type(), sizeInBytes); // Initialize the temporary vector. auto vecEleTy = cudaq::cc::PointerType::get(transEleTy); - auto tmpBegin = builder.create( - loc, vecEleTy, tmp, ArrayRef{0}); + auto tmpBegin = cudaq::cc::ComputePtrOp::create( + builder, loc, vecEleTy, tmp, ArrayRef{0}); auto bufferBegin = - builder.create(loc, transEleTy, byteBuffer); - builder.create(loc, bufferBegin, tmpBegin); - auto tmpEnd = builder.create( - loc, vecEleTy, tmp, ArrayRef{1}); - auto byteBufferEnd = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI8Type()), byteBuffer, - ArrayRef{sizeInBytes}); + cudaq::cc::CastOp::create(builder, loc, transEleTy, byteBuffer); + cudaq::cc::StoreOp::create(builder, loc, bufferBegin, tmpBegin); + auto tmpEnd = cudaq::cc::ComputePtrOp::create( + builder, loc, vecEleTy, tmp, ArrayRef{1}); + auto byteBufferEnd = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(builder.getI8Type()), + byteBuffer, ArrayRef{sizeInBytes}); auto bufferEnd = - builder.create(loc, transEleTy, byteBufferEnd); - builder.create(loc, bufferEnd, tmpEnd); - auto tmpEnd2 = builder.create( - loc, vecEleTy, tmp, ArrayRef{2}); - builder.create(loc, 
bufferEnd, tmpEnd2); + cudaq::cc::CastOp::create(builder, loc, transEleTy, byteBufferEnd); + cudaq::cc::StoreOp::create(builder, loc, bufferEnd, tmpEnd); + auto tmpEnd2 = cudaq::cc::ComputePtrOp::create( + builder, loc, vecEleTy, tmp, ArrayRef{2}); + cudaq::cc::StoreOp::create(builder, loc, bufferEnd, tmpEnd2); // Loop over each element in the outer vector and initialize it to the inner // vector value. The data may be heap allocated.) @@ -333,16 +337,17 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, auto transientBufferTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(transientEleTy)); auto buffer = - builder.create(loc, transientBufferTy, byteBuffer); + cudaq::cc::CastOp::create(builder, loc, transientBufferTy, byteBuffer); cudaq::opt::factory::createInvariantLoop( builder, loc, count, [&](OpBuilder &builder, Location loc, Region &, Block &block) { Value i = block.getArgument(0); - Value inp = builder.create( - loc, startTy, input, ArrayRef{i}); - auto currentVector = builder.create( - loc, cudaq::cc::PointerType::get(transientEleTy), buffer, + Value inp = cudaq::cc::ComputePtrOp::create( + builder, loc, startTy, input, + ArrayRef{i}); + auto currentVector = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(transientEleTy), buffer, ArrayRef{i}); convertAllStdVectorBool(loc, builder, module, inp, seleTy, heapTracker, currentVector); @@ -360,21 +365,21 @@ convertAllStdVectorBool(Location loc, OpBuilder &builder, ModuleOp module, // we'll store the converted data into. auto buffer = [&]() -> Value { if (preallocated) - return builder.create( - loc, cudaq::cc::PointerType::get(bufferTy), *preallocated); - return builder.create(loc, bufferTy); + return cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(bufferTy), *preallocated); + return cudaq::cc::AllocaOp::create(builder, loc, bufferTy); }(); // Loop over each element. Replace each with the converted value. 
for (auto iter : llvm::enumerate(sty.getMembers())) { std::int32_t i = iter.index(); Type memTy = iter.value(); - auto fromPtr = builder.create( - loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg, + auto fromPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(argStrTy.getMember(i)), arg, ArrayRef{i}); auto transientTy = convertToTransientType(memTy, module); - Value toPtr = builder.create( - loc, cudaq::cc::PointerType::get(transientTy), buffer, + Value toPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(transientTy), buffer, ArrayRef{i}); convertAllStdVectorBool(loc, builder, module, fromPtr, memTy, heapTracker, toPtr); @@ -425,30 +430,30 @@ Value descendThroughDynamicType(Location loc, OpBuilder &builder, // type, so walk over the vector and recurse on each element. // `size` is already the proper size of the lengths of each of the // elements in turn. - builder.create(loc, size, tmp); + cudaq::cc::StoreOp::create(builder, loc, size, tmp); auto ptrTy = cast(arg.getType()); auto strTy = cast(ptrTy.getElementType()); auto memTy = cast(strTy.getMember(0)); auto arrTy = cudaq::cc::PointerType::get(cudaq::cc::PointerType::get( cudaq::cc::ArrayType::get(memTy.getElementType()))); - auto castPtr = builder.create(loc, arrTy, arg); - auto castArg = builder.create(loc, castPtr); + auto castPtr = cudaq::cc::CastOp::create(builder, loc, arrTy, arg); + auto castArg = cudaq::cc::LoadOp::create(builder, loc, castPtr); auto castPtrTy = cudaq::cc::PointerType::get(memTy.getElementType()); cudaq::opt::factory::createInvariantLoop( builder, loc, count, [&](OpBuilder &builder, Location loc, Region &, Block &block) { Value i = block.getArgument(0); - auto ai = builder.create( - loc, castPtrTy, castArg, + auto ai = cudaq::cc::ComputePtrOp::create( + builder, loc, castPtrTy, castArg, ArrayRef{i}); - auto tmpVal = builder.create(loc, tmp); + auto tmpVal = cudaq::cc::LoadOp::create(builder, loc, tmp); Value 
innerSize = descendThroughDynamicType( loc, builder, module, eleTy, tmpVal, ai, tmp); - builder.create(loc, innerSize, tmp); + cudaq::cc::StoreOp::create(builder, loc, innerSize, tmp); }); - return builder.create(loc, tmp); + return cudaq::cc::LoadOp::create(builder, loc, tmp); }) // A struct can be dynamic if it contains dynamic members. Get the // static portion of the struct first, which will have length slots. @@ -457,7 +462,7 @@ Value descendThroughDynamicType(Location loc, OpBuilder &builder, if (cudaq::cc::isDynamicType(t)) { Type packedTy = cudaq::opt::factory::genArgumentBufferType(t); Value strSize = - builder.create(loc, i64Ty, packedTy); + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, packedTy); for (auto iter : llvm::enumerate(t.getMembers())) { std::int32_t i = iter.index(); auto m = iter.value(); @@ -466,20 +471,21 @@ Value descendThroughDynamicType(Location loc, OpBuilder &builder, auto hostStrTy = cast(hostPtrTy.getElementType()); auto pm = cudaq::cc::PointerType::get(hostStrTy.getMember(i)); - auto ai = builder.create( - loc, pm, arg, ArrayRef{i}); + auto ai = cudaq::cc::ComputePtrOp::create( + builder, loc, pm, arg, + ArrayRef{i}); strSize = descendThroughDynamicType( loc, builder, module, m, strSize, ai, tmp); } } return strSize; } - return builder.create(loc, i64Ty, t); + return cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, t); }) .Default([&](Type t) -> Value { - return builder.create(loc, i64Ty, t); + return cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, t); }); - return builder.create(loc, tySize, addend); + return arith::AddIOp::create(builder, loc, tySize, addend); } template @@ -488,7 +494,7 @@ Value genSizeOfDynamicMessageBufferImpl( cudaq::cc::StructType structTy, ArrayRef> zippy, Value tmp) { auto i64Ty = builder.getI64Type(); - Value initSize = builder.create(loc, i64Ty, structTy); + Value initSize = cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, structTy); for (auto [_, a, t] : zippy) if (cudaq::cc::isDynamicType(t)) 
initSize = descendThroughDynamicType(loc, builder, module, t, @@ -516,28 +522,29 @@ template Value populateStringAddendum(Location loc, OpBuilder &builder, Value host, Value sizeSlot, Value addendum, ModuleOp module) { Value size = genStringLength(loc, builder, host, module); - builder.create(loc, size, sizeSlot); + cudaq::cc::StoreOp::create(builder, loc, size, sizeSlot); auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); Value dataPtr; if constexpr (FromQPU) { - dataPtr = builder.create(loc, ptrI8Ty, host); + dataPtr = cudaq::cc::StdvecDataOp::create(builder, loc, ptrI8Ty, host); } else /*constexpr*/ { - auto fromPtr = builder.create(loc, ptrI8Ty, host); + auto fromPtr = cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, host); StringRef helperName = module->getAttr(cudaq::runtime::sizeofStringAttrName) ? cudaq::runtime::getPauliWordData : cudaq::runtime::bindingGetStringData; - auto call = builder.create(loc, ptrI8Ty, helperName, - ValueRange{fromPtr}); + auto call = func::CallOp::create(builder, loc, ptrI8Ty, helperName, + ValueRange{fromPtr}); dataPtr = call.getResult(0); } - auto notVolatile = builder.create(loc, 0, 1); - auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, dataPtr, size, notVolatile}); + auto notVolatile = arith::ConstantIntOp::create(builder, loc, 0, 1); + auto toPtr = cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, addendum); + func::CallOp::create(builder, loc, TypeRange{}, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, dataPtr, size, notVolatile}); auto ptrI8Arr = getByteAddressableType(builder); - auto addBytes = builder.create(loc, ptrI8Arr, addendum); - return builder.create( - loc, ptrI8Ty, addBytes, ArrayRef{size}); + auto addBytes = cudaq::cc::CastOp::create(builder, loc, ptrI8Arr, addendum); + return cudaq::cc::ComputePtrOp::create( + builder, loc, ptrI8Ty, addBytes, + ArrayRef{size}); } // Simple case when the vector data is known 
to not hold dynamic data. @@ -545,7 +552,7 @@ template Value populateVectorAddendum(Location loc, OpBuilder &builder, Value host, Value sizeSlot, Value addendum) { Value size = genVectorSize(loc, builder, host); - builder.create(loc, size, sizeSlot); + cudaq::cc::StoreOp::create(builder, loc, size, sizeSlot); auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); auto ptrPtrI8 = cudaq::opt::marshal::getPointerToPointerType(builder); Value dataPtr = [&]() -> Value { @@ -553,21 +560,22 @@ Value populateVectorAddendum(Location loc, OpBuilder &builder, Value host, auto eleTy = cast(host.getType()).getElementType(); auto ptrTy = cudaq::cc::PointerType::get(eleTy); auto vecDataPtr = - builder.create(loc, ptrTy, host); - return builder.create(loc, ptrI8Ty, vecDataPtr); + cudaq::cc::StdvecDataOp::create(builder, loc, ptrTy, host); + return cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, vecDataPtr); } else /*constexpr*/ { - auto fromPtrPtr = builder.create(loc, ptrPtrI8, host); - return builder.create(loc, fromPtrPtr); + auto fromPtrPtr = cudaq::cc::CastOp::create(builder, loc, ptrPtrI8, host); + return cudaq::cc::LoadOp::create(builder, loc, fromPtrPtr); } }(); - auto notVolatile = builder.create(loc, 0, 1); - auto toPtr = builder.create(loc, ptrI8Ty, addendum); - builder.create(loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, - ValueRange{toPtr, dataPtr, size, notVolatile}); + auto notVolatile = arith::ConstantIntOp::create(builder, loc, 0, 1); + auto toPtr = cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, addendum); + func::CallOp::create(builder, loc, TypeRange{}, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, dataPtr, size, notVolatile}); auto ptrI8Arr = getByteAddressableType(builder); - auto addBytes = builder.create(loc, ptrI8Arr, addendum); - return builder.create( - loc, ptrI8Ty, addBytes, ArrayRef{size}); + auto addBytes = cudaq::cc::CastOp::create(builder, loc, ptrI8Arr, addendum); + return cudaq::cc::ComputePtrOp::create( + builder, loc, ptrI8Ty, 
addBytes, + ArrayRef{size}); } template @@ -585,16 +593,16 @@ Value populateDynamicAddendum(Location loc, OpBuilder &builder, ModuleOp module, auto [bytes, count] = genByteSizeAndElementCount( loc, builder, module, eleTy, size, host, devArgTy); size = bytes; - builder.create(loc, size, sizeSlot); + cudaq::cc::StoreOp::create(builder, loc, size, sizeSlot); // Convert from bytes to vector length in elements. // Compute new addendum start. auto addrTy = getByteAddressableType(builder); - auto castEnd = builder.create(loc, addrTy, addendum); - Value newAddendum = builder.create( - loc, addendum.getType(), castEnd, + auto castEnd = cudaq::cc::CastOp::create(builder, loc, addrTy, addendum); + Value newAddendum = cudaq::cc::ComputePtrOp::create( + builder, loc, addendum.getType(), castEnd, ArrayRef{size}); - builder.create(loc, newAddendum, addendumScratch); + cudaq::cc::StoreOp::create(builder, loc, newAddendum, addendumScratch); Type dataTy = cudaq::opt::factory::genArgumentBufferType(eleTy); auto arrDataTy = cudaq::cc::ArrayType::get(dataTy); auto sizeBlockTy = cudaq::cc::PointerType::get(arrDataTy); @@ -605,7 +613,7 @@ Value populateDynamicAddendum(Location loc, OpBuilder &builder, ModuleOp module, // and expressed in bytes. Each size will be the size of the span of the // element (or its subfields) at that offset. auto sizeBlock = - builder.create(loc, sizeBlockTy, addendum); + cudaq::cc::CastOp::create(builder, loc, sizeBlockTy, addendum); auto hostEleTy = cast(host.getType()).getElementType(); auto ptrPtrBlockTy = cudaq::cc::PointerType::get( @@ -615,14 +623,15 @@ Value populateDynamicAddendum(Location loc, OpBuilder &builder, ModuleOp module, // "front" out of the vector (the first pointer in the triple) and step // over the contiguous range of vectors in the host block. The vector of // vectors forms a ragged array structure in host memory. 
- auto hostBeginPtrRef = builder.create( - loc, ptrPtrBlockTy, host, ArrayRef{0}); - auto hostBegin = builder.create(loc, hostBeginPtrRef); + auto hostBeginPtrRef = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrPtrBlockTy, host, + ArrayRef{0}); + auto hostBegin = cudaq::cc::LoadOp::create(builder, loc, hostBeginPtrRef); auto hostBeginEleTy = cast(hostBegin.getType()); auto hostBlockTy = cudaq::cc::PointerType::get( cudaq::cc::ArrayType::get(hostBeginEleTy.getElementType())); auto hostBlock = - builder.create(loc, hostBlockTy, hostBegin); + cudaq::cc::CastOp::create(builder, loc, hostBlockTy, hostBegin); // Loop over each vector element in the vector (recursively). cudaq::opt::factory::createInvariantLoop( @@ -630,19 +639,19 @@ Value populateDynamicAddendum(Location loc, OpBuilder &builder, ModuleOp module, [&](OpBuilder &builder, Location loc, Region &, Block &block) { Value i = block.getArgument(0); Value addm = - builder.create(loc, addendumScratch); - auto subSlot = builder.create( - loc, ptrDataTy, sizeBlock, + cudaq::cc::LoadOp::create(builder, loc, addendumScratch); + auto subSlot = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrDataTy, sizeBlock, ArrayRef{i}); - auto subHost = builder.create( - loc, hostBeginEleTy, hostBlock, + auto subHost = cudaq::cc::ComputePtrOp::create( + builder, loc, hostBeginEleTy, hostBlock, ArrayRef{i}); Value newAddm = populateDynamicAddendum( loc, builder, module, eleTy, subHost, subSlot, addm, addendumScratch); - builder.create(loc, newAddm, addendumScratch); + cudaq::cc::StoreOp::create(builder, loc, newAddm, addendumScratch); }); - return builder.create(loc, addendumScratch); + return cudaq::cc::LoadOp::create(builder, loc, addendumScratch); } return populateVectorAddendum(loc, builder, host, sizeSlot, addendum); @@ -656,23 +665,23 @@ Value populateDynamicAddendum(Location loc, OpBuilder &builder, ModuleOp module, auto hostPtrTy = cast(host.getType()); auto hostMemTy = cast(hostPtrTy.getElementType()) 
.getMember(iterIdx); - auto val = builder.create( - loc, cudaq::cc::PointerType::get(hostMemTy), host, + auto val = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(hostMemTy), host, ArrayRef{iterIdx}); Type iterTy = iter.value(); if (cudaq::cc::isDynamicType(iterTy)) { - Value fieldInSlot = builder.create( - loc, cudaq::cc::PointerType::get(builder.getI64Type()), sizeSlot, - ArrayRef{iterIdx}); + Value fieldInSlot = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(builder.getI64Type()), + sizeSlot, ArrayRef{iterIdx}); addendum = populateDynamicAddendum(loc, builder, module, iterTy, val, fieldInSlot, addendum, addendumScratch); } else { - Value fieldInSlot = builder.create( - loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, + Value fieldInSlot = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(iterTy), sizeSlot, ArrayRef{iterIdx}); - auto v = builder.create(loc, val); - builder.create(loc, v, fieldInSlot); + auto v = cudaq::cc::LoadOp::create(builder, loc, val); + cudaq::cc::StoreOp::create(builder, loc, v, fieldInSlot); } } return addendum; @@ -693,8 +702,9 @@ void populateMessageBufferImpl( // Get the address of the slot to be filled. auto memberTy = cast(structTy).getMember(i); auto ptrTy = cudaq::cc::PointerType::get(memberTy); - auto slot = builder.create( - loc, ptrTy, msgBufferBase, ArrayRef{i}); + auto slot = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrTy, msgBufferBase, + ArrayRef{i}); addendum = populateDynamicAddendum( loc, builder, module, devArgTy, arg, slot, addendum, addendumScratch); continue; @@ -711,8 +721,9 @@ void populateMessageBufferImpl( // Get the address of the slot to be filled. 
auto memberTy = cast(structTy).getMember(i); auto ptrTy = cudaq::cc::PointerType::get(memberTy); - Value slot = builder.create( - loc, ptrTy, msgBufferBase, ArrayRef{i}); + Value slot = + cudaq::cc::ComputePtrOp::create(builder, loc, ptrTy, msgBufferBase, + ArrayRef{i}); // Argument is a packaged kernel. In this case, the argument is some // unknown kernel that may be called. The packaged argument is coming @@ -721,9 +732,10 @@ void populateMessageBufferImpl( // launch kernel. if (isa(devArgTy)) { auto i64Ty = builder.getI64Type(); - auto kernKey = builder.create( - loc, i64Ty, cudaq::runtime::getLinkableKernelKey, ValueRange{arg}); - builder.create(loc, kernKey.getResult(0), slot); + auto kernKey = func::CallOp::create(builder, loc, i64Ty, + cudaq::runtime::getLinkableKernelKey, + ValueRange{arg}); + cudaq::cc::StoreOp::create(builder, loc, kernKey.getResult(0), slot); continue; } @@ -732,14 +744,14 @@ void populateMessageBufferImpl( // is a simulation and things are in the same address space, we pass the // pointer for convenience. 
if (isa(devArgTy)) - arg = builder.create(loc, memberTy, arg); + arg = cudaq::cc::CastOp::create(builder, loc, memberTy, arg); if (isa(arg.getType()) && (cudaq::cc::PointerType::get(arg.getType()) != slot.getType())) { - slot = builder.create( - loc, cudaq::cc::PointerType::get(arg.getType()), slot); + slot = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(arg.getType()), slot); } - builder.create(loc, arg, slot); + cudaq::cc::StoreOp::create(builder, loc, arg, slot); } } @@ -805,10 +817,10 @@ void cudaq::opt::marshal::genStdvecBoolFromInitList(Location loc, Value sret, Value data, Value size) { auto ptrTy = cc::PointerType::get(builder.getContext()); - auto castData = builder.create(loc, ptrTy, data); - auto castSret = builder.create(loc, ptrTy, sret); - builder.create(loc, std::nullopt, stdvecBoolCtorFromInitList, - ArrayRef{castSret, castData, size}); + auto castData = cc::CastOp::create(builder, loc, ptrTy, data); + auto castSret = cc::CastOp::create(builder, loc, ptrTy, sret); + func::CallOp::create(builder, loc, TypeRange{}, stdvecBoolCtorFromInitList, + ArrayRef{castSret, castData, size}); } void cudaq::opt::marshal::genStdvecTFromInitList(Location loc, @@ -818,59 +830,58 @@ void cudaq::opt::marshal::genStdvecTFromInitList(Location loc, auto i8Ty = builder.getI8Type(); auto stlVectorTy = cc::PointerType::get(opt::factory::stlVectorType(i8Ty)); auto ptrTy = cc::PointerType::get(i8Ty); - auto castSret = builder.create(loc, stlVectorTy, sret); + auto castSret = cc::CastOp::create(builder, loc, stlVectorTy, sret); auto ptrPtrTy = cc::PointerType::get(ptrTy); - auto sret0 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{0}); + auto sret0 = cc::ComputePtrOp::create(builder, loc, ptrPtrTy, castSret, + SmallVector{0}); auto arrI8Ty = cc::ArrayType::get(i8Ty); auto ptrArrTy = cc::PointerType::get(arrI8Ty); - auto buffPtr0 = builder.create(loc, ptrTy, data); - builder.create(loc, buffPtr0, sret0); - auto sret1 = builder.create( - loc, 
ptrPtrTy, castSret, SmallVector{1}); - Value byteLen = builder.create(loc, tSize, vecSize); - auto buffPtr = builder.create(loc, ptrArrTy, data); - auto endPtr = builder.create( - loc, ptrTy, buffPtr, SmallVector{byteLen}); - builder.create(loc, endPtr, sret1); - auto sret2 = builder.create( - loc, ptrPtrTy, castSret, SmallVector{2}); - builder.create(loc, endPtr, sret2); + auto buffPtr0 = cc::CastOp::create(builder, loc, ptrTy, data); + cc::StoreOp::create(builder, loc, buffPtr0, sret0); + auto sret1 = cc::ComputePtrOp::create(builder, loc, ptrPtrTy, castSret, + SmallVector{1}); + Value byteLen = arith::MulIOp::create(builder, loc, tSize, vecSize); + auto buffPtr = cc::CastOp::create(builder, loc, ptrArrTy, data); + auto endPtr = cc::ComputePtrOp::create( + builder, loc, ptrTy, buffPtr, SmallVector{byteLen}); + cc::StoreOp::create(builder, loc, endPtr, sret1); + auto sret2 = cc::ComputePtrOp::create(builder, loc, ptrPtrTy, castSret, + SmallVector{2}); + cc::StoreOp::create(builder, loc, endPtr, sret2); } Value cudaq::opt::marshal::createEmptyHeapTracker(Location loc, OpBuilder &builder) { auto ptrI8Ty = cc::PointerType::get(builder.getI8Type()); - auto result = builder.create(loc, ptrI8Ty); - auto zero = builder.create(loc, 0, 64); - auto null = builder.create(loc, ptrI8Ty, zero); - builder.create(loc, null, result); + auto result = cc::AllocaOp::create(builder, loc, ptrI8Ty); + auto zero = arith::ConstantIntOp::create(builder, loc, 0, 64); + auto null = cc::CastOp::create(builder, loc, ptrI8Ty, zero); + cc::StoreOp::create(builder, loc, null, result); return result; } void cudaq::opt::marshal::maybeFreeHeapAllocations(Location loc, OpBuilder &builder, Value heapTracker) { - auto head = builder.create(loc, heapTracker); - auto zero = builder.create(loc, 0, 64); - auto headAsInt = builder.create(loc, builder.getI64Type(), head); - auto cmp = builder.create(loc, arith::CmpIPredicate::ne, - headAsInt, zero); + auto head = cc::LoadOp::create(builder, loc, 
heapTracker); + auto zero = arith::ConstantIntOp::create(builder, loc, 0, 64); + auto headAsInt = cc::CastOp::create(builder, loc, builder.getI64Type(), head); + auto cmp = arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::ne, + headAsInt, zero); // If there are no std::vector to unpack, then the heapTracker will be // set to `nullptr` and otherwise unused. That will allow the compiler to DCE // this call after constant propagation. - builder.create( - loc, TypeRange{}, cmp, - [&](OpBuilder &builder, Location loc, Region ®ion) { - region.push_back(new Block()); - auto &body = region.front(); - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&body); - builder.create(loc, std::nullopt, - stdvecBoolFreeTemporaryLists, - ArrayRef{head}); - builder.create(loc); - }); + cc::IfOp::create(builder, loc, TypeRange{}, cmp, + [&](OpBuilder &builder, Location loc, Region ®ion) { + region.push_back(new Block()); + auto &body = region.front(); + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(&body); + func::CallOp::create(builder, loc, TypeRange{}, + stdvecBoolFreeTemporaryLists, + ArrayRef{head}); + cc::ContinueOp::create(builder, loc); + }); } /// Fetch an argument from the comm buffer. Here, the argument is not dynamic so @@ -881,33 +892,33 @@ Value fetchInputValue(Location loc, OpBuilder &builder, Type devTy, Value ptr) { if (isa(devTy)) { // An indirect callable passes a key value which will be used to determine // the kernel that is being called. - auto key = builder.create(loc, ptr); - return builder.create(loc, devTy, key); + auto key = cudaq::cc::LoadOp::create(builder, loc, ptr); + return cudaq::cc::CastOp::create(builder, loc, devTy, key); } if (isa(devTy)) { // A direct callable will have already been effectively inlined and this // argument should not be referenced. 
- return builder.create(loc, devTy); + return cudaq::cc::PoisonOp::create(builder, loc, devTy); } auto ptrDevTy = cudaq::cc::PointerType::get(devTy); if (auto strTy = dyn_cast(devTy)) { // Argument is a struct. if (strTy.isEmpty()) - return builder.create(loc, devTy); + return cudaq::cc::UndefOp::create(builder, loc, devTy); // Cast to avoid conflicts between layout compatible, distinct struct types. - auto structPtr = builder.create(loc, ptrDevTy, ptr); + auto structPtr = cudaq::cc::CastOp::create(builder, loc, ptrDevTy, ptr); if constexpr (FromQPU) { return structPtr; } else { - return builder.create(loc, structPtr); + return cudaq::cc::LoadOp::create(builder, loc, structPtr); } } // Default case: argument passed as a value inplace. - return builder.create(loc, ptr); + return cudaq::cc::LoadOp::create(builder, loc, ptr); } /// Helper routine to generate code to increment the trailing data pointer to @@ -916,10 +927,12 @@ static Value incrementTrailingDataPointer(Location loc, OpBuilder &builder, Value trailingData, Value bytes) { auto i8Ty = builder.getI8Type(); auto bufferTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)); - auto buffPtr = builder.create(loc, bufferTy, trailingData); + auto buffPtr = + cudaq::cc::CastOp::create(builder, loc, bufferTy, trailingData); auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); - return builder.create( - loc, i8PtrTy, buffPtr, ArrayRef{bytes}); + return cudaq::cc::ComputePtrOp::create( + builder, loc, i8PtrTy, buffPtr, + ArrayRef{bytes}); } /// In the thunk, we need to unpack any `std::vector` objects encoded in the @@ -957,11 +970,11 @@ constructDynamicInputValue(Location loc, OpBuilder &builder, Type devTy, if (auto charSpanTy = dyn_cast(devTy)) { // From host, so construct the stdvec span with it. 
auto eleTy = charSpanTy.getElementType(); - auto castTrailingData = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), trailingData); - Value vecLength = builder.create(loc, ptr); - auto result = builder.create( - loc, charSpanTy, castTrailingData, vecLength); + auto castTrailingData = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(eleTy), trailingData); + Value vecLength = cudaq::cc::LoadOp::create(builder, loc, ptr); + auto result = cudaq::cc::StdvecInitOp::create( + builder, loc, charSpanTy, castTrailingData, vecLength); auto nextTrailingData = incrementTrailingDataPointer(loc, builder, trailingData, vecLength); return {result, nextTrailingData}; @@ -979,9 +992,9 @@ constructDynamicInputValue(Location loc, OpBuilder &builder, Type devTy, // Get the size of each element in the vector and compute the vector's // logical length. - auto eleSize = builder.create(loc, i64Ty, buffEleTy); - Value bytes = builder.create(loc, ptr); - auto vecLength = builder.create(loc, bytes, eleSize); + auto eleSize = cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, buffEleTy); + Value bytes = cudaq::cc::LoadOp::create(builder, loc, ptr); + auto vecLength = arith::DivSIOp::create(builder, loc, bytes, eleSize); if (cudaq::cc::isDynamicType(eleTy)) { // The vector is recursively dynamic. @@ -998,7 +1011,7 @@ constructDynamicInputValue(Location loc, OpBuilder &builder, Type devTy, } }(); Value newVecData = - builder.create(loc, toTy, vecLength); + cudaq::cc::AllocaOp::create(builder, loc, toTy, vecLength); // Compute new trailing data, skipping the current vector's data. 
auto nextTrailingData = incrementTrailingDataPointer(loc, builder, trailingData, bytes); @@ -1011,34 +1024,34 @@ constructDynamicInputValue(Location loc, OpBuilder &builder, Type devTy, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(packTy)); Type packedEleTy = cudaq::cc::PointerType::get(packTy); auto arrPtr = - builder.create(loc, packedArrTy, trailingData); + cudaq::cc::CastOp::create(builder, loc, packedArrTy, trailingData); auto trailingDataVar = - builder.create(loc, nextTrailingData.getType()); - builder.create(loc, nextTrailingData, - trailingDataVar); + cudaq::cc::AllocaOp::create(builder, loc, nextTrailingData.getType()); + cudaq::cc::StoreOp::create(builder, loc, nextTrailingData, + trailingDataVar); cudaq::opt::factory::createInvariantLoop( builder, loc, vecLength, [&](OpBuilder &builder, Location loc, Region &, Block &block) { Value i = block.getArgument(0); auto nextTrailingData = - builder.create(loc, trailingDataVar); - auto vecMemPtr = builder.create( - loc, packedEleTy, arrPtr, + cudaq::cc::LoadOp::create(builder, loc, trailingDataVar); + auto vecMemPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, packedEleTy, arrPtr, ArrayRef{i}); auto r = constructDynamicInputValue( loc, builder, eleTy, vecMemPtr, nextTrailingData); - auto newVecPtr = builder.create( - loc, elePtrTy, newVecData, + auto newVecPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, elePtrTy, newVecData, ArrayRef{i}); - builder.create(loc, r.first, newVecPtr); - builder.create(loc, r.second, trailingDataVar); + cudaq::cc::StoreOp::create(builder, loc, r.first, newVecPtr); + cudaq::cc::StoreOp::create(builder, loc, r.second, trailingDataVar); }); // Create the new outer stdvec span as the result. 
- Value stdvecResult = builder.create( - loc, spanTy, newVecData, vecLength); + Value stdvecResult = cudaq::cc::StdvecInitOp::create( + builder, loc, spanTy, newVecData, vecLength); nextTrailingData = - builder.create(loc, trailingDataVar); + cudaq::cc::LoadOp::create(builder, loc, trailingDataVar); return {stdvecResult, nextTrailingData}; } @@ -1050,28 +1063,28 @@ constructDynamicInputValue(Location loc, OpBuilder &builder, Type devTy, auto *ctx = builder.getContext(); auto vecTy = cudaq::cc::StructType::get(ctx, ArrayRef{ptrTy, ptrTy, ptrTy}); - Value vecVar = builder.create(loc, vecTy); + Value vecVar = cudaq::cc::UndefOp::create(builder, loc, vecTy); Value castData = - builder.create(loc, ptrTy, trailingData); - vecVar = builder.create(loc, vecTy, vecVar, - castData, 0); + cudaq::cc::CastOp::create(builder, loc, ptrTy, trailingData); + vecVar = cudaq::cc::InsertValueOp::create(builder, loc, vecTy, vecVar, + castData, 0); auto ptrArrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy)); auto castTrailingData = - builder.create(loc, ptrArrTy, trailingData); - Value castEnd = builder.create( - loc, ptrTy, castTrailingData, + cudaq::cc::CastOp::create(builder, loc, ptrArrTy, trailingData); + Value castEnd = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrTy, castTrailingData, ArrayRef{bytes}); - vecVar = builder.create(loc, vecTy, vecVar, - castEnd, 1); - result = builder.create(loc, vecTy, vecVar, - castEnd, 2); + vecVar = cudaq::cc::InsertValueOp::create(builder, loc, vecTy, vecVar, + castEnd, 1); + result = cudaq::cc::InsertValueOp::create(builder, loc, vecTy, vecVar, + castEnd, 2); } else /*constexpr*/ { // From host, so construct the stdvec span with it. 
- auto castTrailingData = builder.create( - loc, cudaq::cc::PointerType::get(eleTy), trailingData); - result = builder.create( - loc, spanTy, castTrailingData, vecLength); + auto castTrailingData = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(eleTy), trailingData); + result = cudaq::cc::StdvecInitOp::create(builder, loc, spanTy, + castTrailingData, vecLength); } auto nextTrailingData = incrementTrailingDataPointer(loc, builder, trailingData, bytes); @@ -1086,27 +1099,27 @@ constructDynamicInputValue(Location loc, OpBuilder &builder, Type devTy, auto strTy = cast(devTy); auto ptrEleTy = cast(ptr.getType()).getElementType(); auto packedTy = cast(ptrEleTy); - Value result = builder.create(loc, strTy); + Value result = cudaq::cc::UndefOp::create(builder, loc, strTy); assert(strTy.getNumMembers() == packedTy.getNumMembers()); for (auto iter : llvm::enumerate(llvm::zip(strTy.getMembers(), packedTy.getMembers()))) { auto devMemTy = std::get<0>(iter.value()); std::int32_t off = iter.index(); auto packedMemTy = std::get<1>(iter.value()); - auto dataPtr = builder.create( - loc, cudaq::cc::PointerType::get(packedMemTy), ptr, + auto dataPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(packedMemTy), ptr, ArrayRef{off}); if (cudaq::cc::isDynamicType(devMemTy)) { auto r = constructDynamicInputValue(loc, builder, devMemTy, dataPtr, trailingData); - result = builder.create(loc, strTy, result, - r.first, off); + result = cudaq::cc::InsertValueOp::create(builder, loc, strTy, result, + r.first, off); trailingData = r.second; continue; } auto val = fetchInputValue(loc, builder, devMemTy, dataPtr); result = - builder.create(loc, strTy, result, val, off); + cudaq::cc::InsertValueOp::create(builder, loc, strTy, result, val, off); } return {result, trailingData}; } @@ -1116,8 +1129,8 @@ std::pair processInputValueImpl(Location loc, OpBuilder &builder, Value trailingData, Value ptrPackedStruct, Type inTy, std::int32_t off, 
cudaq::cc::StructType packedStructTy) { - auto packedPtr = builder.create( - loc, cudaq::cc::PointerType::get(packedStructTy.getMember(off)), + auto packedPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(packedStructTy.getMember(off)), ptrPackedStruct, ArrayRef{off}); if (cudaq::cc::isDynamicType(inTy)) { if constexpr (FromQPU) { @@ -1125,24 +1138,24 @@ processInputValueImpl(Location loc, OpBuilder &builder, Value trailingData, loc, builder, inTy, packedPtr, trailingData); if (isa(inTy)) { Value retVal = dynamo.first; - Value tmp = builder.create(loc, retVal.getType()); - builder.create(loc, retVal, tmp); + Value tmp = cudaq::cc::AllocaOp::create(builder, loc, retVal.getType()); + cudaq::cc::StoreOp::create(builder, loc, retVal, tmp); return {tmp, dynamo.second}; } if (isa(inTy)) { auto module = packedPtr->getParentOfType(); auto arrTy = cudaq::opt::factory::genHostStringType(module); Value retVal = dynamo.first; - Value tmp = builder.create(loc, arrTy); + Value tmp = cudaq::cc::AllocaOp::create(builder, loc, arrTy); auto ptrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - Value castTmp = builder.create(loc, ptrTy, tmp); - Value len = builder.create( - loc, builder.getI64Type(), dynamo.first); + Value castTmp = cudaq::cc::CastOp::create(builder, loc, ptrTy, tmp); + Value len = cudaq::cc::StdvecSizeOp::create( + builder, loc, builder.getI64Type(), dynamo.first); Value data = - builder.create(loc, ptrTy, dynamo.first); - builder.create(loc, TypeRange{}, - cudaq::runtime::bindingInitializeString, - ArrayRef{castTmp, data, len}); + cudaq::cc::StdvecDataOp::create(builder, loc, ptrTy, dynamo.first); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::bindingInitializeString, + ArrayRef{castTmp, data, len}); return {tmp, dynamo.second}; } return dynamo; diff --git a/lib/Optimizer/CAPI/CMakeLists.txt b/lib/Optimizer/CAPI/CMakeLists.txt index b17dfaeae15..61496606da1 100644 --- a/lib/Optimizer/CAPI/CMakeLists.txt 
+++ b/lib/Optimizer/CAPI/CMakeLists.txt @@ -6,13 +6,14 @@ # the terms of the Apache License 2.0 which accompanies this distribution. # # ============================================================================ # -add_mlir_public_c_api_library(CUDAQuantumMLIRCAPI +add_mlir_public_c_api_library(CUDAQuantumMLIRCAPI Dialects.cpp DEPENDS QuakeDialectIncGen - LINK_LIBS PRIVATE - QuakeDialect + LINK_LIBS PRIVATE + QuakeDialect CCDialect + MLIRRegisterAllDialects ) diff --git a/lib/Optimizer/CAPI/Dialects.cpp b/lib/Optimizer/CAPI/Dialects.cpp index 59a1210a694..8d2c482d465 100644 --- a/lib/Optimizer/CAPI/Dialects.cpp +++ b/lib/Optimizer/CAPI/Dialects.cpp @@ -9,6 +9,16 @@ #include "cudaq/Optimizer/CAPI/Dialects.h" #include "cudaq/Optimizer/Dialect/CC/CCDialect.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" +#include "mlir/InitAllDialects.h" MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(Quake, quake, quake::QuakeDialect) MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(CC, cc, cudaq::cc::CCDialect) + +extern "C" void cudaqRegisterAllDialects(MlirContext context) { + mlir::DialectRegistry registry; + registry.insert(); + mlir::registerAllDialects(registry); + auto *mlirContext = unwrap(context); + mlirContext->appendDialectRegistry(registry); + mlirContext->loadAllAvailableDialects(); +} diff --git a/lib/Optimizer/CodeGen/CCToLLVM.cpp b/lib/Optimizer/CodeGen/CCToLLVM.cpp index ce0e4b50bb5..8291c1ff63d 100644 --- a/lib/Optimizer/CodeGen/CCToLLVM.cpp +++ b/lib/Optimizer/CodeGen/CCToLLVM.cpp @@ -52,21 +52,30 @@ class AllocaOpPattern : public ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(cudaq::cc::AllocaOp alloc, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto operands = adaptor.getOperands(); - auto toTy = LLVM::LLVMPointerType::get([&]() -> Type { - if (auto arrTy = dyn_cast(alloc.getElementType()); - arrTy && arrTy.isUnknownSize()) - return getTypeConverter()->convertType(arrTy.getElementType()); - return 
getTypeConverter()->convertType(alloc.getElementType()); - }()); - if (operands.empty()) { - rewriter.replaceOpWithNewOp( - alloc, toTy, - ArrayRef{cudaq::opt::factory::genLlvmI32Constant( - alloc.getLoc(), rewriter, 1)}); - } else { - rewriter.replaceOpWithNewOp(alloc, toTy, operands); + Type type = getTypeConverter()->convertType(alloc.getElementType()); + Value size = adaptor.getSeqSize(); + if (!size) + size = + cudaq::opt::factory::genLlvmI32Constant(alloc.getLoc(), rewriter, 1); +#ifdef __APPLE__ + if (alloc.getElementType().isInteger(8) && adaptor.getSeqSize()) { + auto loc = alloc.getLoc(); + auto i64Ty = rewriter.getI64Type(); + Value sized = size; + if (sized.getType() != i64Ty) + sized = LLVM::ZExtOp::create(rewriter, loc, i64Ty, sized); + auto seven = LLVM::ConstantOp::create(rewriter, loc, i64Ty, + rewriter.getI64IntegerAttr(7)); + auto mask = LLVM::ConstantOp::create( + rewriter, loc, i64Ty, + rewriter.getI64IntegerAttr(static_cast(~7ULL))); + auto bumped = LLVM::AddOp::create(rewriter, loc, i64Ty, sized, seven); + size = LLVM::AndOp::create(rewriter, loc, i64Ty, bumped, mask); } +#endif + rewriter.replaceOpWithNewOp( + alloc, cudaq::opt::factory::getPointerType(rewriter.getContext()), type, + size); return success(); } }; @@ -86,30 +95,26 @@ class CallableClosureOpPattern resTy.push_back(getTypeConverter()->convertType(callable.getType(i))); auto *ctx = rewriter.getContext(); auto tupleTy = LLVM::LLVMStructType::getLiteral(ctx, resTy); - auto tuplePtrTy = cudaq::opt::factory::getPointerType(tupleTy); + auto tuplePtrTy = cudaq::opt::factory::getPointerType(ctx); auto structTy = dyn_cast(operands[0].getType()); if (!structTy) return failure(); auto one = DenseI64ArrayAttr::get(ctx, ArrayRef{1}); - auto extract = rewriter.create( - loc, structTy.getBody()[1], operands[0], one); - if (resTy.size() == 1 && resTy[0] != tupleTy) { - auto tupleVal = rewriter.create( - loc, cudaq::opt::factory::getPointerType(resTy[0]), extract); - 
rewriter.replaceOpWithNewOp(callable, tupleVal); - } else { - auto tuplePtr = - rewriter.create(loc, tuplePtrTy, extract); - auto tupleVal = rewriter.create(loc, tupleTy, tuplePtr); - SmallVector exposedVals; - for (std::int64_t i = 0, N = resTy.size(); i < N; ++i) { - auto offset = DenseI64ArrayAttr::get(ctx, ArrayRef{i}); - auto extract = rewriter.create( - loc, tupleTy.getBody()[i], tupleVal, offset); - exposedVals.push_back(extract); - } - rewriter.replaceOp(callable, exposedVals); + auto extract = LLVM::ExtractValueOp::create( + rewriter, loc, structTy.getBody()[1], operands[0], one); + auto tupleVal = LLVM::BitcastOp::create(rewriter, loc, tuplePtrTy, extract); + auto loadOp = LLVM::LoadOp::create(rewriter, loc, tupleTy, tupleVal); + // In LLVM 22, replaceOp strictly requires the same number of results. + // The LoadOp returns a single struct value; extract each field to match + // the multiple results of CallableClosureOp. + SmallVector results; + for (std::size_t i = 0, N = callable.getResults().size(); i < N; ++i) { + auto idx = DenseI64ArrayAttr::get( + ctx, ArrayRef{static_cast(i)}); + results.push_back(LLVM::ExtractValueOp::create(rewriter, loc, resTy[i], + loadOp.getResult(), idx)); } + rewriter.replaceOp(callable, results); return success(); } }; @@ -130,8 +135,8 @@ class CallableFuncOpPattern return failure(); auto *ctx = rewriter.getContext(); auto zero = DenseI64ArrayAttr::get(ctx, ArrayRef{0}); - auto extract = rewriter.create( - loc, structTy.getBody()[0], operands[0], zero); + auto extract = LLVM::ExtractValueOp::create( + rewriter, loc, structTy.getBody()[0], operands[0], zero); rewriter.replaceOpWithNewOp(callable, resTy, extract); return success(); } @@ -146,6 +151,7 @@ class CallCallableOpPattern matchAndRewrite(cudaq::cc::CallCallableOp call, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = call.getLoc(); + // Get the mlir::FunctionType signature from the callable auto calleeFuncTy = 
cast(call.getCallee().getType()) .getSignature(); @@ -154,52 +160,82 @@ class CallCallableOpPattern auto structTy = dyn_cast(operands[0].getType()); if (!structTy) return failure(); + + // Extract raw function pointer (first element of callable struct) auto ptr0Ty = structTy.getBody()[0]; auto zero = DenseI64ArrayAttr::get(ctx, ArrayRef{0}); auto rawFuncPtr = - rewriter.create(loc, ptr0Ty, operands[0], zero); + LLVM::ExtractValueOp::create(rewriter, loc, ptr0Ty, operands[0], zero); + + // Extract raw tuple pointer (second element of callable struct) auto ptr1Ty = structTy.getBody()[1]; auto one = DenseI64ArrayAttr::get(ctx, ArrayRef{1}); auto rawTuplePtr = - rewriter.create(loc, ptr1Ty, operands[0], one); - Type funcPtrTy = getTypeConverter()->convertType(calleeFuncTy); - auto funcPtr = rewriter.create(loc, funcPtrTy, rawFuncPtr); + LLVM::ExtractValueOp::create(rewriter, loc, ptr1Ty, operands[0], one); + + // Build the LLVM function type by converting the signature's types + // individually (since convertType on FunctionType returns ptr with opaque + // pointers) + SmallVector llvmArgTys; + for (Type argTy : calleeFuncTy.getInputs()) + llvmArgTys.push_back(getTypeConverter()->convertType(argTy)); + + Type llvmRetTy; + if (calleeFuncTy.getNumResults() == 0) + llvmRetTy = LLVM::LLVMVoidType::get(ctx); + else if (calleeFuncTy.getNumResults() == 1) + llvmRetTy = getTypeConverter()->convertType(calleeFuncTy.getResult(0)); + else { + // Multiple results - pack into a struct + SmallVector llvmResultTys; + for (Type resTy : calleeFuncTy.getResults()) + llvmResultTys.push_back(getTypeConverter()->convertType(resTy)); + llvmRetTy = LLVM::LLVMStructType::getLiteral(ctx, llvmResultTys); + } + auto llvmFuncTy = LLVM::LLVMFunctionType::get(llvmRetTy, llvmArgTys); + + // Check if tuple pointer is null (determines direct vs closure call) auto i64Ty = rewriter.getI64Type(); auto zeroI64 = cudaq::opt::factory::genLlvmI64Constant(loc, rewriter, 0); auto rawTupleVal = - 
rewriter.create(loc, i64Ty, rawTuplePtr); - auto isNullptr = rewriter.create(loc, LLVM::ICmpPredicate::eq, - rawTupleVal, zeroI64); + LLVM::PtrToIntOp::create(rewriter, loc, i64Ty, rawTuplePtr); + auto isNullptr = LLVM::ICmpOp::create( + rewriter, loc, LLVM::ICmpPredicate::eq, rawTupleVal, zeroI64); + + // Create control flow blocks auto *initBlock = rewriter.getInsertionBlock(); auto initPos = rewriter.getInsertionPoint(); auto *endBlock = rewriter.splitBlock(initBlock, initPos); auto *thenBlock = rewriter.createBlock(endBlock); auto *elseBlock = rewriter.createBlock(endBlock); + SmallVector resultTy; - auto llvmFuncTy = cast( - cast(funcPtrTy).getElementType()); if (!isa(llvmFuncTy.getReturnType())) { resultTy.push_back(llvmFuncTy.getReturnType()); endBlock->addArgument(resultTy[0], loc); } + rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, isNullptr, thenBlock, elseBlock); + LLVM::CondBrOp::create(rewriter, loc, isNullptr, thenBlock, elseBlock); + + // Then block: tuple is null, call function directly with remaining operands rewriter.setInsertionPointToEnd(thenBlock); - SmallVector arguments1 = {funcPtr}; - arguments1.append(operands.begin() + 1, operands.end()); - auto call1 = rewriter.create(loc, resultTy, arguments1); - rewriter.create(loc, call1.getResults(), endBlock); + SmallVector calleeOps1 = {rawFuncPtr}; + calleeOps1.append(operands.begin() + 1, operands.end()); + auto call1 = LLVM::CallOp::create(rewriter, loc, llvmFuncTy, calleeOps1); + LLVM::BrOp::create(rewriter, loc, call1.getResults(), endBlock); + + // Else block: tuple is not null, call with callable struct as first arg rewriter.setInsertionPointToEnd(elseBlock); - SmallVector argTys(operands.getTypes().begin(), - operands.getTypes().end()); - auto adjustedFuncTy = - LLVM::LLVMFunctionType::get(llvmFuncTy.getReturnType(), argTys); - auto adjustedFuncPtr = rewriter.create( - loc, cudaq::opt::factory::getPointerType(adjustedFuncTy), funcPtr); - SmallVector arguments2 = 
{adjustedFuncPtr}; - arguments2.append(operands.begin(), operands.end()); - auto call2 = rewriter.create(loc, resultTy, arguments2); - rewriter.create(loc, call2.getResults(), endBlock); + SmallVector calleeOps2 = {rawFuncPtr}; + calleeOps2.append(operands.begin(), operands.end()); + SmallVector closureArgTys; + closureArgTys.push_back(operands[0].getType()); + closureArgTys.append(llvmArgTys.begin(), llvmArgTys.end()); + auto closureFuncTy = LLVM::LLVMFunctionType::get(llvmRetTy, closureArgTys); + auto call2 = LLVM::CallOp::create(rewriter, loc, closureFuncTy, calleeOps2); + LLVM::BrOp::create(rewriter, loc, call2.getResults(), endBlock); + rewriter.replaceOp(call, endBlock->getArguments()); return success(); } @@ -214,13 +250,29 @@ class CallIndirectCallableOpPattern matchAndRewrite(cudaq::cc::CallIndirectCallableOp call, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = call.getLoc(); + auto *ctx = rewriter.getContext(); auto parentModule = call->getParentOfType(); - auto funcPtrTy = getTypeConverter()->convertType( - cast(call.getCallee().getType()) - .getSignature()); - auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getI8Type()); - auto funcTy = cast( - cast(funcPtrTy).getElementType()); + auto indirectTy = + cast(call.getCallee().getType()); + mlir::FunctionType calleeFuncTy = indirectTy.getSignature(); + auto funcPtrTy = getTypeConverter()->convertType(calleeFuncTy); + auto ptrTy = cudaq::opt::factory::getPointerType(ctx); + SmallVector llvmArgTys; + for (Type argTy : calleeFuncTy.getInputs()) + llvmArgTys.push_back(getTypeConverter()->convertType(argTy)); + Type llvmRetTy; + if (calleeFuncTy.getNumResults() == 0) + llvmRetTy = LLVM::LLVMVoidType::get(ctx); + else if (calleeFuncTy.getNumResults() == 1) + llvmRetTy = getTypeConverter()->convertType(calleeFuncTy.getResult(0)); + else { + SmallVector llvmResultTys; + for (Type resTy : calleeFuncTy.getResults()) + 
llvmResultTys.push_back(getTypeConverter()->convertType(resTy)); + llvmRetTy = LLVM::LLVMStructType::getLiteral(ctx, llvmResultTys); + } + LLVM::LLVMFunctionType funcTy = + LLVM::LLVMFunctionType::get(llvmRetTy, llvmArgTys); auto i64Ty = rewriter.getI64Type(); // intptr_t FlatSymbolRefAttr funSymbol = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::runtime::getLinkableKernelDeviceSide, ptrTy, {i64Ty}, @@ -232,19 +284,17 @@ class CallIndirectCallableOpPattern // device-side functions are located in the same address space as well. None // of these functions should be expected to reside on remote hardware. // Therefore, this will likely only be useful in a simulation target. - auto lookee = rewriter.create( - loc, ptrTy, funSymbol, ValueRange{adaptor.getCallee()}); + auto lookee = LLVM::CallOp::create(rewriter, loc, ptrTy, funSymbol, + ValueRange{adaptor.getCallee()}); auto lookup = - rewriter.create(loc, funcPtrTy, lookee.getResult()); + LLVM::BitcastOp::create(rewriter, loc, funcPtrTy, lookee.getResult()); - // Call the function that was just found in the map. + // Use create() so operandSegmentSizes is set (LLVM 22 + // AttrSizedOperandSegments). 
SmallVector args = {lookup.getResult()}; args.append(adaptor.getArgs().begin(), adaptor.getArgs().end()); - if (isa(funcTy.getReturnType())) - rewriter.replaceOpWithNewOp(call, std::nullopt, args); - else - rewriter.replaceOpWithNewOp(call, funcTy.getReturnType(), - args); + auto newCall = LLVM::CallOp::create(rewriter, loc, funcTy, args); + rewriter.replaceOp(call, newCall.getResults()); return success(); } }; @@ -329,20 +379,27 @@ class ComputePtrOpPattern LogicalResult matchAndRewrite(cudaq::cc::ComputePtrOp cpOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto operands = adaptor.getOperands(); - auto toTy = getTypeConverter()->convertType(cpOp.getType()); + // Get the CC element type before conversion + auto ccPtrTy = cast(cpOp.getBase().getType()); + Type ccEleTy = ccPtrTy.getElementType(); + // The first operand is the base pointer. - Value base = operands[0]; if (cpOp.llvmNormalForm()) { // In this case, the `cc.compute_ptr` has already been converted such that // it corresponds 1:1 with the C-like semantics of LLVM's getelementptr // operation. Specifically, a pointer to a scalar type is overloaded to // possibly be the same as a pointer to an array with unknown bound. // All operands except the first are indices. + // Extract inner element type from CC array type before conversion + ccEleTy = cast(ccEleTy).getElementType(); auto newOpnds = interleaveConstantsAndOperands( - operands.drop_front(), cpOp.getRawConstantIndices()); + adaptor.getDynamicIndices(), cpOp.getRawConstantIndices()); + // Convert to LLVM type after extracting the element type + Type eleTy = getTypeConverter()->convertType(ccEleTy); // Rewrite the ComputePtrOp as a LLVM::GEPOp. 
- rewriter.replaceOpWithNewOp(cpOp, toTy, base, newOpnds); + rewriter.replaceOpWithNewOp( + cpOp, cudaq::opt::factory::getPointerType(rewriter.getContext()), + eleTy, adaptor.getBase(), newOpnds); } else { // If the `cc.compute_ptr` operation has a base argument that is not in // LLVM normal form, we implicitly assume that pointer's element type @@ -354,9 +411,13 @@ class ComputePtrOpPattern SmallVector constIndices = {0}; constIndices.append(cpOp.getRawConstantIndices().begin(), cpOp.getRawConstantIndices().end()); - auto newOpnds = - interleaveConstantsAndOperands(operands.drop_front(), constIndices); - rewriter.replaceOpWithNewOp(cpOp, toTy, base, newOpnds); + auto newOpnds = interleaveConstantsAndOperands( + adaptor.getDynamicIndices(), constIndices); + // Convert to LLVM type + Type eleTy = getTypeConverter()->convertType(ccEleTy); + rewriter.replaceOpWithNewOp( + cpOp, cudaq::opt::factory::getPointerType(rewriter.getContext()), + eleTy, adaptor.getBase(), newOpnds); } return success(); } @@ -430,9 +491,9 @@ class GlobalOpPattern : public ConvertOpToLLVMPattern { auto name = global.getSymName(); bool isReadOnly = global.getConstant(); Attribute initializer = global.getValue().value_or(Attribute{}); - rewriter.create(loc, type, isReadOnly, - LLVM::Linkage::Private, name, - initializer, /*alignment=*/0); + mlir::LLVM::GlobalOp::create(rewriter, loc, type, isReadOnly, + LLVM::Linkage::Private, name, initializer, + /*alignment=*/0); rewriter.eraseOp(global); return success(); } @@ -471,41 +532,33 @@ class InstantiateCallableOpPattern Value tmp; auto tupleArgTy = cudaq::opt::lambdaAsPairOfPointers(ctx); if (callable.getNoCapture()) { - auto zero = cudaq::opt::factory::genLlvmI64Constant(loc, rewriter, 0); - tmp = - rewriter.create(loc, tupleArgTy.getBody()[1], zero); + Value zero = cudaq::opt::factory::genLlvmI64Constant(loc, rewriter, 0); + tmp = LLVM::IntToPtrOp::create(rewriter, loc, tupleArgTy.getBody()[1], + zero); } else { - Value tupleVal = 
rewriter.create(loc, tupleTy); + Value tupleVal = LLVM::UndefOp::create(rewriter, loc, tupleTy); std::int64_t offsetVal = 0; for (auto op : operands) { auto offset = DenseI64ArrayAttr::get(ctx, ArrayRef{offsetVal}); - tupleVal = rewriter.create(loc, tupleTy, tupleVal, - op, offset); + tupleVal = LLVM::InsertValueOp::create(rewriter, loc, tupleTy, tupleVal, + op, offset); offsetVal++; } - auto tuplePtrTy = cudaq::opt::factory::getPointerType(tupleTy); - tmp = cudaq::opt::factory::createLLVMTemporary(loc, rewriter, tuplePtrTy); - rewriter.create(loc, tupleVal, tmp); + tmp = cudaq::opt::factory::createLLVMTemporary(loc, rewriter, tupleTy); + LLVM::StoreOp::create(rewriter, loc, tupleVal, tmp); } - Value tupleArg = rewriter.create(loc, tupleArgTy); - auto module = callable->getParentOfType(); - auto *calledFuncOp = module.lookupSymbol(callable.getCallee()); - auto sigTy = [&]() -> Type { - if (auto calledFunc = dyn_cast(calledFuncOp)) - return getTypeConverter()->convertType(calledFunc.getFunctionType()); - return cudaq::opt::factory::getPointerType( - cast(calledFuncOp).getFunctionType()); - }(); - auto tramp = rewriter.create( - loc, sigTy, callable.getCallee().cast()); + Value tupleArg = LLVM::UndefOp::create(rewriter, loc, tupleArgTy); + auto sigTy = cudaq::opt::factory::getPointerType(ctx); + auto tramp = LLVM::AddressOfOp::create( + rewriter, loc, sigTy, cast(callable.getCallee())); auto trampoline = - rewriter.create(loc, tupleArgTy.getBody()[0], tramp); + LLVM::BitcastOp::create(rewriter, loc, tupleArgTy.getBody()[0], tramp); auto zeroA = DenseI64ArrayAttr::get(ctx, ArrayRef{0}); - tupleArg = rewriter.create(loc, tupleArgTy, tupleArg, - trampoline, zeroA); + tupleArg = LLVM::InsertValueOp::create(rewriter, loc, tupleArgTy, tupleArg, + trampoline, zeroA); auto castTmp = - rewriter.create(loc, tupleArgTy.getBody()[1], tmp); + LLVM::BitcastOp::create(rewriter, loc, tupleArgTy.getBody()[1], tmp); rewriter.replaceOpWithNewOp( callable, tupleArgTy, tupleArg, castTmp, 
DenseI64ArrayAttr::get(ctx, ArrayRef{1})); @@ -532,8 +585,6 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern { public: using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - // Use the GEP approach for now. LLVM is planning to remove support for this - // at some point. See: https://github.com/llvm/llvm-project/issues/71507 LogicalResult matchAndRewrite(cudaq::cc::SizeOfOp sizeOfOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { @@ -541,19 +592,17 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern { auto resultTy = sizeOfOp.getType(); if (quake::isQuakeType(inputTy) || cudaq::cc::isDynamicallySizedType(inputTy)) { + // Types that cannot be reified produce the poison op. rewriter.replaceOpWithNewOp(sizeOfOp, resultTy); return success(); } auto loc = sizeOfOp.getLoc(); - // TODO: replace this with some target-specific memory layout computation - // when we upgrade to a newer MLIR. - auto zero = rewriter.create(loc, 0, 64); - auto ptrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(inputTy)); - auto nullCast = rewriter.create(loc, ptrTy, zero); - Value nextPtr = rewriter.create( - loc, ptrTy, nullCast, ArrayRef{1}); - rewriter.replaceOpWithNewOp(sizeOfOp, resultTy, nextPtr); + // We rely on MLIR here, they are using the GEP approach for now. LLVM is + // planning to remove support for this at some point. + // See: https://github.com/llvm/llvm-project/issues/71507 and + // https://github.com/llvm/llvm-project/issues/96047 + auto sizeOp = getSizeInBytes(loc, inputTy, rewriter); + rewriter.replaceOp(sizeOfOp, sizeOp); return success(); } }; @@ -575,11 +624,11 @@ class OffsetOfOpPattern : public ConvertOpToLLVMPattern { auto loc = offsetOp.getLoc(); // TODO: replace this with some target-specific memory layout computation // when we upgrade to a newer MLIR. 
- auto zero = rewriter.create(loc, 0, 64); + auto zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); auto ptrTy = cudaq::cc::PointerType::get(inputTy); - auto nul = rewriter.create(loc, ptrTy, zero); + auto nul = cudaq::cc::CastOp::create(rewriter, loc, ptrTy, zero); Value nextPtr = - rewriter.create(loc, ptrTy, nul, args); + cudaq::cc::ComputePtrOp::create(rewriter, loc, ptrTy, nul, args); rewriter.replaceOpWithNewOp(offsetOp, resultTy, nextPtr); return success(); } @@ -600,8 +649,8 @@ class StdvecDataOpPattern auto structTy = dyn_cast(operands[0].getType()); if (!structTy) return data.emitError("stdvec_data must have a struct as argument."); - auto extract = rewriter.create( - data.getLoc(), structTy.getBody()[0], operands[0], zero); + auto extract = LLVM::ExtractValueOp::create( + rewriter, data.getLoc(), structTy.getBody()[0], operands[0], zero); rewriter.replaceOpWithNewOp(data, resTy, extract); return success(); } @@ -620,26 +669,26 @@ class StdvecInitOpPattern auto ctx = init.getContext(); auto zero = DenseI64ArrayAttr::get(ctx, ArrayRef{0}); auto loc = init.getLoc(); - Value val = rewriter.create(loc, resTy); + Value val = LLVM::UndefOp::create(rewriter, loc, resTy); auto structTy = dyn_cast(resTy); if (!structTy) return init.emitError("stdvec_init must have a struct as argument."); - auto cast = rewriter.create(loc, structTy.getBody()[0], - operands[0]); - val = rewriter.create(loc, val, cast, zero); + auto yolo = LLVM::BitcastOp::create(rewriter, loc, structTy.getBody()[0], + operands[0]); + val = LLVM::InsertValueOp::create(rewriter, loc, val, yolo, zero); auto one = DenseI64ArrayAttr::get(ctx, ArrayRef{1}); if (operands.size() == 2) { rewriter.replaceOpWithNewOp(init, val, operands[1], one); } else { std::int64_t arrSize = - llvm::cast( - llvm::cast(operands[0].getType()) + cast( + cast(init.getBuffer().getType()) .getElementType()) - .getNumElements(); + .getSize(); auto i64Ty = rewriter.getI64Type(); - Value len = rewriter.create( - loc, 
i64Ty, IntegerAttr::get(i64Ty, arrSize)); + Value len = LLVM::ConstantOp::create(rewriter, loc, i64Ty, + IntegerAttr::get(i64Ty, arrSize)); rewriter.replaceOpWithNewOp(init, val, len, one); } return success(); @@ -693,7 +742,7 @@ class CreateStringLiteralOpPattern // Get the string address rewriter.replaceOpWithNewOp( stringLiteralOp, - cudaq::opt::factory::getPointerType(slGlobal.getType()), + cudaq::opt::factory::getPointerType(rewriter.getContext()), slGlobal.getSymName()); return success(); @@ -751,8 +800,19 @@ class VarargCallPattern SmallVector types; for (auto ty : vcall.getResultTypes()) types.push_back(getTypeConverter()->convertType(ty)); - rewriter.replaceOpWithNewOp(vcall, types, vcall.getCallee(), - adaptor.getArgs()); + + // For vararg calls, we need to set the var_callee_type attribute. Look up + // the callee function to get its type. + auto calleeName = vcall.getCalleeAttr(); + TypeAttr varCalleeType; + if (auto func = SymbolTable::lookupNearestSymbolFrom( + vcall, calleeName)) + varCalleeType = TypeAttr::get(func.getFunctionType()); + + auto callOp = rewriter.replaceOpWithNewOp( + vcall, types, calleeName, adaptor.getArgs()); + if (varCalleeType) + callOp.setVarCalleeTypeAttr(varCalleeType); return success(); } }; diff --git a/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp b/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp index 9dc2b679ea3..d484ab866c9 100644 --- a/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp +++ b/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp @@ -6,13 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ +#include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/CodeGen/CCToLLVM.h" -#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h" #include "cudaq/Optimizer/CodeGen/Passes.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" diff --git a/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp b/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp index 9cb7869cd66..7915d25286f 100644 --- a/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp +++ b/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp @@ -11,23 +11,19 @@ #include "cudaq/Optimizer/CodeGen/CudaqFunctionNames.h" #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/QuakeToExecMgr.h" -#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" -#define DEBUG_TYPE "convert-to-cc" - namespace cudaq::opt { #define GEN_PASS_DEF_QUAKETOCCPREP #define GEN_PASS_DEF_QUAKETOCC #include "cudaq/Optimizer/CodeGen/Passes.h.inc" } // namespace cudaq::opt +#define DEBUG_TYPE "convert-to-cc" + using namespace mlir; namespace { @@ -101,7 +97,7 @@ struct QuakeToCCPrepPass return; } - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "Module after prep:\n"; op->dump()); } diff --git a/lib/Optimizer/CodeGen/ConvertToQIR.cpp 
b/lib/Optimizer/CodeGen/ConvertToQIR.cpp index 686eb82d806..e8442ee0eaa 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIR.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIR.cpp @@ -16,8 +16,6 @@ #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h" #include "cudaq/Optimizer/CodeGen/QuakeToLLVM.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" @@ -31,11 +29,16 @@ #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" -#include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +namespace cudaq::opt { +#define GEN_PASS_DEF_CONVERTTOQIR +#define GEN_PASS_DEF_LOWERTOCG +#include "cudaq/Optimizer/CodeGen/Passes.h.inc" +} // namespace cudaq::opt + #define DEBUG_TYPE "convert-to-qir" /** @@ -45,12 +48,6 @@ version 0.1. 
*/ -namespace cudaq::opt { -#define GEN_PASS_DEF_CONVERTTOQIR -#define GEN_PASS_DEF_LOWERTOCG -#include "cudaq/Optimizer/CodeGen/Passes.h.inc" -} // namespace cudaq::opt - using namespace mlir; #include "PeepholePatterns.inc" @@ -61,7 +58,7 @@ static LogicalResult fuseSubgraphPatterns(MLIRContext *ctx, ModuleOp module) { RewritePatternSet patterns(ctx); cudaq::codegen::populateQuakeToCodegenPatterns(patterns); LLVM_DEBUG(llvm::dbgs() << "Before codegen dialect:\n"; module.dump()); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) return failure(); LLVM_DEBUG(llvm::dbgs() << "After codegen dialect:\n"; module.dump()); return success(); @@ -120,18 +117,19 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase { auto v = [&]() -> Value { auto val = constantValues[idx]; if (auto fTy = dyn_cast(eleTy)) - return builder.create( - loc, cast(val).getValue(), fTy); + return arith::ConstantFloatOp::create( + builder, loc, fTy, cast(val).getValue()); if (auto iTy = dyn_cast(eleTy)) - return builder.create( - loc, cast(val).getInt(), iTy); + return arith::ConstantIntOp::create( + builder, loc, iTy, cast(val).getInt()); auto cTy = cast(eleTy); - return builder.create(loc, cTy, - cast(val)); + return complex::ConstantOp::create(builder, loc, cTy, + cast(val)); }(); - Value arrWithOffset = builder.create( - loc, ptrTy, buffer, ArrayRef{idx}); - builder.create(loc, v, arrWithOffset); + Value arrWithOffset = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrTy, buffer, + ArrayRef{idx}); + cudaq::cc::StoreOp::create(builder, loc, v, arrWithOffset); } cleanUps.push_back(user); } @@ -195,10 +193,12 @@ class ConvertToQIR : public cudaq::opt::impl::ConvertToQIRBase { } // namespace void cudaq::opt::initializeTypeConversions(LLVMTypeConverter &typeConverter) { - typeConverter.addConversion( - [](quake::VeqType type) { return getArrayType(type.getContext()); }); - 
typeConverter.addConversion( - [](quake::RefType type) { return getQubitType(type.getContext()); }); + typeConverter.addConversion([](quake::VeqType type) { + return cg::getLLVMArrayType(type.getContext()); + }); + typeConverter.addConversion([](quake::RefType type) { + return cg::getLLVMQubitType(type.getContext()); + }); typeConverter.addConversion([&](quake::StruqType type) { SmallVector mems; for (auto m : type.getMembers()) diff --git a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp index 310de98707b..b369d6d8af9 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp @@ -7,28 +7,20 @@ ******************************************************************************/ #include "CodeGenOps.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/Builder/Runtime.h" -#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h" #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/QIRAttributeNames.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h" #include "cudaq/Optimizer/CodeGen/QuakeToExecMgr.h" -#include "cudaq/Optimizer/Dialect/CC/CCDialect.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" // for GlobalizeArrayValues #include "nlohmann/json.hpp" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassOptions.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" #define DEBUG_TYPE "convert-to-qir-api" @@ -100,10 +92,11 @@ static Value 
createGlobalCString(Operation *op, Location loc, cudaq::IRBuilder irb(rewriter.getContext()); auto mod = op->getParentOfType(); auto nameObj = irb.genCStringLiteralAppendNul(loc, mod, regName); - Value nameVal = rewriter.create( - loc, cudaq::cc::PointerType::get(nameObj.getType()), nameObj.getName()); + Value nameVal = cudaq::cc::AddressOfOp::create( + rewriter, loc, cudaq::cc::PointerType::get(nameObj.getType()), + nameObj.getName()); auto cstrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); - return rewriter.create(loc, cstrTy, nameVal); + return cudaq::cc::CastOp::create(rewriter, loc, cstrTy, nameVal); } /// Use modifier class classes to specialize the QIR API to a particular flavor @@ -241,22 +234,22 @@ struct AllocaOpToCallsRewrite : public OpConversionPattern { Value sizeOperand; auto loc = alloc.getLoc(); if (adaptor.getOperands().empty()) { - auto type = alloc.getType().cast(); + auto type = cast(alloc.getType()); if (!type.hasSpecifiedSize()) return failure(); auto constantSize = type.getSize(); sizeOperand = - rewriter.create(loc, constantSize, 64); + arith::ConstantIntOp::create(rewriter, loc, constantSize, 64); } else { sizeOperand = adaptor.getOperands().front(); auto sizeOpTy = cast(sizeOperand.getType()); if (sizeOpTy.getWidth() < 64) - sizeOperand = rewriter.create( - loc, rewriter.getI64Type(), sizeOperand, + sizeOperand = cudaq::cc::CastOp::create( + rewriter, loc, rewriter.getI64Type(), sizeOperand, cudaq::cc::CastOpMode::Unsigned); else if (sizeOpTy.getWidth() > 64) - sizeOperand = rewriter.create( - loc, rewriter.getI64Type(), sizeOperand); + sizeOperand = cudaq::cc::CastOp::create( + rewriter, loc, rewriter.getI64Type(), sizeOperand); } // Replace the AllocaOp with the QIR call. @@ -300,8 +293,9 @@ struct NullCableOpToCallsRewrite // return type. 
auto loc = nullcable.getLoc(); quake::CableType type = nullcable.getType(); - auto width = type.getSize(); - Value sizeOperand = rewriter.create(loc, width, 64); + auto constantSize = type.getSize(); + Value sizeOperand = + arith::ConstantIntOp::create(rewriter, loc, constantSize, 64); // Replace the NullCableOp with the QIR call. rewriter.replaceOpWithNewOp( @@ -337,7 +331,7 @@ struct AllocaOpToIntRewrite : public OpConversionPattern { // the startingIndex as the qubit value. Voila! if (auto resultType = dyn_cast(ty)) { Value index = - rewriter.create(loc, startingOffset, 64); + arith::ConstantIntOp::create(rewriter, loc, startingOffset, 64); auto qubitTy = M::getQubitType(rewriter.getContext()); rewriter.replaceOpWithNewOp(alloc, qubitTy, index); return success(); @@ -358,8 +352,8 @@ struct AllocaOpToIntRewrite : public OpConversionPattern { SmallVector data; for (std::int64_t i = 0; i < veqSize; ++i) data.emplace_back(startingOffset + i); - auto arr = rewriter.create( - loc, arrTy, rewriter.getI64ArrayAttr(data)); + auto arr = cudaq::cc::ConstantArrayOp::create( + rewriter, loc, arrTy, rewriter.getI64ArrayAttr(data)); Type qirArrTy = M::getArrayType(rewriter.getContext()); rewriter.replaceOpWithNewOp( alloc, qirArrTy, arr); @@ -387,7 +381,7 @@ struct NullWireOpToIntRewrite : public OpConversionPattern { // In this case this is allocating a single qubit, so we can just substitute // the startingIndex as the qubit value. Voila! 
Value index = - rewriter.create(loc, startingOffset, 64); + arith::ConstantIntOp::create(rewriter, loc, startingOffset, 64); auto qubitTy = M::getQubitType(rewriter.getContext()); rewriter.replaceOpWithNewOp(nullwire, qubitTy, index); return success(); @@ -426,8 +420,8 @@ struct NullCableOpToIntRewrite SmallVector data; for (std::int64_t i = 0; i < cableSize; ++i) data.emplace_back(startingOffset + i); - auto arr = rewriter.create( - loc, arrTy, rewriter.getI64ArrayAttr(data)); + auto arr = cudaq::cc::ConstantArrayOp::create( + rewriter, loc, arrTy, rewriter.getI64ArrayAttr(data)); Type qirArrTy = M::getArrayType(rewriter.getContext()); rewriter.replaceOpWithNewOp( nullcable, qirArrTy, arr); @@ -435,6 +429,15 @@ struct NullCableOpToIntRewrite } }; +template +Type getInitialType(OP op, unsigned off) { + ArrayAttr initialArgs = + op->template getAttrOfType(InitialArgTypesAttrName); + if (!initialArgs) + return {}; + return cast(initialArgs[off]).getValue(); +} + template struct ApplyNoiseOpRewrite : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -444,6 +447,7 @@ struct ApplyNoiseOpRewrite : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = noise.getLoc(); + const unsigned paramOffset = noise.getKey() ? 1 : 0; if (!noise.getNoiseFunc()) { // This is the key-based variant. Call the generalized version of the // apply_kraus_channel helper function. 
Let it do all the conversions into @@ -451,50 +455,51 @@ struct ApplyNoiseOpRewrite : public OpConversionPattern { SmallVector args; const bool pushASpan = adaptor.getParameters().size() == 1 && - isa(adaptor.getParameters()[0].getType()); + isa(getInitialType(noise, paramOffset)); const bool usingDouble = [&]() { if (adaptor.getParameters().empty()) return true; - auto param0 = adaptor.getParameters()[0]; + Type param0Ty = getInitialType(noise, paramOffset); if (pushASpan) - return cast(param0.getType()) - .getElementType() == rewriter.getF64Type(); - return cast(param0.getType()) - .getElementType() == rewriter.getF64Type(); + return cast(param0Ty).getElementType() == + rewriter.getF64Type(); + return cast(param0Ty).getElementType() == + rewriter.getF64Type(); }(); if (usingDouble) { auto code = static_cast( cudaq::opt::KrausChannelDataKind::DoubleKind); - args.push_back(rewriter.create(loc, code, 64)); + args.push_back(arith::ConstantIntOp::create(rewriter, loc, code, 64)); } else { auto code = static_cast( cudaq::opt::KrausChannelDataKind::FloatKind); - args.push_back(rewriter.create(loc, code, 64)); + args.push_back(arith::ConstantIntOp::create(rewriter, loc, code, 64)); } args.push_back(adaptor.getKey()); if (pushASpan) { - args.push_back(rewriter.create(loc, 1, 64)); - args.push_back(rewriter.create(loc, 0, 64)); + args.push_back(arith::ConstantIntOp::create(rewriter, loc, 1, 64)); + args.push_back(arith::ConstantIntOp::create(rewriter, loc, 0, 64)); } else { - args.push_back(rewriter.create(loc, 0, 64)); + args.push_back(arith::ConstantIntOp::create(rewriter, loc, 0, 64)); auto numParams = std::distance(adaptor.getParameters().begin(), adaptor.getParameters().end()); args.push_back( - rewriter.create(loc, numParams, 64)); + arith::ConstantIntOp::create(rewriter, loc, numParams, 64)); } auto numTargets = std::distance(adaptor.getQubits().begin(), adaptor.getQubits().end()); args.push_back( - rewriter.create(loc, numTargets, 64)); + 
arith::ConstantIntOp::create(rewriter, loc, numTargets, 64)); if (pushASpan) { Value stdvec = adaptor.getParameters()[0]; - auto stdvecTy = cast(stdvec.getType()); + auto stdvecTy = + cast(getInitialType(noise, paramOffset)); auto dataTy = cudaq::cc::PointerType::get( cudaq::cc::ArrayType::get(stdvecTy.getElementType())); args.push_back( - rewriter.create(loc, dataTy, stdvec)); - args.push_back(rewriter.create( - loc, rewriter.getI64Type(), stdvec)); + cudaq::cc::StdvecDataOp::create(rewriter, loc, dataTy, stdvec)); + args.push_back(cudaq::cc::StdvecSizeOp::create( + rewriter, loc, rewriter.getI64Type(), stdvec)); } else { args.append(adaptor.getParameters().begin(), adaptor.getParameters().end()); @@ -529,35 +534,40 @@ struct ApplyNoiseOpRewrite : public OpConversionPattern { // already the case, we just append the operands. SmallVector args; if (adaptor.getParameters().size() == 1 && - isa(adaptor.getParameters()[0].getType())) { + isa(getInitialType(noise, paramOffset))) { Value svp = adaptor.getParameters()[0]; // Convert the device-side span back to a host-side vector so that C++ // doesn't crash. 
- auto stdvecTy = cast(svp.getType()); + auto stdvecTy = + cast(getInitialType(noise, paramOffset)); auto *ctx = rewriter.getContext(); auto ptrTy = cudaq::cc::PointerType::get(stdvecTy.getElementType()); auto ptrArrTy = cudaq::cc::PointerType::get( cudaq::cc::ArrayType::get(stdvecTy.getElementType())); auto hostVecTy = cudaq::cc::ArrayType::get(ctx, ptrTy, 3); - auto hostVec = rewriter.create(loc, hostVecTy); + auto hostVec = cudaq::cc::AllocaOp::create(rewriter, loc, hostVecTy); Value startPtr = - rewriter.create(loc, ptrArrTy, svp); + cudaq::cc::StdvecDataOp::create(rewriter, loc, ptrArrTy, svp); auto i64Ty = rewriter.getI64Type(); - Value len = rewriter.create(loc, i64Ty, svp); - Value endPtr = rewriter.create( - loc, ptrTy, startPtr, ArrayRef{len}); + Value len = cudaq::cc::StdvecSizeOp::create(rewriter, loc, i64Ty, svp); + Value endPtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrTy, startPtr, + ArrayRef{len}); Value castStartPtr = - rewriter.create(loc, ptrTy, startPtr); + cudaq::cc::CastOp::create(rewriter, loc, ptrTy, startPtr); auto ptrPtrTy = cudaq::cc::PointerType::get(ptrTy); - Value ptr0 = rewriter.create( - loc, ptrPtrTy, hostVec, ArrayRef{0}); - rewriter.create(loc, castStartPtr, ptr0); - Value ptr1 = rewriter.create( - loc, ptrPtrTy, hostVec, ArrayRef{1}); - rewriter.create(loc, endPtr, ptr1); - Value ptr2 = rewriter.create( - loc, ptrPtrTy, hostVec, ArrayRef{2}); - rewriter.create(loc, endPtr, ptr2); + Value ptr0 = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrPtrTy, hostVec, + ArrayRef{0}); + cudaq::cc::StoreOp::create(rewriter, loc, castStartPtr, ptr0); + Value ptr1 = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrPtrTy, hostVec, + ArrayRef{1}); + cudaq::cc::StoreOp::create(rewriter, loc, endPtr, ptr1); + Value ptr2 = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrPtrTy, hostVec, + ArrayRef{2}); + cudaq::cc::StoreOp::create(rewriter, loc, endPtr, ptr2); // N.B. 
This pointer must be treated as const by the C++ side and should // never have move semantics! @@ -581,11 +591,15 @@ struct ApplyNoiseOpRewrite : public OpConversionPattern { SmallVector qubits; SmallVector converted; Type qirArrTy = M::getArrayType(rewriter.getContext()); - for (auto [qb, oa] : llvm::zip(adaptor.getQubits(), noise.getQubits())) { - if ((oa && isa(oa.getType())) || - (!oa && (qb.getType() == qirArrTy))) { - auto svec = rewriter.create( - loc, qirArrTy, cudaq::opt::QISConvertArrayToStdvec, ValueRange{qb}); + SmallVector origQubitTys; + for (auto [i, _] : llvm::enumerate(noise.getQubits())) + origQubitTys.push_back(getInitialType( + noise, paramOffset + adaptor.getParameters().size() + i)); + for (auto [qb, oa] : llvm::zip(adaptor.getQubits(), origQubitTys)) { + if (isa(oa)) { + auto svec = func::CallOp::create(rewriter, loc, qirArrTy, + cudaq::opt::QISConvertArrayToStdvec, + ValueRange{qb}); qb = svec.getResult(0); converted.push_back(qb); } @@ -595,8 +609,8 @@ struct ApplyNoiseOpRewrite : public OpConversionPattern { rewriter.replaceOpWithNewOp(noise, TypeRange{}, *noise.getNoiseFunc(), args); for (auto v : converted) - rewriter.create( - loc, TypeRange{}, cudaq::opt::QISFreeConvertedStdvec, ValueRange{v}); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QISFreeConvertedStdvec, ValueRange{v}); return success(); } }; @@ -618,39 +632,42 @@ struct MaterializeConstantArrayOpRewrite } }; +/// This helper base class provides shared functionality to convert single +/// qubits (`!quake.ref`) to vectors of qubits (`!quake.veq`) to satisfy the QIR +/// API. 
template struct QubitHelperConversionPattern : public OpConversionPattern { using Base = OpConversionPattern; using Base::Base; Value wrapQubitAsArray(Location loc, ConversionPatternRewriter &rewriter, - Value val) const { - Type qubitTy = M::getQubitType(rewriter.getContext()); - if (val.getType() != qubitTy) + Value val, Type origTy) const { + if (isa(origTy)) return val; // Create a QIR array container of 1 element. auto ptrTy = cudaq::cc::PointerType::get(rewriter.getNoneType()); - Value sizeofPtrVal = - rewriter.create(loc, rewriter.getI32Type(), ptrTy); - Value one = rewriter.create(loc, 1, 64); + Value sizeofPtrVal = cudaq::cc::SizeOfOp::create( + rewriter, loc, rewriter.getI32Type(), ptrTy); + Value one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); Type arrayTy = M::getArrayType(rewriter.getContext()); - auto newArr = rewriter.create( - loc, TypeRange{arrayTy}, cudaq::opt::QIRArrayCreateArray, - ArrayRef{sizeofPtrVal, one}); + auto newArr = func::CallOp::create(rewriter, loc, TypeRange{arrayTy}, + cudaq::opt::QIRArrayCreateArray, + ArrayRef{sizeofPtrVal, one}); Value result = newArr.getResult(0); // Get a pointer to element 0. - Value zero = rewriter.create(loc, 0, 64); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + Type qubitTy = M::getQubitType(rewriter.getContext()); auto ptrQubitTy = cudaq::cc::PointerType::get(qubitTy); - auto elePtr = rewriter.create( - loc, TypeRange{ptrQubitTy}, cudaq::opt::QIRArrayGetElementPtr1d, - ArrayRef{result, zero}); + auto elePtr = func::CallOp::create(rewriter, loc, TypeRange{ptrQubitTy}, + cudaq::opt::QIRArrayGetElementPtr1d, + ArrayRef{result, zero}); // Write the qubit into the array at position 0. 
- auto castVal = rewriter.create(loc, qubitTy, val); + auto castVal = cudaq::cc::CastOp::create(rewriter, loc, qubitTy, val); Value addr = elePtr.getResult(0); - rewriter.create(loc, castVal, addr); + cudaq::cc::StoreOp::create(rewriter, loc, castVal, addr); return result; } @@ -680,11 +697,17 @@ struct ConcatOpRewrite auto loc = concat.getLoc(); Type arrayTy = M::getArrayType(rewriter.getContext()); Value firstOperand = adaptor.getOperands().front(); - Value resultArray = Base::wrapQubitAsArray(loc, rewriter, firstOperand); - for (auto next : adaptor.getOperands().drop_front()) { - Value wrapNext = Base::wrapQubitAsArray(loc, rewriter, next); - auto appended = rewriter.create( - loc, arrayTy, cudaq::opt::QIRArrayConcatArray, + Type firstTy = getInitialType(concat, 0); + Value resultArray = + Base::wrapQubitAsArray(loc, rewriter, firstOperand, firstTy); + SmallVector origTys; + for (auto [i, _] : llvm::enumerate(adaptor.getOperands().drop_front())) + origTys.push_back(getInitialType(concat, i + 1)); + for (auto [next, origTy] : + llvm::zip(adaptor.getOperands().drop_front(), origTys)) { + Value wrapNext = Base::wrapQubitAsArray(loc, rewriter, next, origTy); + auto appended = func::CallOp::create( + rewriter, loc, arrayTy, cudaq::opt::QIRArrayConcatArray, ArrayRef{resultArray, wrapNext}); resultArray = appended.getResult(0); } @@ -740,6 +763,7 @@ struct DeallocLikeErase : public OpConversionPattern { using DeallocOpErase = DeallocLikeErase; using SinkOpErase = DeallocLikeErase; + struct DiscriminateOpRewrite : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -750,7 +774,7 @@ struct DiscriminateOpRewrite auto loc = disc.getLoc(); Value m = adaptor.getMeasurement(); auto i1PtrTy = cudaq::cc::PointerType::get(rewriter.getI1Type()); - auto cast = rewriter.create(loc, i1PtrTy, m); + auto cast = cudaq::cc::CastOp::create(rewriter, loc, i1PtrTy, m); rewriter.replaceOpWithNewOp(disc, cast); return success(); } @@ -776,7 +800,7 @@ struct 
DiscriminateOpToCallRewrite if (operands.size() == 1 && isa(operands.front().getType())) { auto resultTy = M::getResultType(rewriter.getContext()); operands.front() = - rewriter.create(loc, resultTy, operands.front()); + cudaq::cc::CastOp::create(rewriter, loc, resultTy, operands.front()); } if constexpr (M::discriminateToClassical) { if constexpr (M::qirVersion == QirVersion::version_1_0) { @@ -791,9 +815,10 @@ struct DiscriminateOpToCallRewrite } else { // NB: the double cast here is to avoid folding the pointer casts. auto i64Ty = rewriter.getI64Type(); - auto unu = rewriter.create(loc, i64Ty, operands); + auto unu = cudaq::cc::CastOp::create(rewriter, loc, i64Ty, + adaptor.getOperands()); auto ptrI1Ty = cudaq::cc::PointerType::get(rewriter.getI1Type()); - auto du = rewriter.create(loc, ptrI1Ty, unu); + auto du = cudaq::cc::CastOp::create(rewriter, loc, ptrI1Ty, unu); rewriter.replaceOpWithNewOp(disc, du); } return success(); @@ -820,16 +845,16 @@ struct ExtractRefOpRewrite : public OpConversionPattern { Value index; if (!adaptor.getIndex()) { - index = rewriter.create( - loc, extract.getConstantIndex(), 64); + index = arith::ConstantIntOp::create(rewriter, loc, + extract.getConstantIndex(), 64); } else { index = adaptor.getIndex(); if (index.getType().isIntOrFloat()) { if (cast(index.getType()).getWidth() < 64) - index = rewriter.create( - loc, i64Ty, index, cudaq::cc::CastOpMode::Unsigned); + index = cudaq::cc::CastOp::create(rewriter, loc, i64Ty, index, + cudaq::cc::CastOpMode::Unsigned); else if (cast(index.getType()).getWidth() > 64) - index = rewriter.create(loc, i64Ty, index); + index = cudaq::cc::CastOp::create(rewriter, loc, i64Ty, index); } } auto qubitTy = M::getQubitType(rewriter.getContext()); @@ -837,15 +862,15 @@ struct ExtractRefOpRewrite : public OpConversionPattern { if (auto mca = veq.getDefiningOp()) { // This is the profile QIR case. 
- auto ext = rewriter.create( - loc, i64Ty, mca.getConstArray(), index); + auto ext = cudaq::cc::ExtractValueOp::create(rewriter, loc, i64Ty, + mca.getConstArray(), index); rewriter.replaceOpWithNewOp(extract, qubitTy, ext); return success(); } // Otherwise, this must be full QIR. - auto call = rewriter.create( - loc, cudaq::cc::PointerType::get(qubitTy), + auto call = func::CallOp::create( + rewriter, loc, cudaq::cc::PointerType::get(qubitTy), cudaq::opt::QIRArrayGetElementPtr1d, ArrayRef{veq, index}); rewriter.replaceOpWithNewOp(extract, call.getResult(0)); return success(); @@ -888,12 +913,12 @@ struct MakeStruqOpRewrite : public OpConversionPattern { auto loc = mkstruq.getLoc(); auto *ctx = rewriter.getContext(); auto toTy = getTypeConverter()->convertType(mkstruq.getType()); - Value result = rewriter.create(loc, toTy); + Value result = cudaq::cc::UndefOp::create(rewriter, loc, toTy); std::int64_t count = 0; for (auto op : adaptor.getOperands()) { auto off = DenseI64ArrayAttr::get(ctx, ArrayRef{count}); - result = - rewriter.create(loc, toTy, result, op, off); + result = cudaq::cc::InsertValueOp::create(rewriter, loc, toTy, result, op, + off); count++; } rewriter.replaceOp(mkstruq, result); @@ -967,20 +992,20 @@ struct QmemRAIIOpRewrite : public OpConversionPattern { auto type = dyn_cast(allocTy); auto constantSize = type ? 
type.getSize() : 1; sizeOperand = - rewriter.create(loc, constantSize, 64); + arith::ConstantIntOp::create(rewriter, loc, constantSize, 64); } else { sizeOperand = adaptor.getAllocSize(); auto sizeTy = cast(sizeOperand.getType()); if (sizeTy.getWidth() < 64) - sizeOperand = rewriter.create( - loc, i64Ty, sizeOperand, cudaq::cc::CastOpMode::Unsigned); + sizeOperand = cudaq::cc::CastOp::create( + rewriter, loc, i64Ty, sizeOperand, cudaq::cc::CastOpMode::Unsigned); else if (sizeTy.getWidth() > 64) sizeOperand = - rewriter.create(loc, i64Ty, sizeOperand); + cudaq::cc::CastOp::create(rewriter, loc, i64Ty, sizeOperand); } // Call the allocation function - Value casted = rewriter.create(loc, ptrTy, ccState); + Value casted = cudaq::cc::CastOp::create(rewriter, loc, ptrTy, ccState); rewriter.replaceOpWithNewOp( raii, arrayTy, functionName, ArrayRef{sizeOperand, casted}); return success(); @@ -1009,24 +1034,24 @@ struct SubveqOpRewrite : public OpConversionPattern { auto lowArg = [&]() -> Value { if (!adaptor.getLower()) - return rewriter.create(loc, adaptor.getRawLower(), - 64); + return arith::ConstantIntOp::create(rewriter, loc, + adaptor.getRawLower(), 64); return adaptor.getLower(); }(); auto highArg = [&]() -> Value { if (!adaptor.getUpper()) - return rewriter.create(loc, adaptor.getRawUpper(), - 64); + return arith::ConstantIntOp::create(rewriter, loc, + adaptor.getRawUpper(), 64); return adaptor.getUpper(); }(); auto i64Ty = rewriter.getI64Type(); auto extend = [&](Value &v) -> Value { if (auto intTy = dyn_cast(v.getType())) { if (intTy.getWidth() < 64) - return rewriter.create( - loc, i64Ty, v, cudaq::cc::CastOpMode::Unsigned); + return cudaq::cc::CastOp::create(rewriter, loc, i64Ty, v, + cudaq::cc::CastOpMode::Unsigned); if (intTy.getWidth() > 64) - return rewriter.create(loc, i64Ty, v); + return cudaq::cc::CastOp::create(rewriter, loc, i64Ty, v); } return v; }; @@ -1034,8 +1059,8 @@ struct SubveqOpRewrite : public OpConversionPattern { highArg = extend(highArg); 
Value inArr = adaptor.getVeq(); auto i32Ty = rewriter.getI32Type(); - Value one32 = rewriter.create(loc, 1, i32Ty); - Value one64 = rewriter.create(loc, 1, i64Ty); + Value one32 = arith::ConstantIntOp::create(rewriter, loc, i32Ty, 1); + Value one64 = arith::ConstantIntOp::create(rewriter, loc, i64Ty, 1); auto arrayTy = M::getArrayType(rewriter.getContext()); rewriter.replaceOpWithNewOp( subveq, arrayTy, cudaq::opt::QIRArraySlice, @@ -1096,12 +1121,20 @@ struct CustomUnitaryOpPattern return unitary.emitOpError("Custom operations must have targets."); // Concat all the targets into an array. - auto targetArray = - Base::wrapQubitAsArray(loc, rewriter, adaptor.getTargets().front()); - for (auto next : adaptor.getTargets().drop_front()) { - auto wrapNext = Base::wrapQubitAsArray(loc, rewriter, next); - auto result = rewriter.create( - loc, arrayTy, cudaq::opt::QIRArrayConcatArray, + Type firstTy = getInitialType(unitary, adaptor.getParameters().size() + + adaptor.getControls().size()); + auto targetArray = Base::wrapQubitAsArray( + loc, rewriter, adaptor.getTargets().front(), firstTy); + SmallVector origTys; + for (auto [i, _] : llvm::enumerate(adaptor.getTargets().drop_front())) + origTys.push_back( + getInitialType(unitary, adaptor.getParameters().size() + + adaptor.getControls().size() + i + 1)); + for (auto [next, origTy] : + llvm::zip(adaptor.getTargets().drop_front(), origTys)) { + auto wrapNext = Base::wrapQubitAsArray(loc, rewriter, next, origTy); + auto result = func::CallOp::create( + rewriter, loc, arrayTy, cudaq::opt::QIRArrayConcatArray, ArrayRef{targetArray, wrapNext}); targetArray = result.getResult(0); } @@ -1110,15 +1143,21 @@ struct CustomUnitaryOpPattern Value controlArray; if (adaptor.getControls().empty()) { // Use a nullptr for when 0 control qubits are present. 
- Value zero = rewriter.create(loc, 0, 64); - controlArray = rewriter.create(loc, arrayTy, zero); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + controlArray = cudaq::cc::CastOp::create(rewriter, loc, arrayTy, zero); } else { - controlArray = - Base::wrapQubitAsArray(loc, rewriter, adaptor.getControls().front()); - for (auto next : adaptor.getControls().drop_front()) { - auto wrapNext = Base::wrapQubitAsArray(loc, rewriter, next); - auto result = rewriter.create( - loc, arrayTy, cudaq::opt::QIRArrayConcatArray, + Type firstTy = getInitialType(unitary, adaptor.getParameters().size()); + controlArray = Base::wrapQubitAsArray( + loc, rewriter, adaptor.getControls().front(), firstTy); + SmallVector origTys; + for (auto [i, _] : llvm::enumerate(adaptor.getControls().drop_front())) + origTys.push_back( + getInitialType(unitary, adaptor.getParameters().size() + i + 1)); + for (auto [next, origTy] : + llvm::zip(adaptor.getControls().drop_front(), origTys)) { + auto wrapNext = Base::wrapQubitAsArray(loc, rewriter, next, origTy); + auto result = func::CallOp::create( + rewriter, loc, arrayTy, cudaq::opt::QIRArrayConcatArray, ArrayRef{controlArray, wrapNext}); controlArray = result.getResult(0); } @@ -1136,10 +1175,10 @@ struct CustomUnitaryOpPattern auto complex64PtrTy = cudaq::cc::PointerType::get(complex64Ty); auto globalObj = cast( unitary->getParentOfType().lookupSymbol(generatorName)); - auto addrOp = rewriter.create( - loc, globalObj.getType(), generatorName); + auto addrOp = cudaq::cc::AddressOfOp::create( + rewriter, loc, globalObj.getType(), generatorName); auto unitaryData = - rewriter.create(loc, complex64PtrTy, addrOp); + cudaq::cc::CastOp::create(rewriter, loc, complex64PtrTy, addrOp); StringRef functionName = unitary.isAdj() ? 
cudaq::opt::QIRCustomAdjOp : cudaq::opt::QIRCustomOp; @@ -1179,18 +1218,29 @@ struct ExpPauliOpPattern if (adaptor.getNegatedQubitControls()) return pauli->emitOpError("negated control qubits not allowed."); SmallVector controls; + const auto firstControlIndex = adaptor.getParameters().size(); if (adaptor.getControls().empty()) { // do nothing } else if (adaptor.getControls().size() > 1 || - !isa(adaptor.getControls().front().getType())) { + !isa(getInitialType(pauli, firstControlIndex))) { // Concat all controls into a single Array. Type arrayTy = M::getArrayType(rewriter.getContext()); + auto wrapIfQubit = [&](Value adaptorVal, Type origTy) { + if (isa(origTy)) + return adaptorVal; + return Base::wrapQubitAsArray(loc, rewriter, adaptorVal, origTy); + }; Value firstOperand = adaptor.getControls().front(); - Value resultArray = Base::wrapQubitAsArray(loc, rewriter, firstOperand); - for (auto next : adaptor.getControls().drop_front()) { - Value wrapNext = Base::wrapQubitAsArray(loc, rewriter, next); - auto appended = rewriter.create( - loc, arrayTy, cudaq::opt::QIRArrayConcatArray, + Type firstTy = getInitialType(pauli, firstControlIndex); + Value resultArray = wrapIfQubit(firstOperand, firstTy); + SmallVector origCtrlTys; + for (auto [i, _] : llvm::enumerate(adaptor.getControls().drop_front())) + origCtrlTys.push_back(getInitialType(pauli, firstControlIndex + i + 1)); + for (auto [next, origCtrlTy] : + llvm::zip(adaptor.getControls().drop_front(), origCtrlTys)) { + Value wrapNext = wrapIfQubit(next, origCtrlTy); + auto appended = func::CallOp::create( + rewriter, loc, arrayTy, cudaq::opt::QIRArrayConcatArray, ArrayRef{resultArray, wrapNext}); resultArray = appended.getResult(0); } @@ -1199,16 +1249,23 @@ struct ExpPauliOpPattern controls.push_back(adaptor.getControls().front()); } SmallVector targets; - if (adaptor.getTargets().size() > 1 || - !isa(adaptor.getTargets().front().getType())) { + const auto firstTargetIndex = + firstControlIndex + 
adaptor.getControls().size(); + Type firstTy = getInitialType(pauli, firstTargetIndex); + if (adaptor.getTargets().size() > 1 || !isa(firstTy)) { // Concat all targets into a single Array. Type arrayTy = M::getArrayType(rewriter.getContext()); Value firstOperand = adaptor.getTargets().front(); - Value resultArray = Base::wrapQubitAsArray(loc, rewriter, firstOperand); - for (auto next : adaptor.getTargets().drop_front()) { - Value wrapNext = Base::wrapQubitAsArray(loc, rewriter, next); - auto appended = rewriter.create( - loc, arrayTy, cudaq::opt::QIRArrayConcatArray, + Value resultArray = + Base::wrapQubitAsArray(loc, rewriter, firstOperand, firstTy); + SmallVector origTargTys; + for (auto [i, _] : llvm::enumerate(adaptor.getTargets().drop_front())) + origTargTys.push_back(getInitialType(pauli, firstTargetIndex + i + 1)); + for (auto [next, origTy] : + llvm::zip(adaptor.getTargets().drop_front(), origTargTys)) { + Value wrapNext = Base::wrapQubitAsArray(loc, rewriter, next, origTy); + auto appended = func::CallOp::create( + rewriter, loc, arrayTy, cudaq::opt::QIRArrayConcatArray, ArrayRef{resultArray, wrapNext}); resultArray = appended.getResult(0); } @@ -1221,7 +1278,7 @@ struct ExpPauliOpPattern auto qirFunctionName = M::quakeToFuncName(pauli); if (pauli.isAdj()) { for (auto v : adaptor.getParameters()) - operands.push_back(rewriter.create(loc, v)); + operands.push_back(arith::NegFOp::create(rewriter, loc, v)); } else { operands.append(adaptor.getParameters().begin(), adaptor.getParameters().end()); @@ -1242,7 +1299,7 @@ struct ExpPauliOpPattern auto arrSize = llvmArrTy.getNumElements(); auto toTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get( rewriter.getContext(), arrEleTy, arrSize)); - return rewriter.create(loc, toTy, glob); + return cudaq::cc::CastOp::create(rewriter, loc, toTy, glob); } return adaptor.getPauli(); }(); @@ -1253,64 +1310,74 @@ struct ExpPauliOpPattern // directly (a.k.a. a span)`{i8*,i64}` or a string literal `ptr>`. 
If it is a string literal, we need to map it to a pauli word. auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); - if (auto ptrTy = dyn_cast(pauliWord.getType())) { - // Make sure we have the right types to extract the length of the string - // literal - auto arrayTy = dyn_cast(ptrTy.getElementType()); - if (!arrayTy) - return pauli.emitOpError( - "exp_pauli string literal must have ptr type."); - if (!arrayTy.getSize()) - return pauli.emitOpError("string literal may not be empty."); - - // We must create the {i8*, i64} struct from the string literal - SmallVector structTys{i8PtrTy, rewriter.getI64Type()}; - auto structTy = - cudaq::cc::StructType::get(rewriter.getContext(), structTys); - - // Allocate the char span struct + Type wordTy; + if (!pauli.getPauliLiteral()) + wordTy = + getInitialType(pauli, firstTargetIndex + adaptor.getTargets().size()); + if (wordTy && isa(wordTy)) { + // The attribute tells us we have a pauli word expressed as `{i8*, i64}`. + // Allocate a stack slot for it and store what we have to that pointer, + // pass the pointer to NVQIR. + auto newPauliWord = pauliWord; + auto newPauliWordTy = newPauliWord.getType(); Value alloca = - cudaq::opt::factory::createTemporary(loc, rewriter, structTy); - - // Convert the number of elements to a constant op. 
- auto size = - rewriter.create(loc, arrayTy.getSize() - 1, 64); - - // Set the string literal data + cudaq::opt::factory::createTemporary(loc, rewriter, newPauliWordTy); + auto castedVar = cudaq::cc::CastOp::create( + rewriter, loc, cudaq::cc::PointerType::get(newPauliWordTy), alloca); + cudaq::cc::StoreOp::create(rewriter, loc, newPauliWord, castedVar); auto castedPauli = - rewriter.create(loc, i8PtrTy, pauliWord); - auto strPtr = rewriter.create( - loc, cudaq::cc::PointerType::get(i8PtrTy), alloca, - ArrayRef{0, 0}); - rewriter.create(loc, castedPauli, strPtr); - - // Set the integer length - auto intPtr = rewriter.create( - loc, cudaq::cc::PointerType::get(rewriter.getI64Type()), alloca, - ArrayRef{0, 1}); - rewriter.create(loc, size, intPtr); - - // Cast to raw opaque pointer - auto castedStore = - rewriter.create(loc, i8PtrTy, alloca); - operands.back() = castedStore; + cudaq::cc::CastOp::create(rewriter, loc, i8PtrTy, alloca); + operands.back() = castedPauli; rewriter.replaceOpWithNewOp(pauli, TypeRange{}, qirFunctionName, operands); return success(); } + // Make sure we have the right types to extract the length of the string + // literal. - // Here we know we have a pauli word expressed as `{i8*, i64}`. Allocate a - // stack slot for it and store what we have to that pointer, pass the - // pointer to NVQIR. 
- auto newPauliWord = pauliWord; - auto newPauliWordTy = newPauliWord.getType(); + auto ptrTy = [&]() -> cudaq::cc::PointerType { + if (wordTy) + return dyn_cast(wordTy); + return dyn_cast(pauliWord.getType()); + }(); + auto arrayTy = dyn_cast(ptrTy.getElementType()); + if (!arrayTy) + return pauli.emitOpError( + "exp_pauli string literal must have ptr type."); + if (!arrayTy.getSize()) + return pauli.emitOpError("string literal may not be empty."); + + // We must create the {i8*, i64} struct from the string literal + SmallVector structTys{i8PtrTy, rewriter.getI64Type()}; + auto structTy = + cudaq::cc::StructType::get(rewriter.getContext(), structTys); + + // Allocate the char span struct Value alloca = - cudaq::opt::factory::createTemporary(loc, rewriter, newPauliWordTy); - auto castedVar = rewriter.create( - loc, cudaq::cc::PointerType::get(newPauliWordTy), alloca); - rewriter.create(loc, newPauliWord, castedVar); - auto castedPauli = rewriter.create(loc, i8PtrTy, alloca); - operands.back() = castedPauli; + cudaq::opt::factory::createTemporary(loc, rewriter, structTy); + + // Convert the number of elements to a constant op. 
+ auto size = + arith::ConstantIntOp::create(rewriter, loc, arrayTy.getSize() - 1, 64); + + // Set the string literal data + auto castedPauli = + cudaq::cc::CastOp::create(rewriter, loc, i8PtrTy, pauliWord); + auto strPtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, cudaq::cc::PointerType::get(i8PtrTy), alloca, + ArrayRef{0, 0}); + cudaq::cc::StoreOp::create(rewriter, loc, castedPauli, strPtr); + + // Set the integer length + auto intPtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, cudaq::cc::PointerType::get(rewriter.getI64Type()), + alloca, ArrayRef{0, 1}); + cudaq::cc::StoreOp::create(rewriter, loc, size, intPtr); + + // Cast to raw opaque pointer + auto castedStore = + cudaq::cc::CastOp::create(rewriter, loc, i8PtrTy, alloca); + operands.back() = castedStore; rewriter.replaceOpWithNewOp(pauli, TypeRange{}, qirFunctionName, operands); return success(); @@ -1356,13 +1423,13 @@ struct MeasurementOpPattern : public OpConversionPattern { } auto resultTy = M::getResultType(rewriter.getContext()); auto call = - rewriter.create(loc, resultTy, functionName, args); + func::CallOp::create(rewriter, loc, resultTy, functionName, args); auto assundry = filterArgs(mz, adaptor.getTargets()); SmallVector replaceVals; if (measOutIsHandle) { auto i64Ty = rewriter.getI64Type(); replaceVals.push_back( - rewriter.create(loc, i64Ty, call.getResult(0))); + cudaq::cc::CastOp::create(rewriter, loc, i64Ty, call.getResult(0))); } else { replaceVals.append(call.getResults().begin(), call.getResults().end()); } @@ -1381,12 +1448,12 @@ struct MeasurementOpPattern : public OpConversionPattern { auto resultAttr = mz->getAttr(cudaq::opt::ResultIndexAttrName); std::int64_t annInt = cast(resultAttr).getInt(); - Value intVal = rewriter.create(loc, annInt, 64); + Value intVal = arith::ConstantIntOp::create(rewriter, loc, annInt, 64); auto resultTy = M::getResultType(rewriter.getContext()); - Value res = rewriter.create(loc, resultTy, intVal); + Value res = 
cudaq::cc::CastOp::create(rewriter, loc, resultTy, intVal); args.push_back(res); auto call = - rewriter.create(loc, TypeRange{}, functionName, args); + func::CallOp::create(rewriter, loc, TypeRange{}, functionName, args); call->setAttr(cudaq::opt::QIRRegisterNameAttr, regNameAttr); // For handle-form callers, materialize the back-cast `Result* -> i64` // here so it dominates downstream uses. The `!discriminateToClassical` @@ -1395,7 +1462,7 @@ struct MeasurementOpPattern : public OpConversionPattern { auto i64Ty = rewriter.getI64Type(); Value handleRes = - measOutIsHandle ? rewriter.create(loc, i64Ty, res) + measOutIsHandle ? cudaq::cc::CastOp::create(rewriter, loc, i64Ty, res) : res; auto cstringGlobal = createGlobalCString(mz, loc, rewriter, regNameAttr.getValue()); @@ -1408,9 +1475,9 @@ struct MeasurementOpPattern : public OpConversionPattern { } auto func = mz->getParentOfType(); if (!func->hasAttr(cudaq::runtime::enableCudaqRun)) { - auto recOut = rewriter.create( - loc, TypeRange{}, cudaq::opt::QIRRecordOutput, - ArrayRef{res, cstringGlobal}); + auto recOut = func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRRecordOutput, + ArrayRef{res, cstringGlobal}); recOut->setAttr(cudaq::opt::ResultIndexAttrName, resultAttr); recOut->setAttr(cudaq::opt::QIRRegisterNameAttr, regNameAttr); } @@ -1440,8 +1507,8 @@ struct ResetOpPattern : public OpConversionPattern { } else { auto loc = reset.getLoc(); auto results = filterArgs(reset, adaptor.getOperands()); - rewriter.create(loc, TypeRange{}, qirFunctionName, - adaptor.getOperands()); + func::CallOp::create(rewriter, loc, TypeRange{}, qirFunctionName, + adaptor.getOperands()); rewriter.replaceOp(reset, results); } return success(); @@ -1458,12 +1525,12 @@ struct ApplyOpTrap : public OpConversionPattern { matchAndRewrite(quake::ApplyOp apply, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = apply.getLoc(); - Value zero = rewriter.create(loc, 0, 64); - rewriter.create(loc, 
TypeRange{}, cudaq::opt::QISTrap, - ValueRange{zero}); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + func::CallOp::create(rewriter, loc, TypeRange{}, cudaq::opt::QISTrap, + ValueRange{zero}); SmallVector values; for (auto r : apply.getResults()) { - Value v = rewriter.create(loc, r.getType()); + Value v = cudaq::cc::PoisonOp::create(rewriter, loc, r.getType()); values.push_back(v); } rewriter.replaceOp(apply, values); @@ -1488,8 +1555,8 @@ struct CallByRefOpRewrite : public OpConversionPattern { if (quake::isQuantumValueType(valarg.getType())) quantumArgs.push_back(qirarg); - auto refCall = rewriter.create( - loc, fn.getFunctionType().getResults(), + auto refCall = func::CallOp::create( + rewriter, loc, fn.getFunctionType().getResults(), adaptor.getCallee().getRootReference().getValue(), adaptor.getArgs()); // Concat the formal results and the quantum arguments to rewrite the uses. @@ -1614,14 +1681,14 @@ struct QuantumGatePattern : public OpConversionPattern { // If this is adjoint, each parameter is negated. 
if (op.getIsAdj()) { for (std::size_t i = 0; i < opParams.size(); ++i) - opParams[i] = rewriter.create(loc, opParams[i]); + opParams[i] = arith::NegFOp::create(rewriter, loc, opParams[i]); if constexpr (std::is_same_v) { std::swap(opParams[0], opParams[1]); auto fltTy = cast(opParams[0].getType()); - Value pi = rewriter.create( - loc, llvm::APFloat{M_PI}, fltTy); - opParams[0] = rewriter.create(loc, opParams[0], pi); - opParams[1] = rewriter.create(loc, opParams[1], pi); + Value pi = arith::ConstantFloatOp::create(rewriter, loc, fltTy, + llvm::APFloat{M_PI}); + opParams[0] = arith::SubFOp::create(rewriter, loc, opParams[0], pi); + opParams[1] = arith::AddFOp::create(rewriter, loc, opParams[1], pi); } else if constexpr (std::is_same_v) { // swap the 2nd and 3rd parameter for correctness std::swap(opParams[1], opParams[2]); @@ -1633,7 +1700,7 @@ struct QuantumGatePattern : public OpConversionPattern { for (std::size_t i = 0; i < opParams.size(); ++i) { if (opParams[i].getType().getIntOrFloatBitWidth() != 64) opParams[i] = - rewriter.create(loc, f64Ty, opParams[i]); + cudaq::cc::CastOp::create(rewriter, loc, f64Ty, opParams[i]); } } @@ -1641,14 +1708,14 @@ struct QuantumGatePattern : public OpConversionPattern { // just add a call and forward the target qubits as needed. 
auto numControls = adaptor.getControls().size(); if (op.getControls().empty() || - conformsToIntendedCall(numControls, op.getControls().front(), op, - qirFunctionName)) { + conformsToIntendedCall(numControls, getInitialType(op, opParams.size()), + op, qirFunctionName)) { SmallVector args{opParams.begin(), opParams.end()}; args.append(adaptor.getControls().begin(), adaptor.getControls().end()); args.append(adaptor.getTargets().begin(), adaptor.getTargets().end()); qirFunctionName = specializeFunctionName(op, qirFunctionName, numControls); - rewriter.create(loc, TypeRange{}, qirFunctionName, args); + func::CallOp::create(rewriter, loc, TypeRange{}, qirFunctionName, args); return forwardOrEraseOp(); } @@ -1665,22 +1732,24 @@ struct QuantumGatePattern : public OpConversionPattern { Type i64Ty = rewriter.getI64Type(); auto ptrNoneTy = M::getLLVMPointerType(rewriter.getContext()); - // Process the controls, sorting them by type. - for (auto pr : llvm::zip(op.getControls(), adaptor.getControls())) { - if (isaVeqArgument(std::get<0>(pr).getType())) { + // Process the controls, sorting them by type. Using the original + // type recorded by QuakeToQIRAPIPrep, since opaque pointers + // make Array* and Qubit* indistinguishable on the live operand. 
+ for (auto [i, val] : llvm::enumerate(adaptor.getControls())) { + Type origCtrlTy = getInitialType(op, opParams.size() + i); + if (isaVeqArgument(origCtrlTy)) { numArrayCtrls++; - auto sizeCall = rewriter.create( - loc, i64Ty, cudaq::opt::QIRArrayGetSize, - ValueRange{std::get<1>(pr)}); + auto sizeCall = func::CallOp::create( + rewriter, loc, i64Ty, cudaq::opt::QIRArrayGetSize, ValueRange{val}); // Arrays are encoded as pairs of arguments: length and Array* opArrCtrls.push_back(sizeCall.getResult(0)); - opArrCtrls.push_back(rewriter.create( - loc, ptrNoneTy, std::get<1>(pr))); + opArrCtrls.push_back( + cudaq::cc::CastOp::create(rewriter, loc, ptrNoneTy, val)); } else { numQubitCtrls++; // Qubits are simply the Qubit** - opQubitCtrls.emplace_back(rewriter.create( - loc, ptrNoneTy, std::get<1>(pr))); + opQubitCtrls.emplace_back( + cudaq::cc::CastOp::create(rewriter, loc, ptrNoneTy, val)); } } @@ -1694,9 +1763,9 @@ struct QuantumGatePattern : public OpConversionPattern { return op.emitError("cannot find " + qirFunctionName); FunctionType qirFunctionTy = funOp.getFunctionType(); auto funCon = - rewriter.create(loc, qirFunctionTy, qirFunctionName); + func::ConstantOp::create(rewriter, loc, qirFunctionTy, qirFunctionName); auto funPtr = - rewriter.create(loc, ptrNoneTy, funCon); + cudaq::cc::FuncToPtrOp::create(rewriter, loc, ptrNoneTy, funCon); // Process the target qubits. auto numTargets = adaptor.getTargets().size(); @@ -1704,18 +1773,18 @@ struct QuantumGatePattern : public OpConversionPattern { return op.emitOpError("quake op must have at least 1 target."); SmallVector opTargs; for (auto t : adaptor.getTargets()) - opTargs.push_back(rewriter.create(loc, ptrNoneTy, t)); + opTargs.push_back(cudaq::cc::CastOp::create(rewriter, loc, ptrNoneTy, t)); // Build the declared arguments for the helper call (5 total). 
SmallVector args; args.emplace_back( - rewriter.create(loc, opParams.size(), 64)); + arith::ConstantIntOp::create(rewriter, loc, opParams.size(), 64)); args.emplace_back( - rewriter.create(loc, numArrayCtrls, 64)); + arith::ConstantIntOp::create(rewriter, loc, numArrayCtrls, 64)); args.emplace_back( - rewriter.create(loc, numQubitCtrls, 64)); + arith::ConstantIntOp::create(rewriter, loc, numQubitCtrls, 64)); args.emplace_back( - rewriter.create(loc, numTargets, 64)); + arith::ConstantIntOp::create(rewriter, loc, numTargets, 64)); args.emplace_back(funPtr); // Finally, append the varargs to the end of the argument list. @@ -1725,8 +1794,9 @@ struct QuantumGatePattern : public OpConversionPattern { args.append(opTargs.begin(), opTargs.end()); // Call the generalized version of the gate invocation. - rewriter.create( - loc, TypeRange{}, cudaq::opt::NVQIRGeneralizedInvokeAny, args); + cudaq::cc::VarargCallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::NVQIRGeneralizedInvokeAny, + args); return forwardOrEraseOp(); } @@ -1742,11 +1812,10 @@ struct QuantumGatePattern : public OpConversionPattern { return isa(ty) || alreadyConverted(ty); } - static bool conformsToIntendedCall(std::size_t numControls, Value ctrl, OP op, - StringRef qirFunctionName) { + static bool conformsToIntendedCall(std::size_t numControls, Type ctrlTy, + OP op, StringRef qirFunctionName) { if (numControls != 1) return false; - auto ctrlTy = ctrl.getType(); auto trivialName = specializeFunctionName(op, qirFunctionName, numControls); const bool nameChanged = trivialName != qirFunctionName; if (nameChanged && !isa(ctrlTy)) @@ -1794,11 +1863,10 @@ struct AllocaOpPattern : public OpConversionPattern { }; struct ReturnOpPattern : public OpConversionPattern { - using Base = OpConversionPattern; - using Base::Base; + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(func::ReturnOp op, typename Base::OpAdaptor adaptor, + matchAndRewrite(func::ReturnOp op, OpAdaptor adaptor, 
ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp(op, adaptor.getOperands()); return success(); @@ -1835,7 +1903,7 @@ struct FuncSignaturePattern : public OpConversionPattern { blockArg.setType(newTy); } // Replace the signature. - rewriter.updateRootInPlace(func, [&]() { + rewriter.modifyOpInPlace(func, [&]() { func.setFunctionType(newFuncTy); func->setAttr(FuncIsQIRAPI, rewriter.getUnitAttr()); }); @@ -1863,8 +1931,8 @@ struct CreateLambdaPattern blockArg.setType(argTy); } // Replace the signature. - rewriter.updateRootInPlace(op, - [&]() { op.getSignature().setType(newSigTy); }); + rewriter.modifyOpInPlace(op, + [&]() { op.getSignature().setType(newSigTy); }); return success(); } }; @@ -1986,7 +2054,8 @@ struct CondBranchOpPattern : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, adaptor.getCondition(), adaptor.getTrueDestOperands(), - adaptor.getFalseDestOperands(), op.getTrueDest(), op.getFalseDest()); + adaptor.getFalseDestOperands(), DenseI32ArrayAttr(), op.getTrueDest(), + op.getFalseDest()); return success(); } }; @@ -2040,11 +2109,7 @@ static void commonQuakeHandlingPatterns(RewritePatternSet &patterns, template Type GetLLVMPointerType(MLIRContext *ctx) { - if constexpr (opaquePtr) { - return LLVM::LLVMPointerType::get(ctx); - } else { - return LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8)); - } + return LLVM::LLVMPointerType::get(ctx); } /// The modifier class for the "full QIR" API. 
@@ -2423,7 +2488,7 @@ struct QuakeToQIRAPIPrepPass RewritePatternSet patterns(ctx); QIRAPITypeConverter typeConverter(opaquePtr); cudaq::opt::populateQuakeToCCPrepPatterns(patterns); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) { + if (failed(applyPatternsGreedily(module, std::move(patterns)))) { signalPassFailure(); return; } @@ -2575,10 +2640,11 @@ struct QuakeToQIRAPIPrepPass auto *ctx = module.getContext(); module.walk([&](Operation *op) { - if (!std::any_of(op->getResultTypes().begin(), op->getResultTypes().end(), - quake::isQuantumValueType) || - !std::any_of(op->getOperandTypes().begin(), - op->getOperandTypes().end(), quake::isQuantumValueType)) + if (std::all_of(op->getResultTypes().begin(), op->getResultTypes().end(), + [&](Type ty) { return !quake::isQuantumType(ty); }) && + std::all_of(op->getOperandTypes().begin(), + op->getOperandTypes().end(), + [&](Type ty) { return !quake::isQuantumType(ty); })) return; SmallVector typeAttrs; typeAttrs.reserve(op->getOperands().size()); @@ -2626,7 +2692,7 @@ struct QuakeToQIRAPIFinalPass RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); } }; diff --git a/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp b/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp index 25a3689252c..ae7f05db870 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp @@ -11,18 +11,23 @@ #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/Peephole.h" #include "cudaq/Optimizer/CodeGen/QIRAttributeNames.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Todo.h" #include "nlohmann/json.hpp" #include "llvm/ADT/SmallSet.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include 
"mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" +namespace cudaq::opt { +#define GEN_PASS_DEF_QIRTOQIRPROFILE +#define GEN_PASS_DEF_QIRTOQIRPROFILEFUNC +#define GEN_PASS_DEF_QIRTOQIRPROFILEPREP +#include "cudaq/Optimizer/CodeGen/Passes.h.inc" +} // namespace cudaq::opt + #define DEBUG_TYPE "qir-profile" /** @@ -49,7 +54,7 @@ static std::size_t getNumQubits(LLVM::CallOp callOp) { while (defOp && !dyn_cast(defOp)) defOp = defOp->getOperand(0).getDefiningOp(); if (auto constOp = dyn_cast_or_null(defOp)) - return constOp.getValue().cast().getValue().getLimitedValue(); + return cast(constOp.getValue()).getValue().getLimitedValue(); TODO_loc(callOp.getLoc(), "cannot compute number of qubits allocated"); } @@ -64,7 +69,7 @@ static bool isQIRSliceCall(Operation *op) { static std::optional sliceLowerBound(Operation *op) { Value low = op->getOperand(2); if (auto con = low.getDefiningOp()) - return con.getValue().cast().getInt(); + return cast(con.getValue()).getInt(); return {}; } @@ -179,7 +184,7 @@ struct FunctionProfileAnalysis { if (constVal) if (auto incr = constVal->getDefiningOp()) optQb = - allocOffset + incr.getValue().cast().getInt(); + allocOffset + cast(incr.getValue()).getInt(); } } if (optQb) { @@ -189,8 +194,8 @@ struct FunctionProfileAnalysis { auto resIdx = IntegerAttr::get(intTy, data.nResults); callOp->setAttr(resultIndexName, resIdx); auto regName = [&]() -> StringAttr { - if (auto nameAttr = callOp->getAttr(cudaq::opt::QIRRegisterNameAttr) - .dyn_cast_or_null()) + if (auto nameAttr = dyn_cast_if_present( + callOp->getAttr(cudaq::opt::QIRRegisterNameAttr))) return nameAttr; return {}; }(); @@ -219,7 +224,7 @@ struct AddFuncAttribute : public OpRewritePattern { // Add attributes to the function. 
auto iter = infoMap.find(op); assert(iter != infoMap.end()); - rewriter.startRootUpdate(op); + rewriter.startOpModification(op); const auto &info = iter->second; nlohmann::json resultQubitJSON{info.resultQubitVals}; bool isAdaptive = convertTo == "qir-adaptive"; @@ -227,20 +232,18 @@ struct AddFuncAttribute : public OpRewritePattern { auto requiredQubitsStr = std::to_string(info.nQubits); StringRef requiredQubitsStrRef = requiredQubitsStr; - if (auto stringAttr = - op->getAttr(cudaq::opt::qir0_1::RequiredQubitsAttrName) - .dyn_cast_or_null()) + if (auto stringAttr = dyn_cast_if_present( + op->getAttr(cudaq::opt::qir0_1::RequiredQubitsAttrName))) requiredQubitsStrRef = stringAttr; auto requiredResultsStr = std::to_string(info.nResults); StringRef requiredResultsStrRef = requiredResultsStr; - if (auto stringAttr = - op->getAttr(cudaq::opt::qir0_1::RequiredResultsAttrName) - .dyn_cast_or_null()) + if (auto stringAttr = dyn_cast_if_present( + op->getAttr(cudaq::opt::qir0_1::RequiredResultsAttrName))) requiredResultsStrRef = stringAttr; StringRef outputNamesStrRef; std::string resultQubitJSONStr; - if (auto strAttr = op->getAttr(cudaq::opt::QIROutputNamesAttrName) - .dyn_cast_or_null()) { + if (auto strAttr = dyn_cast_if_present( + op->getAttr(cudaq::opt::QIROutputNamesAttrName))) { outputNamesStrRef = strAttr; } else { resultQubitJSONStr = resultQubitJSON.dump(); @@ -271,7 +274,7 @@ struct AddFuncAttribute : public OpRewritePattern { auto builder = cudaq::IRBuilder::atBlockTerminator(&op.getBody().back()); auto loc = op.getBody().back().getTerminator()->getLoc(); - auto resultTy = cudaq::opt::getResultType(rewriter.getContext()); + auto resultTy = cudaq::cg::getLLVMResultType(rewriter.getContext()); auto i64Ty = rewriter.getI64Type(); auto module = op->getParentOfType(); for (auto &iv : info.resultQubitVals) { @@ -282,29 +285,31 @@ struct AddFuncAttribute : public OpRewritePattern { if (isAdaptive) builder.setInsertionPointAfter( 
info.resultOperation.find(iv.first)->getSecond()); - Value idx = builder.create(loc, i64Ty, iv.first); - Value ptr = builder.create(loc, resultTy, idx); + Value idx = LLVM::ConstantOp::create(builder, loc, i64Ty, iv.first); + Value ptr = LLVM::IntToPtrOp::create(builder, loc, resultTy, idx); auto regName = [&]() -> Value { - auto charPtrTy = cudaq::opt::getCharPointerType(builder.getContext()); + auto charPtrTy = + cudaq::cg::getLLVMCharPointerType(builder.getContext()); if (!rec.second.empty()) { // Note: it should be the case that this string literal has already // been added to the IR, so this step does not actually update the // module. auto globl = builder.genCStringLiteralAppendNul(loc, module, rec.second); - auto addrOf = builder.create( - loc, cudaq::opt::factory::getPointerType(globl.getType()), + auto addrOf = LLVM::AddressOfOp::create( + builder, loc, + cudaq::opt::factory::getPointerType(globl.getType()), globl.getName()); - return builder.create(loc, charPtrTy, addrOf); + return LLVM::BitcastOp::create(builder, loc, charPtrTy, addrOf); } - Value zero = builder.create(loc, i64Ty, 0); - return builder.create(loc, charPtrTy, zero); + Value zero = LLVM::ConstantOp::create(builder, loc, i64Ty, 0); + return LLVM::IntToPtrOp::create(builder, loc, charPtrTy, zero); }(); - builder.create(loc, TypeRange{}, - cudaq::opt::QIRRecordOutput, - ValueRange{ptr, regName}); + LLVM::CallOp::create(builder, loc, TypeRange{}, + cudaq::opt::QIRRecordOutput, + ValueRange{ptr, regName}); } - rewriter.finalizeRootUpdate(op); + rewriter.finalizeOpModification(op); return success(); } @@ -326,10 +331,10 @@ struct AddCallAttribute : public OpRewritePattern { auto startIter = info.allocationOffsets.find(op.getOperation()); assert(startIter != info.allocationOffsets.end()); auto startVal = startIter->second; - rewriter.startRootUpdate(op); + rewriter.startOpModification(op); op->setAttr(cudaq::opt::StartingOffsetAttrName, rewriter.getIntegerAttr(rewriter.getI64Type(), startVal)); - 
rewriter.finalizeRootUpdate(op); + rewriter.finalizeOpModification(op); return success(); } @@ -343,7 +348,8 @@ struct AddCallAttribute : public OpRewritePattern { /// calls are bijective with all distinct measurement calls in the original /// function, however the indices used may be renumbered and start at 0. struct QIRToQIRProfileFuncPass - : public cudaq::opt::QIRToQIRProfileFuncBase { + : public cudaq::opt::impl::QIRToQIRProfileFuncBase< + QIRToQIRProfileFuncPass> { using QIRToQIRProfileFuncBase::QIRToQIRProfileFuncBase; explicit QIRToQIRProfileFuncPass(llvm::StringRef convertTo_) @@ -408,15 +414,15 @@ struct ArrayGetElementPtrConv : public OpRewritePattern { if (!call) return failure(); auto loc = op.getLoc(); - if (call.getCallee()->equals(cudaq::opt::QIRArrayGetElementPtr1d)) { + if (call.getCallee() == cudaq::opt::QIRArrayGetElementPtr1d) { auto *alloc = call.getOperand(0).getDefiningOp(); if (!alloc->hasAttr(cudaq::opt::StartingOffsetAttrName)) return failure(); Value disp = call.getOperand(1); - Value off = rewriter.create( - loc, disp.getType(), + Value off = LLVM::ConstantOp::create( + rewriter, loc, disp.getType(), alloc->getAttr(cudaq::opt::StartingOffsetAttrName)); - Value qubit = rewriter.create(loc, off, disp); + Value qubit = LLVM::AddOp::create(rewriter, loc, off, disp); rewriter.replaceOpWithNewOp(op, op.getType(), qubit); return success(); } @@ -429,13 +435,13 @@ struct CallAlloc : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::CallOp call, PatternRewriter &rewriter) const override { - if (!call.getCallee()->equals(cudaq::opt::QIRQubitAllocate)) + if (call.getCallee() != cudaq::opt::QIRQubitAllocate) return failure(); if (!call->hasAttr(cudaq::opt::StartingOffsetAttrName)) return failure(); auto loc = call.getLoc(); - Value qubit = rewriter.create( - loc, rewriter.getI64Type(), + Value qubit = LLVM::ConstantOp::create( + rewriter, loc, rewriter.getI64Type(), call->getAttr(cudaq::opt::StartingOffsetAttrName)); auto resTy = 
call.getResult().getType(); rewriter.replaceOpWithNewOp(call, resTy, qubit); @@ -454,10 +460,10 @@ struct ZCtrlOneTargetToCZ : public OpRewritePattern { PatternRewriter &rewriter) const override { ValueRange args(call.getArgOperands()); if (args.size() == 4 && call.getCallee() && - call.getCallee()->equals(cudaq::opt::NVQIRInvokeWithControlBits)) { + call.getCallee() == cudaq::opt::NVQIRInvokeWithControlBits) { if (auto addrOf = dyn_cast_or_null( args[1].getDefiningOp())) { - if (addrOf.getGlobalName().startswith( + if (addrOf.getGlobalName().starts_with( std::string(cudaq::opt::QIRQISPrefix) + "z__ctl")) { rewriter.replaceOpWithNewOp( call, TypeRange{}, cudaq::opt::QIRCZ, args.drop_front(2)); @@ -476,7 +482,7 @@ struct ZCtrlOneTargetToCZ : public OpRewritePattern { /// DAGs in the IR and replace them to meet the requirements of the base /// profile. The patterns are defined in Peephole.td. struct QIRToQIRProfileQIRPass - : public cudaq::opt::QIRToQIRProfileBase { + : public cudaq::opt::impl::QIRToQIRProfileBase { explicit QIRToQIRProfileQIRPass() = default; /// @brief Construct pass @@ -499,7 +505,7 @@ struct QIRToQIRProfileQIRPass XCtrlOneTargetToCNot, ZCtrlOneTargetToCZ>(context); if (convertTo.getValue() == "qir-adaptive") patterns.insert(context); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After QIR profile:\n" << *op << '\n'); } @@ -530,7 +536,8 @@ static constexpr std::array measurementFunctionNames{ cudaq::opt::QIRMeasureToRegister}; struct QIRProfilePreparationPass - : public cudaq::opt::QIRToQIRProfilePrepBase { + : public cudaq::opt::impl::QIRToQIRProfilePrepBase< + QIRProfilePreparationPass> { void runOnOperation() override { ModuleOp module = getOperation(); @@ -539,27 +546,30 @@ struct QIRProfilePreparationPass // Add cnot declaration as it may be referenced after peepholes run. 
cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRCnot, LLVM::LLVMVoidType::get(ctx), - {cudaq::opt::getQubitType(ctx), cudaq::opt::getQubitType(ctx)}, module); + {cudaq::cg::getLLVMQubitType(ctx), cudaq::cg::getLLVMQubitType(ctx)}, + module); // Add cz declaration as it may be referenced after peepholes run. cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRCZ, LLVM::LLVMVoidType::get(ctx), - {cudaq::opt::getQubitType(ctx), cudaq::opt::getQubitType(ctx)}, module); + {cudaq::cg::getLLVMQubitType(ctx), cudaq::cg::getLLVMQubitType(ctx)}, + module); // Add measure_body as it has a different signature than measure. cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRMeasureBody, LLVM::LLVMVoidType::get(ctx), - {cudaq::opt::getQubitType(ctx), cudaq::opt::getResultType(ctx)}, + {cudaq::cg::getLLVMQubitType(ctx), cudaq::cg::getLLVMResultType(ctx)}, module); cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::qir0_1::ReadResultBody, IntegerType::get(ctx, 1), - {cudaq::opt::getResultType(ctx)}, module); + {cudaq::cg::getLLVMResultType(ctx)}, module); // Add record functions for any measurements. 
cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRRecordOutput, LLVM::LLVMVoidType::get(ctx), - {cudaq::opt::getResultType(ctx), cudaq::opt::getCharPointerType(ctx)}, + {cudaq::cg::getLLVMResultType(ctx), + cudaq::cg::getLLVMCharPointerType(ctx)}, module); // Add functions `__quantum__qis__*__body` for all functions matching diff --git a/lib/Optimizer/CodeGen/DelayMeasurements.cpp b/lib/Optimizer/CodeGen/DelayMeasurements.cpp index 1760e115484..1883c3b429e 100644 --- a/lib/Optimizer/CodeGen/DelayMeasurements.cpp +++ b/lib/Optimizer/CodeGen/DelayMeasurements.cpp @@ -8,7 +8,6 @@ #include "PassDetails.h" #include "cudaq/Optimizer/CodeGen/Passes.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" namespace cudaq::opt { #define GEN_PASS_DEF_DELAYMEASUREMENTS diff --git a/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp b/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp index 981a45b0f48..a2275f66698 100644 --- a/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp +++ b/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp @@ -8,15 +8,14 @@ #include "PassDetails.h" #include "cudaq/Optimizer/CodeGen/Passes.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" - -#define DEBUG_TYPE "eliminate-dead-heap-copy" namespace cudaq::opt { #define GEN_PASS_DEF_ELIMINATEDEADHEAPCOPY #include "cudaq/Optimizer/CodeGen/Passes.h.inc" } // namespace cudaq::opt +#define DEBUG_TYPE "eliminate-dead-heap-copy" + using namespace mlir; namespace { diff --git a/lib/Optimizer/CodeGen/OptUtils.cpp b/lib/Optimizer/CodeGen/OptUtils.cpp index 74ba16ec161..a8f2674dcce 100644 --- a/lib/Optimizer/CodeGen/OptUtils.cpp +++ b/lib/Optimizer/CodeGen/OptUtils.cpp @@ -18,13 +18,11 @@ //===----------------------------------------------------------------------===// #include "cudaq/Optimizer/CodeGen/OptUtils.h" - #include "llvm/IR/Module.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/Error.h" #include "llvm/Target/TargetMachine.h" -#include using 
namespace llvm; @@ -54,6 +52,7 @@ static std::optional mapToLevel(unsigned optLevel, } return std::nullopt; } + // Create and return a lambda that uses LLVM pass manager builder to set up // optimizations based on the given level. std::function diff --git a/lib/Optimizer/CodeGen/PassDetails.h b/lib/Optimizer/CodeGen/PassDetails.h index e0fb0d4e4fc..038736d792f 100644 --- a/lib/Optimizer/CodeGen/PassDetails.h +++ b/lib/Optimizer/CodeGen/PassDetails.h @@ -10,9 +10,15 @@ #include "cudaq/Optimizer/CodeGen/CodeGenDialect.h" #include "cudaq/Optimizer/Dialect/CC/CCDialect.h" +#include "cudaq/Optimizer/Dialect/CC/CCOps.h" +#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" +#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenACC/OpenACC.h" @@ -22,7 +28,7 @@ namespace cudaq::opt { -#define GEN_PASS_CLASSES -#include "cudaq/Optimizer/CodeGen/Passes.h.inc" +// Note: Individual pass implementations should define their specific pass +// using #define GEN_PASS_DEF_ before including Passes.h.inc } // namespace cudaq::opt diff --git a/lib/Optimizer/CodeGen/Passes.cpp b/lib/Optimizer/CodeGen/Passes.cpp index 8ff6c53c2d1..1bdf2f24363 100644 --- a/lib/Optimizer/CodeGen/Passes.cpp +++ b/lib/Optimizer/CodeGen/Passes.cpp @@ -15,88 +15,72 @@ using namespace mlir; static void addAnyonPPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "z(1)", }; - BasisConversionOptions options; - options.basis = basis; 
pm.addPass(createBasisConversion(options)); } static void addAnyonCPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } static void addOQCPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { // TODO: make this our native gate set "h", "s", "t", "r1", "rx", "ry", "rz", "x", "y", "z", "x(1)", }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } static void addQCIPipeline(OpPassManager &pm) { - using namespace cudaq::opt; // Note: QCI's basis gate set is "sx", "rz", "cz", but QCI currently has // a transpiler converting all other gates to that basis. // We use the gate set below so we can translate all gates to QIR. 
- std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } static void addQuantinuumPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } static void addIQMPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "phased_rx", "z(1)", }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } static void addIonQPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", // TODO set to ms, gpi, gpi2 }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } static void addFermioniqPipeline(OpPassManager &pm) { - using namespace cudaq::opt; - std::string basis[] = { + cudaq::opt::BasisConversionOptions options; + options.basis = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", }; - BasisConversionOptions options; - options.basis = basis; pm.addPass(createBasisConversion(options)); } diff --git a/lib/Optimizer/CodeGen/PeepholePatterns.inc b/lib/Optimizer/CodeGen/PeepholePatterns.inc index 3e408af8375..67beb82e772 100644 --- a/lib/Optimizer/CodeGen/PeepholePatterns.inc +++ b/lib/Optimizer/CodeGen/PeepholePatterns.inc @@ -27,9 +27,10 @@ struct XCtrlOneTargetToCNot : public OpRewritePattern { return failure(); auto *ctx = rewriter.getContext(); auto funcSymbol = 
FlatSymbolRefAttr::get(ctx, cudaq::opt::QIRCnot); - rewriter.replaceOpWithNewOp( - call, TypeRange{}, funcSymbol, args.drop_front(2), - call.getFastmathFlagsAttr(), call.getBranchWeightsAttr()); + LLVM::CallOp::Properties properties = call.getProperties(); + properties.setCallee(funcSymbol); + rewriter.replaceOpWithNewOp(call, TypeRange{}, + args.drop_front(2), properties); return success(); } }; @@ -69,13 +70,14 @@ struct CalleeConv : public OpRewritePattern { if (!callee) return failure(); if (!needsToBeRenamed(*callee) || - callee->startswith(cudaq::opt::QIRMeasure)) + callee->starts_with(cudaq::opt::QIRMeasure)) return failure(); auto *ctx = rewriter.getContext(); auto symbol = FlatSymbolRefAttr::get(ctx, callee->str() + "__body"); - rewriter.replaceOpWithNewOp( - call, TypeRange{}, symbol, call.getOperands(), - call.getFastmathFlagsAttr(), call.getBranchWeightsAttr()); + LLVM::CallOp::Properties properties = call.getProperties(); + properties.setCallee(symbol); + rewriter.replaceOpWithNewOp(call, TypeRange{}, + call.getOperands(), properties); return success(); } }; @@ -119,7 +121,7 @@ struct EraseArrayAlloc : public OpRewritePattern { return failure(); auto *ctx = rewriter.getContext(); rewriter.replaceOpWithNewOp(call, - cudaq::opt::getArrayType(ctx)); + cudaq::cg::getLLVMArrayType(ctx)); return success(); } }; @@ -224,7 +226,7 @@ struct LoadMeasureResult : public OpRewritePattern { if (bitcast.getType() != cudaq::opt::factory::getPointerType(IntegerType::get(ctx, 1))) return failure(); - if (inttoptr.getType() != cudaq::opt::getResultType(ctx)) + if (inttoptr.getType() != cudaq::cg::getLLVMResultType(ctx)) return failure(); if (!isa(conint.getValue())) return failure(); diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp index 377b52b7797..29efd35699f 100644 --- a/lib/Optimizer/CodeGen/Pipelines.cpp +++ b/lib/Optimizer/CodeGen/Pipelines.cpp @@ -162,7 +162,9 @@ void cudaq::opt::createPipelineTransformsForPythonToOpenQASM( 
pm.addNestedPass(createCSEPass()); pm.addNestedPass(createMultiControlDecomposition()); pm.addPass(createDecomposition( - {.basis = {"h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)"}})); + {.basis = {"h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)"}, + .disabledPatterns = {}, + .enabledPatterns = {}})); pm.addPass(createQuakeToCCPrep()); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createExpandControlVeqs()); diff --git a/lib/Optimizer/CodeGen/QirInsertArrayRecord.cpp b/lib/Optimizer/CodeGen/QirInsertArrayRecord.cpp index 77e4f5b77ff..1c7d19ae0c5 100644 --- a/lib/Optimizer/CodeGen/QirInsertArrayRecord.cpp +++ b/lib/Optimizer/CodeGen/QirInsertArrayRecord.cpp @@ -43,14 +43,14 @@ static LogicalResult insertArrayRecordingCall(OpBuilder &builder, std::string labelStr = "array"; auto strLitTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get( builder.getContext(), builder.getI8Type(), labelStr.size() + 1)); - Value lit = builder.create( - loc, strLitTy, builder.getStringAttr(labelStr)); + Value lit = cudaq::cc::CreateStringLiteralOp::create( + builder, loc, strLitTy, builder.getStringAttr(labelStr)); auto i8PtrTy = cudaq::cc::PointerType::get(builder.getI8Type()); - Value label = builder.create(loc, i8PtrTy, lit); - Value size = builder.create(loc, resultCount, 64); - builder.create(loc, TypeRange{}, - cudaq::opt::QIRArrayRecordOutput, - ArrayRef{size, label}); + Value label = cudaq::cc::CastOp::create(builder, loc, i8PtrTy, lit); + Value size = arith::ConstantIntOp::create(builder, loc, resultCount, 64); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::opt::QIRArrayRecordOutput, + ArrayRef{size, label}); return success(); } diff --git a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp index e9503b31559..e3d2308eaea 100644 --- a/lib/Optimizer/CodeGen/QuakeToCodegen.cpp +++ b/lib/Optimizer/CodeGen/QuakeToCodegen.cpp @@ -8,14 +8,12 @@ #include "QuakeToCodegen.h" #include "CodeGenOps.h" 
+#include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" -#include "mlir/Dialect/Complex/IR/Complex.h" using namespace mlir; @@ -55,11 +53,11 @@ class ExpandComplexCast : public OpRewritePattern { return failure(); auto loc = castOp.getLoc(); auto ty = cast(castOp.getValue().getType()).getElementType(); - Value rePart = rewriter.create(loc, ty, castOp.getValue()); - Value imPart = rewriter.create(loc, ty, castOp.getValue()); + Value rePart = complex::ReOp::create(rewriter, loc, ty, castOp.getValue()); + Value imPart = complex::ImOp::create(rewriter, loc, ty, castOp.getValue()); auto eleTy = complexTy.getElementType(); - auto reCast = rewriter.create(loc, eleTy, rePart); - auto imCast = rewriter.create(loc, eleTy, imPart); + auto reCast = cudaq::cc::CastOp::create(rewriter, loc, eleTy, rePart); + auto imCast = cudaq::cc::CastOp::create(rewriter, loc, eleTy, imPart); rewriter.replaceOpWithNewOp(castOp, complexTy, reCast, imCast); return success(); @@ -108,7 +106,7 @@ class CreateStateOpPattern : public OpRewritePattern { auto stateTy = quake::StateType::get(ctx); auto statePtrTy = cudaq::cc::PointerType::get(stateTy); auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); - auto cast = rewriter.create(loc, i8PtrTy, buffer); + auto cast = cudaq::cc::CastOp::create(rewriter, loc, i8PtrTy, buffer); rewriter.replaceOpWithNewOp( createStateOp, statePtrTy, createStateFunc, ValueRange{cast, size}); @@ -130,7 +128,7 @@ class DeleteStateOpPattern : public OpRewritePattern { auto result = irBuilder.loadIntrinsic(module, cudaq::deleteCudaqState); assert(succeeded(result) && "loading intrinsic should never fail"); - 
rewriter.replaceOpWithNewOp(deleteStateOp, std::nullopt, + rewriter.replaceOpWithNewOp(deleteStateOp, mlir::TypeRange{}, cudaq::deleteCudaqState, mlir::ValueRange{state}); return success(); diff --git a/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp b/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp index 8dbeac4659c..8c8dc564234 100644 --- a/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp +++ b/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp @@ -7,11 +7,10 @@ ******************************************************************************/ #include "cudaq/Optimizer/CodeGen/QuakeToExecMgr.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/CodeGen/CudaqFunctionNames.h" #include "cudaq/Optimizer/CodeGen/Passes.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" @@ -40,47 +39,49 @@ static Value packQubitSpans(Location loc, ConversionPatternRewriter &rewriter, auto qspanTy = cudaq::opt::getCudaqQubitSpanType(rewriter.getContext()); Value newspan; if (operands.empty()) { - newspan = rewriter.create(loc, qspanTy); - auto zero = rewriter.create(loc, 0, 64); - auto nullPtrVal = rewriter.create( - loc, cudaq::opt::getCudaqQubitType(rewriter.getContext()), zero); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMWriteToSpan, - ValueRange{newspan, nullPtrVal, zero}); + newspan = cudaq::cc::AllocaOp::create(rewriter, loc, qspanTy); + auto zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + auto nullPtrVal = cudaq::cc::CastOp::create( + rewriter, loc, cudaq::opt::getCudaqQubitType(rewriter.getContext()), + zero); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMWriteToSpan, + ValueRange{newspan, nullPtrVal, zero}); } else if (operands.size() == 1) { // Nothing to concatenate in this case. 
newspan = operands[0]; } else { - newspan = rewriter.create(loc, qspanTy); + newspan = cudaq::cc::AllocaOp::create(rewriter, loc, qspanTy); // Loop over all arguments and count the number of qubits. - Value zero = rewriter.create(loc, 0, 64); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); Value sum = zero; auto i64Ty = rewriter.getI64Type(); auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); for (auto v : operands) { - auto sizePtr = rewriter.create( - loc, ptrI64Ty, v, ArrayRef{1}); - auto size = rewriter.create(loc, sizePtr); - sum = rewriter.create(loc, sum, size); + auto sizePtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrI64Ty, v, ArrayRef{1}); + auto size = cudaq::cc::LoadOp::create(rewriter, loc, sizePtr); + sum = arith::AddIOp::create(rewriter, loc, sum, size); } // Allocate a fresh buffer. - auto newBuffer = rewriter.create(loc, i64Ty, sum); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMWriteToSpan, - ValueRange{newspan, newBuffer, sum}); + auto newBuffer = cudaq::cc::AllocaOp::create(rewriter, loc, i64Ty, sum); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMWriteToSpan, + ValueRange{newspan, newBuffer, sum}); // Copy the i64 values to the new buffer. 
sum = zero; Value size = zero; for (auto v : operands) { - auto dest = rewriter.create( - loc, ptrI64Ty, newBuffer, ArrayRef{sum}); - auto sizePtr = rewriter.create( - loc, ptrI64Ty, v, ArrayRef{1}); - size = rewriter.create(loc, sizePtr); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMConcatSpan, - ValueRange{dest, v, size}); - sum = rewriter.create(loc, sum, size); + auto dest = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrI64Ty, newBuffer, + ArrayRef{sum}); + auto sizePtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrI64Ty, v, ArrayRef{1}); + size = cudaq::cc::LoadOp::create(rewriter, loc, sizePtr); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMConcatSpan, + ValueRange{dest, v, size}); + sum = arith::AddIOp::create(rewriter, loc, sum, size); } } return newspan; @@ -107,19 +108,20 @@ class AllocaOpRewrite : public OpConversionPattern { auto loc = alloca.getLoc(); auto i64Ty = rewriter.getI64Type(); auto qspanTy = cudaq::opt::getCudaqQubitSpanType(rewriter.getContext()); - Value qspan = rewriter.create(loc, qspanTy); + Value qspan = cudaq::cc::AllocaOp::create(rewriter, loc, qspanTy); if (auto resultType = dyn_cast(alloca.getType())) { - auto one = rewriter.create(loc, 1, 64); - Value buffer = rewriter.create(loc, i64Ty, one); - auto call = rewriter.create( - loc, i64Ty, cudaq::opt::CudaqEMAllocate, ValueRange{}); + auto one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); + Value buffer = cudaq::cc::AllocaOp::create(rewriter, loc, i64Ty, one); + auto call = func::CallOp::create( + rewriter, loc, i64Ty, cudaq::opt::CudaqEMAllocate, ValueRange{}); auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); - auto toAddr = rewriter.create( - loc, ptrI64Ty, buffer, ArrayRef{0}); - rewriter.create(loc, call.getResult(0), toAddr); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMWriteToSpan, - ValueRange{qspan, buffer, one}); + auto toAddr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrI64Ty, 
buffer, + ArrayRef{0}); + cudaq::cc::StoreOp::create(rewriter, loc, call.getResult(0), toAddr); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMWriteToSpan, + ValueRange{qspan, buffer, one}); } else { Value sizeOperand; if (adaptor.getOperands().empty()) { @@ -127,25 +129,26 @@ class AllocaOpRewrite : public OpConversionPattern { assert(type.hasSpecifiedSize() && "veq must have a constant size"); auto constantSize = type.getSize(); sizeOperand = - rewriter.create(loc, constantSize, 64); + arith::ConstantIntOp::create(rewriter, loc, constantSize, 64); } else if (auto intSizeTy = dyn_cast(adaptor.getSize().getType())) { sizeOperand = adaptor.getSize(); if (intSizeTy.getWidth() != 64) - sizeOperand = rewriter.create( - loc, i64Ty, sizeOperand, cudaq::cc::CastOpMode::Unsigned); + sizeOperand = + cudaq::cc::CastOp::create(rewriter, loc, i64Ty, sizeOperand, + cudaq::cc::CastOpMode::Unsigned); } if (!sizeOperand) return failure(); Value buffer = - rewriter.create(loc, i64Ty, sizeOperand); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMWriteToSpan, - ValueRange{qspan, buffer, sizeOperand}); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMAllocateVeq, - ValueRange{qspan, sizeOperand}); + cudaq::cc::AllocaOp::create(rewriter, loc, i64Ty, sizeOperand); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMWriteToSpan, + ValueRange{qspan, buffer, sizeOperand}); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMAllocateVeq, + ValueRange{qspan, sizeOperand}); } rewriter.replaceOp(alloca, qspan); return success(); @@ -160,7 +163,7 @@ class DeallocOpRewrite : public OpConversionPattern { matchAndRewrite(quake::DeallocOp dealloc, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( - dealloc, std::nullopt, cudaq::opt::CudaqEMReturn, + dealloc, mlir::TypeRange{}, cudaq::opt::CudaqEMReturn, ValueRange{adaptor.getReference()}); return 
success(); } @@ -205,8 +208,8 @@ class ExtractRefOpRewrite : public OpConversionPattern { auto loc = extract.getLoc(); auto offset = [&]() -> Value { if (extract.hasConstantIndex()) - return rewriter.create( - loc, extract.getConstantIndex(), 64); + return arith::ConstantIntOp::create(rewriter, loc, + extract.getConstantIndex(), 64); return adaptor.getIndex(); }(); @@ -218,18 +221,19 @@ class ExtractRefOpRewrite : public OpConversionPattern { auto ptrptrTy = cudaq::cc::PointerType::get(ptrArrTy); auto qspan = adaptor.getVeq(); - auto qspanDataPtr = rewriter.create( - loc, ptrptrTy, qspan, ArrayRef{0}); - auto qspanData = rewriter.create(loc, qspanDataPtr); - auto buffer = rewriter.create( - loc, ptrI64Ty, qspanData, ArrayRef{offset}); + auto qspanDataPtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrptrTy, qspan, ArrayRef{0}); + auto qspanData = cudaq::cc::LoadOp::create(rewriter, loc, qspanDataPtr); + auto buffer = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrI64Ty, qspanData, + ArrayRef{offset}); auto qspanTy = cudaq::opt::getCudaqQubitSpanType(rewriter.getContext()); - Value newspan = rewriter.create(loc, qspanTy); - auto one = rewriter.create(loc, 1, 64); - auto buf1 = rewriter.create(loc, ptrArrTy, buffer); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMWriteToSpan, - ValueRange{newspan, buf1, one}); + Value newspan = cudaq::cc::AllocaOp::create(rewriter, loc, qspanTy); + auto one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); + auto buf1 = cudaq::cc::CastOp::create(rewriter, loc, ptrArrTy, buffer); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMWriteToSpan, + ValueRange{newspan, buf1, one}); rewriter.replaceOp(extract, newspan); return success(); } @@ -248,35 +252,37 @@ class SubveqOpRewrite : public OpConversionPattern { auto loc = subveq.getLoc(); auto up = [&]() -> Value { if (!adaptor.getUpper()) - return rewriter.create(loc, adaptor.getRawUpper(), - 64); + return 
arith::ConstantIntOp::create(rewriter, loc, + adaptor.getRawUpper(), 64); return adaptor.getUpper(); }(); auto lo = [&]() -> Value { if (!adaptor.getLower()) - return rewriter.create(loc, adaptor.getRawLower(), - 64); + return arith::ConstantIntOp::create(rewriter, loc, + adaptor.getRawLower(), 64); return adaptor.getLower(); }(); - auto diff = rewriter.create(loc, up, lo); - auto one = rewriter.create(loc, 1, 64); - auto length = rewriter.create(loc, diff, one); + auto diff = arith::SubIOp::create(rewriter, loc, up, lo); + auto one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); + auto length = arith::AddIOp::create(rewriter, loc, diff, one); // Compute the pointer to the first element in the subveq and build a new // array type. auto i64Ty = rewriter.getI64Type(); auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); auto ptrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i64Ty)); auto ptrptrTy = cudaq::cc::PointerType::get(ptrTy); - auto qspanDataPtr = rewriter.create( - loc, ptrptrTy, adaptor.getVeq(), ArrayRef{0}); - auto qspanData = rewriter.create(loc, qspanDataPtr); - auto buffer = rewriter.create( - loc, ptrI64Ty, qspanData, ArrayRef{lo}); + auto qspanDataPtr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrptrTy, adaptor.getVeq(), + ArrayRef{0}); + auto qspanData = cudaq::cc::LoadOp::create(rewriter, loc, qspanDataPtr); + auto buffer = + cudaq::cc::ComputePtrOp::create(rewriter, loc, ptrI64Ty, qspanData, + ArrayRef{lo}); auto qspanTy = cudaq::opt::getCudaqQubitSpanType(rewriter.getContext()); - Value newspan = rewriter.create(loc, qspanTy); - rewriter.create(loc, std::nullopt, - cudaq::opt::CudaqEMWriteToSpan, - ValueRange{newspan, buffer, length}); + Value newspan = cudaq::cc::AllocaOp::create(rewriter, loc, qspanTy); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::CudaqEMWriteToSpan, + ValueRange{newspan, buffer, length}); rewriter.replaceOp(subveq, newspan); return success(); } @@ -289,8 +295,9 @@ class 
ResetRewrite : public OpConversionPattern { LogicalResult matchAndRewrite(quake::ResetOp resetOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp( - resetOp, std::nullopt, cudaq::opt::CudaqEMReset, adaptor.getOperands()); + rewriter.replaceOpWithNewOp(resetOp, mlir::TypeRange{}, + cudaq::opt::CudaqEMReset, + adaptor.getOperands()); return success(); } }; @@ -311,27 +318,30 @@ class GenericRewrite : public OpConversionPattern { auto i8Ty = rewriter.getI8Type(); auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); auto regTy = cudaq::cc::PointerType::get(opName.getType()); - auto addr = rewriter.create(loc, regTy, - opName.getSymName()); - auto opString = rewriter.create(loc, ptrI8Ty, addr); + auto addr = cudaq::cc::AddressOfOp::create(rewriter, loc, regTy, + opName.getSymName()); + auto opString = cudaq::cc::CastOp::create(rewriter, loc, ptrI8Ty, addr); auto paramSize = adaptor.getParameters().size(); - Value numParams = rewriter.create(loc, paramSize, 64); + Value numParams = + arith::ConstantIntOp::create(rewriter, loc, paramSize, 64); auto f64Ty = rewriter.getF64Type(); auto arrF64Ty = cudaq::cc::ArrayType::get(f64Ty); auto ptrParamTy = cudaq::cc::PointerType::get(arrF64Ty); auto ptrF64Ty = cudaq::cc::PointerType::get(f64Ty); auto params = [&]() -> Value { if (paramSize == 0) { - auto zero = rewriter.create(loc, paramSize, 64); - return rewriter.create(loc, ptrParamTy, zero); + auto zero = arith::ConstantIntOp::create(rewriter, loc, paramSize, 64); + return cudaq::cc::CastOp::create(rewriter, loc, ptrParamTy, zero); } - auto buffer = rewriter.create(loc, f64Ty, numParams); + auto buffer = + cudaq::cc::AllocaOp::create(rewriter, loc, f64Ty, numParams); for (auto iter : llvm::enumerate(adaptor.getParameters())) { std::int32_t i = iter.index(); auto p = iter.value(); - auto ptr = rewriter.create( - loc, ptrF64Ty, buffer, ArrayRef{i}); - rewriter.create(loc, p, ptr); + auto ptr = cudaq::cc::ComputePtrOp::create( + 
rewriter, loc, ptrF64Ty, buffer, + ArrayRef{i}); + cudaq::cc::StoreOp::create(rewriter, loc, p, ptr); } return buffer; }(); @@ -339,11 +349,11 @@ class GenericRewrite : public OpConversionPattern { auto targets = packQubitSpans(loc, rewriter, adaptor.getTargets()); auto isAdj = [&]() -> Value { if (qop.isAdj()) - return rewriter.create(loc, 1, 1); - return rewriter.create(loc, 0, 1); + return arith::ConstantIntOp::create(rewriter, loc, 1, 1); + return arith::ConstantIntOp::create(rewriter, loc, 0, 1); }(); rewriter.template replaceOpWithNewOp( - qop, std::nullopt, cudaq::opt::CudaqEMApply, + qop, mlir::TypeRange{}, cudaq::opt::CudaqEMApply, ValueRange{opString, numParams, params, controls, targets, isAdj}); return success(); } @@ -392,9 +402,9 @@ class MzOpRewrite : public OpConversionPattern { auto i8Ty = rewriter.getI8Type(); auto ptrI8Ty = cudaq::cc::PointerType::get(i8Ty); auto regTy = cudaq::cc::PointerType::get(regName.getType()); - auto addr = rewriter.create(loc, regTy, - regName.getSymName()); - auto nameAddr = rewriter.create(loc, ptrI8Ty, addr); + auto addr = cudaq::cc::AddressOfOp::create(rewriter, loc, regTy, + regName.getSymName()); + auto nameAddr = cudaq::cc::CastOp::create(rewriter, loc, ptrI8Ty, addr); auto i32Ty = rewriter.getI32Type(); rewriter.replaceOpWithNewOp( mzOp, i32Ty, cudaq::opt::CudaqEMMeasure, @@ -410,7 +420,7 @@ class MxToMzRewrite : public OpRewritePattern { LogicalResult matchAndRewrite(quake::MxOp mx, PatternRewriter &rewriter) const override { - rewriter.create(mx.getLoc(), mx.getTargets()); + quake::HOp::create(rewriter, mx.getLoc(), mx.getTargets()); rewriter.replaceOpWithNewOp( mx, mx.getResultTypes(), mx.getTargets(), mx.getRegisterNameAttr()); return success(); @@ -424,9 +434,9 @@ class MyToMzRewrite : public OpRewritePattern { LogicalResult matchAndRewrite(quake::MyOp my, PatternRewriter &rewriter) const override { - rewriter.create(my.getLoc(), true, ValueRange{}, ValueRange{}, - my.getTargets()); - 
rewriter.create(my.getLoc(), my.getTargets()); + quake::SOp::create(rewriter, my.getLoc(), true, ValueRange{}, ValueRange{}, + my.getTargets()); + quake::HOp::create(rewriter, my.getLoc(), my.getTargets()); rewriter.replaceOpWithNewOp( my, my.getResultTypes(), my.getTargets(), my.getRegisterNameAttr()); return success(); @@ -443,8 +453,9 @@ class VeqSizeOpRewrite : public OpConversionPattern { auto loc = vecsize->getLoc(); auto i64Ty = rewriter.getI64Type(); auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); - auto sizeptr = rewriter.create( - loc, ptrI64Ty, adaptor.getVeq(), ArrayRef{1}); + auto sizeptr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrI64Ty, adaptor.getVeq(), + ArrayRef{1}); rewriter.replaceOpWithNewOp(vecsize, sizeptr); return success(); } diff --git a/lib/Optimizer/CodeGen/QuakeToLLVM.cpp b/lib/Optimizer/CodeGen/QuakeToLLVM.cpp index 32d845d2b6a..43ae3e03a73 100644 --- a/lib/Optimizer/CodeGen/QuakeToLLVM.cpp +++ b/lib/Optimizer/CodeGen/QuakeToLLVM.cpp @@ -8,14 +8,13 @@ #include "cudaq/Optimizer/CodeGen/QuakeToLLVM.h" #include "CodeGenOps.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/Builder/Runtime.h" #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h" #include "cudaq/Optimizer/CodeGen/QuakeToExecMgr.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" @@ -46,19 +45,19 @@ class AllocaOpRewrite : public ConvertOpToLLVMPattern { dyn_cast_if_present(alloca.getResult().getType())) { StringRef qirQubitAllocate = cudaq::opt::QIRQubitAllocate; - auto qubitType = cudaq::opt::getQubitType(context); + auto qubitType = cudaq::cg::getLLVMQubitType(context); FlatSymbolRefAttr symbolRef = 
cudaq::opt::factory::createLLVMFunctionSymbol( qirQubitAllocate, qubitType, {}, parentModule); rewriter.replaceOpWithNewOp(alloca, qubitType, symbolRef, - std::nullopt); + ValueRange{}); return success(); } // Create a QIR call to allocate the qubits. StringRef qir_qubit_array_allocate = cudaq::opt::QIRArrayQubitAllocateArray; - auto array_qbit_type = cudaq::opt::getArrayType(context); + auto array_qbit_type = cudaq::cg::getLLVMArrayType(context); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qir_qubit_array_allocate, array_qbit_type, {rewriter.getI64Type()}, parentModule); @@ -70,13 +69,12 @@ class AllocaOpRewrite : public ConvertOpToLLVMPattern { auto type = cast(alloca.getResult().getType()); auto constantSize = type.getSize(); sizeOperand = - rewriter.create(loc, constantSize, 64); + arith::ConstantIntOp::create(rewriter, loc, constantSize, 64); } else { sizeOperand = adaptor.getOperands().front(); - if (cast(sizeOperand.getType()).getWidth() < 64) { - sizeOperand = rewriter.create(loc, rewriter.getI64Type(), - sizeOperand); - } + if (cast(sizeOperand.getType()).getWidth() < 64) + sizeOperand = LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), + sizeOperand); } // Replace the AllocaOp with the QIR call. 
@@ -98,7 +96,7 @@ class QmemRAIIOpRewrite ConversionPatternRewriter &rewriter) const override { auto loc = raii->getLoc(); auto parentModule = raii->getParentOfType(); - auto array_qbit_type = cudaq::opt::getArrayType(rewriter.getContext()); + auto array_qbit_type = cudaq::cg::getLLVMArrayType(rewriter.getContext()); // Get the CC Pointer for the state auto ccState = adaptor.getInitState(); @@ -139,14 +137,14 @@ class QmemRAIIOpRewrite sizeOperand = allocSize; auto sizeTy = cast(sizeOperand.getType()); if (sizeTy.getWidth() < 64) - sizeOperand = rewriter.create(loc, i64Ty, sizeOperand); + sizeOperand = LLVM::ZExtOp::create(rewriter, loc, i64Ty, sizeOperand); else if (sizeTy.getWidth() > 64) - sizeOperand = rewriter.create(loc, i64Ty, sizeOperand); + sizeOperand = LLVM::TruncOp::create(rewriter, loc, i64Ty, sizeOperand); } else { auto type = cast(allocTy); auto constantSize = type.getSize(); sizeOperand = - rewriter.create(loc, constantSize, 64); + arith::ConstantIntOp::create(rewriter, loc, constantSize, 64); } // Create QIR allocation with initializer function. 
@@ -159,7 +157,7 @@ class QmemRAIIOpRewrite // Call the allocation function Value castedInitState = - rewriter.create(loc, ptrTy, ccState); + LLVM::BitcastOp::create(rewriter, loc, ptrTy, ccState); rewriter.replaceOpWithNewOp( raii, array_qbit_type, raiiSymbolRef, ArrayRef{sizeOperand, castedInitState}); @@ -185,10 +183,10 @@ class DeallocOpRewrite : public ConvertOpToLLVMPattern { Type operandType, qType = dealloc.getOperand().getType(); if (isa(qType)) { qirQuantumDeallocateFunc = cudaq::opt::QIRArrayQubitReleaseArray; - operandType = cudaq::opt::getArrayType(context); + operandType = cudaq::cg::getLLVMArrayType(context); } else { qirQuantumDeallocateFunc = cudaq::opt::QIRArrayQubitReleaseQubit; - operandType = cudaq::opt::getQubitType(context); + operandType = cudaq::cg::getLLVMQubitType(context); } FlatSymbolRefAttr deallocSymbolRef = @@ -220,7 +218,7 @@ class ConcatOpRewrite : public ConvertOpToLLVMPattern { auto parentModule = concat->getParentOfType(); auto context = parentModule->getContext(); - auto arrType = cudaq::opt::getArrayType(context); + auto arrType = cudaq::cg::getLLVMArrayType(context); auto loc = concat.getLoc(); StringRef qirArrayConcatName = cudaq::opt::QIRArrayConcatArray; @@ -233,7 +231,7 @@ class ConcatOpRewrite : public ConvertOpToLLVMPattern { return success(); } - auto qirArrayTy = cudaq::opt::getArrayType(context); + auto qirArrayTy = cudaq::cg::getLLVMArrayType(context); auto i8PtrTy = cudaq::opt::factory::getPointerType(context); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRArrayCreateArray, qirArrayTy, @@ -242,23 +240,24 @@ class ConcatOpRewrite : public ConvertOpToLLVMPattern { cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRArrayGetElementPtr1d, i8PtrTy, {qirArrayTy, rewriter.getIntegerType(64)}, parentModule); - Value zero = rewriter.create(loc, 0, 64); - Value one = rewriter.create(loc, 1, 64); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + Value 
one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); // FIXME: 8 bytes is assumed to be the sizeof(char*) on the target machine. - Value eight = rewriter.create(loc, 8, 32); + Value eight = arith::ConstantIntOp::create(rewriter, loc, 8, 32); // Function to convert a QIR Qubit value to an Array value. auto wrapQubitInArray = [&](Value v) -> Value { - if (v.getType() != cudaq::opt::getQubitType(context)) + if (v.getType() != cudaq::cg::getLLVMQubitType(context)) return v; - auto createCall = rewriter.create( - loc, qirArrayTy, symbolRef, ArrayRef{eight, one}); + auto createCall = LLVM::CallOp::create( + rewriter, loc, qirArrayTy, symbolRef, ArrayRef{eight, one}); auto result = createCall.getResult(); - auto call = rewriter.create(loc, i8PtrTy, getSymbolRef, - ArrayRef{result, zero}); - Value pointer = rewriter.create( - loc, cudaq::opt::factory::getPointerType(i8PtrTy), call.getResult()); - auto cast = rewriter.create(loc, i8PtrTy, v); - rewriter.create(loc, cast, pointer); + auto call = LLVM::CallOp::create(rewriter, loc, i8PtrTy, getSymbolRef, + ArrayRef{result, zero}); + Value pointer = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(i8PtrTy), + call.getResult()); + auto cast = LLVM::BitcastOp::create(rewriter, loc, i8PtrTy, v); + LLVM::StoreOp::create(rewriter, loc, cast, pointer); return result; }; @@ -267,8 +266,8 @@ class ConcatOpRewrite : public ConvertOpToLLVMPattern { auto frontArr = wrapQubitInArray(adaptor.getOperands().front()); for (auto oper : adaptor.getOperands().drop_front(1)) { auto backArr = wrapQubitInArray(oper); - auto glue = rewriter.create( - loc, qirArrayTy, concatFunc, ArrayRef{frontArr, backArr}); + auto glue = LLVM::CallOp::create(rewriter, loc, qirArrayTy, concatFunc, + ArrayRef{frontArr, backArr}); frontArr = glue.getResult(); } rewriter.replaceOp(concat, frontArr); @@ -305,9 +304,8 @@ class ExtractQubitOpRewrite auto qir_array_get_element_ptr_1d = cudaq::opt::QIRArrayGetElementPtr1d; - auto 
array_qbit_type = cudaq::opt::getArrayType(context); - auto qbit_element_ptr_type = - LLVM::LLVMPointerType::get(rewriter.getI8Type()); + auto array_qbit_type = cudaq::cg::getLLVMArrayType(context); + auto qbit_element_ptr_type = cudaq::opt::factory::getPointerType(context); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qir_array_get_element_ptr_1d, qbit_element_ptr_type, @@ -316,25 +314,25 @@ class ExtractQubitOpRewrite Value idx_operand; auto i64Ty = rewriter.getI64Type(); if (extract.hasConstantIndex()) { - idx_operand = rewriter.create( - loc, extract.getConstantIndex(), i64Ty); + idx_operand = arith::ConstantIntOp::create( + rewriter, loc, extract.getConstantIndex(), 64); } else { idx_operand = adaptor.getOperands()[1]; if (idx_operand.getType().isIntOrFloat() && cast(idx_operand.getType()).getWidth() < 64) - idx_operand = rewriter.create(loc, i64Ty, idx_operand); + idx_operand = LLVM::ZExtOp::create(rewriter, loc, i64Ty, idx_operand); } - auto get_qbit_qir_call = rewriter.create( - loc, qbit_element_ptr_type, symbolRef, + auto get_qbit_qir_call = LLVM::CallOp::create( + rewriter, loc, qbit_element_ptr_type, symbolRef, llvm::ArrayRef({adaptor.getOperands().front(), idx_operand})); - auto bitcast = rewriter.create( - loc, LLVM::LLVMPointerType::get(cudaq::opt::getQubitType(context)), + auto bitcast = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), get_qbit_qir_call.getResult()); rewriter.replaceOpWithNewOp( - extract, cudaq::opt::getQubitType(context), bitcast.getResult()); + extract, cudaq::cg::getLLVMQubitType(context), bitcast.getResult()); return success(); } }; @@ -364,11 +362,12 @@ class MakeStruqOpPattern : public ConvertOpToLLVMPattern { auto loc = mkStruq.getLoc(); auto *ctx = rewriter.getContext(); auto toTy = getTypeConverter()->convertType(mkStruq.getType()); - Value result = rewriter.create(loc, toTy); + Value result = LLVM::UndefOp::create(rewriter, loc, toTy); std::int64_t 
count = 0; for (auto op : adaptor.getOperands()) { auto off = DenseI64ArrayAttr::get(ctx, ArrayRef{count}); - result = rewriter.create(loc, toTy, result, op, off); + result = + LLVM::InsertValueOp::create(rewriter, loc, toTy, result, op, off); count++; } rewriter.replaceOp(mkStruq, result); @@ -387,7 +386,7 @@ class SubveqOpRewrite : public ConvertOpToLLVMPattern { auto parentModule = subveq->getParentOfType(); auto *context = parentModule->getContext(); constexpr auto rtSubveqFuncName = cudaq::opt::QIRArraySlice; - auto arrayTy = cudaq::opt::getArrayType(context); + auto arrayTy = cudaq::cg::getLLVMArrayType(context); auto resultTy = arrayTy; auto i32Ty = rewriter.getI32Type(); @@ -398,27 +397,27 @@ class SubveqOpRewrite : public ConvertOpToLLVMPattern { auto lowArg = [&]() -> Value { if (!adaptor.getLower()) - return rewriter.create(loc, adaptor.getRawLower(), - 64); + return arith::ConstantIntOp::create(rewriter, loc, + adaptor.getRawLower(), 64); return adaptor.getLower(); }(); auto highArg = [&]() -> Value { if (!adaptor.getUpper()) - return rewriter.create(loc, adaptor.getRawUpper(), - 64); + return arith::ConstantIntOp::create(rewriter, loc, + adaptor.getRawUpper(), 64); return adaptor.getUpper(); }(); auto extend = [&](Value &v) -> Value { if (isa(v.getType()) && cast(v.getType()).getWidth() < 64) - return rewriter.create(loc, i64Ty, v); + return LLVM::ZExtOp::create(rewriter, loc, i64Ty, v); return v; }; lowArg = extend(lowArg); highArg = extend(highArg); Value inArr = adaptor.getOperands()[0]; - auto one32 = rewriter.create(loc, 1, i32Ty); - auto one64 = rewriter.create(loc, 1, i64Ty); + auto one32 = arith::ConstantIntOp::create(rewriter, loc, 1, 32); + auto one64 = arith::ConstantIntOp::create(rewriter, loc, 1, 64); rewriter.replaceOpWithNewOp( subveq, resultTy, symbolRef, ValueRange{inArr, one32, lowArg, one64, highArg}); @@ -442,7 +441,7 @@ class ResetRewrite : public ConvertOpToLLVMPattern { auto qirFunctionName = cudaq::opt::QIRQISPrefix + instName; 
// Create the qubit pointer type - auto qirQubitPointerType = cudaq::opt::getQubitType(context); + auto qirQubitPointerType = cudaq::cg::getLLVMQubitType(context); // Get the function reference for the reset function auto qirFunctionSymbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( @@ -471,7 +470,7 @@ class ExpPauliRewrite : public ConvertOpToLLVMPattern { auto qirFunctionName = qirQisPrefix + "exp_pauli"; FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qirFunctionName, /*return type=*/LLVM::LLVMVoidType::get(context), - {rewriter.getF64Type(), cudaq::opt::getArrayType(context), + {rewriter.getF64Type(), cudaq::cg::getLLVMArrayType(context), cudaq::opt::factory::getPointerType(context)}, parentModule); SmallVector operands = adaptor.getOperands(); @@ -484,35 +483,27 @@ class ExpPauliRewrite : public ConvertOpToLLVMPattern { auto pauliConst = builder.genCStringLiteralAppendNul( loc, parentModule, *instOp.getPauliLiteral()); // Create a pauli reference and make it the last operand. 
- operands.push_back(rewriter.create( - loc, cudaq::opt::factory::getPointerType(pauliConst.getType()), + operands.push_back(LLVM::AddressOfOp::create( + rewriter, loc, + cudaq::opt::factory::getPointerType(pauliConst.getType()), pauliConst.getSymName())); } auto pauliWord = operands.back(); - if (auto ptrTy = dyn_cast(pauliWord.getType())) { - // Make sure we have the right types to extract the - // length of the string literal - auto ptrEleTy = ptrTy.getElementType(); - auto innerArrTy = dyn_cast(ptrEleTy); - if (!innerArrTy) - return instOp.emitError( - "exp_pauli string literal expected to be ptr."); - - // Get the number of elements in the provided string literal - auto numElements = innerArrTy.getNumElements() - 1; + if (isa(pauliWord.getType())) { + // With opaque pointers we get the string length from the literal + auto numElements = static_cast(instOp.getPauliLiteral()->size()); // Remove the old operand operands.pop_back(); // We must create the {i8*, i64} struct from the string literal - SmallVector structTys{ - LLVM::LLVMPointerType::get(rewriter.getI8Type()), - rewriter.getI64Type()}; + auto ptrTy = cudaq::opt::factory::getPointerType(context); + SmallVector structTys{ptrTy, rewriter.getI64Type()}; auto structTy = LLVM::LLVMStructType::getLiteral(context, structTys); // Allocate the char span struct - Value alloca = cudaq::opt::factory::createLLVMTemporary( - loc, rewriter, LLVM::LLVMPointerType::get(structTy)); + Value alloca = + cudaq::opt::factory::createLLVMTemporary(loc, rewriter, structTy); // We'll need these constants auto zero = cudaq::opt::factory::genLlvmI64Constant(loc, rewriter, 0); @@ -522,22 +513,21 @@ class ExpPauliRewrite : public ConvertOpToLLVMPattern { // Set the string literal data auto charPtrTy = cudaq::opt::factory::getPointerType(context); - auto strPtrTy = LLVM::LLVMPointerType::get(charPtrTy); - auto strPtr = rewriter.create(loc, strPtrTy, alloca, - ValueRange{zero, zero}); + auto strPtr = LLVM::GEPOp::create(rewriter, loc, 
charPtrTy, structTy, + alloca, ValueRange{zero, zero}); auto castedPauli = - rewriter.create(loc, charPtrTy, pauliWord); - rewriter.create(loc, castedPauli, strPtr); + LLVM::BitcastOp::create(rewriter, loc, charPtrTy, pauliWord); + LLVM::StoreOp::create(rewriter, loc, castedPauli, strPtr); // Set the integer length - auto intPtr = rewriter.create( - loc, LLVM::LLVMPointerType::get(rewriter.getI64Type()), alloca, - ValueRange{zero, one}); - rewriter.create(loc, size, intPtr); + auto i64PtrTy = cudaq::opt::factory::getPointerType(context); + auto intPtr = LLVM::GEPOp::create(rewriter, loc, i64PtrTy, structTy, + alloca, ValueRange{zero, one}); + LLVM::StoreOp::create(rewriter, loc, size, intPtr); // Cast to raw opaque pointer auto castedStore = - rewriter.create(loc, charPtrTy, alloca); + LLVM::BitcastOp::create(rewriter, loc, charPtrTy, alloca); operands.push_back(castedStore); rewriter.replaceOpWithNewOp(instOp, TypeRange{}, symbolRef, operands); @@ -548,10 +538,10 @@ class ExpPauliRewrite : public ConvertOpToLLVMPattern { // Allocate a stack slot for it and store what we have to that pointer, // pass the pointer to NVQIR Value alloca = cudaq::opt::factory::createLLVMTemporary( - loc, rewriter, LLVM::LLVMPointerType::get(pauliWord.getType())); - rewriter.create(loc, pauliWord, alloca); - auto castedPauli = rewriter.create( - loc, cudaq::opt::factory::getPointerType(context), alloca); + loc, rewriter, cudaq::opt::factory::getPointerType(context)); + LLVM::StoreOp::create(rewriter, loc, pauliWord, alloca); + auto castedPauli = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), alloca); operands.pop_back(); operands.push_back(castedPauli); rewriter.replaceOpWithNewOp(instOp, TypeRange{}, symbolRef, @@ -588,8 +578,8 @@ class ConvertOpWithControls : public ConvertOpToLLVMPattern { auto qirFunctionName = qirQisPrefix + instName + "__ctl"; // Useful types we'll need - auto qirArrayType = cudaq::opt::getArrayType(context); - auto 
qirQubitPointerType = cudaq::opt::getQubitType(context); + auto qirArrayType = cudaq::cg::getLLVMArrayType(context); + auto qirQubitPointerType = cudaq::cg::getLLVMQubitType(context); auto i64Type = rewriter.getI64Type(); // __quantum__qis__NAME__ctl(Array*, Qubit*) Type @@ -599,9 +589,6 @@ class ConvertOpWithControls : public ConvertOpToLLVMPattern { return failure(); if (numTargetOperands == 2) argTys.push_back(qirQubitPointerType); - auto instOpQISFunctionType = - LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(context), argTys); - // Get the function pointer for the ctrl operation auto qirFunctionSymbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qirFunctionName, LLVM::LLVMVoidType::get(context), argTys, @@ -622,18 +609,20 @@ class ConvertOpWithControls : public ConvertOpToLLVMPattern { // function. FlatSymbolRefAttr applyMultiControlFunction; SmallVector args; - Value ctrlOpPointer = rewriter.create( - loc, LLVM::LLVMPointerType::get(instOpQISFunctionType), + Value ctrlOpPointer = LLVM::AddressOfOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), qirFunctionSymbolRef); Value numControlOperands = - rewriter.create(loc, i64Type, numControls); + arith::ConstantIntOp::create(rewriter, loc, numControls, 64); args.push_back(numControlOperands); // Check if all controls are qubit types, if so retain existing - // functionality. + // functionality. With opaque pointers, both qubit (RefType) and array + // (VeqType) convert to the same !llvm.ptr type, so we must check the + // original quake types to distinguish them. 
auto allControlsAreQubits = [&]() { - for (auto c : adaptor.getControls()) - if (c.getType() != qirQubitPointerType) + for (auto c : instOp.getControls()) + if (!isa(c.getType())) return false; return true; }(); @@ -644,8 +633,8 @@ class ConvertOpWithControls : public ConvertOpToLLVMPattern { applyMultiControlFunction = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::NVQIRInvokeWithControlBits, LLVM::LLVMVoidType::get(context), - {i64Type, LLVM::LLVMPointerType::get(instOpQISFunctionType)}, - parentModule, true); + {i64Type, cudaq::opt::factory::getPointerType(context)}, parentModule, + true); } else { // Otherwise use the general function, which can handle registers of // qubits and multiple target qubits. Get symbol for the @@ -653,8 +642,8 @@ class ConvertOpWithControls : public ConvertOpToLLVMPattern { applyMultiControlFunction = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::NVQIRInvokeWithControlRegisterOrBits, LLVM::LLVMVoidType::get(context), - {i64Type, LLVM::LLVMPointerType::get(i64Type), i64Type, - LLVM::LLVMPointerType::get(instOpQISFunctionType)}, + {i64Type, cudaq::opt::factory::getPointerType(context), i64Type, + cudaq::opt::factory::getPointerType(context)}, parentModule, true); // The total number of control qubits may be more than the number of @@ -667,17 +656,26 @@ class ConvertOpWithControls : public ConvertOpToLLVMPattern { // and $0$ otherwise. Value isArrayAndLengthArr = cudaq::opt::factory::packIsArrayAndLengthArray( - loc, rewriter, parentModule, numControls, adaptor.getControls()); + loc, rewriter, parentModule, numControls, adaptor.getControls(), + instOp.getControls()); args.push_back(isArrayAndLengthArr); args.push_back( - rewriter.create(loc, i64Type, numTargetOperands)); + arith::ConstantIntOp::create(rewriter, loc, numTargetOperands, 64)); } args.push_back(ctrlOpPointer); args.append(instOperands.begin(), instOperands.end()); // Call our utility function. 
- rewriter.replaceOpWithNewOp(instOp, TypeRange{}, - applyMultiControlFunction, args); + // For vararg calls, we need to set the var_callee_type attribute. + TypeAttr varCalleeType; + if (auto fn = parentModule.template lookupSymbol( + applyMultiControlFunction.getLeafReference())) { + varCalleeType = TypeAttr::get(fn.getFunctionType()); + } + auto callOp = rewriter.replaceOpWithNewOp( + instOp, TypeRange{}, applyMultiControlFunction, args); + if (varCalleeType) + callOp.setVarCalleeTypeAttr(varCalleeType); return success(); } @@ -710,7 +708,7 @@ class OneTargetRewrite : public ConvertOpWithControls { qirQisPrefix + instName + (instOp.getIsAdj() ? "__adj" : ""); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qirFunctionName, /*return type=*/LLVM::LLVMVoidType::get(context), - {cudaq::opt::getQubitType(context)}, parentModule); + {cudaq::cg::getLLVMQubitType(context)}, parentModule); rewriter.replaceOpWithNewOp(instOp, TypeRange{}, symbolRef, adaptor.getOperands()); return success(); @@ -738,18 +736,18 @@ class OneTargetOneParamRewrite : public ConvertOpToLLVMPattern { std::string qirQisPrefix = cudaq::opt::QIRQISPrefix; auto qirFunctionName = qirQisPrefix + instName; - auto qubitIndexType = cudaq::opt::getQubitType(context); - auto qubitArrayType = cudaq::opt::getArrayType(context); - auto paramType = FloatType::getF64(context); + auto qubitIndexType = cudaq::cg::getLLVMQubitType(context); + auto qubitArrayType = cudaq::cg::getLLVMArrayType(context); + auto paramType = rewriter.getF64Type(); SmallVector funcArgs; auto castToDouble = [&](Value v) { if (v.getType().getIntOrFloatBitWidth() < 64) - v = rewriter.create(loc, rewriter.getF64Type(), v); + v = arith::ExtFOp::create(rewriter, loc, rewriter.getF64Type(), v); return v; }; Value val = instOp.getIsAdj() - ? rewriter.create(loc, instOperands[0]) + ? 
arith::NegFOp::create(rewriter, loc, instOperands[0]) : instOperands[0]; funcArgs.push_back(castToDouble(val)); @@ -771,10 +769,6 @@ class OneTargetOneParamRewrite : public ConvertOpToLLVMPattern { qirFunctionName += "__ctl"; // __quantum__qis__NAME__ctl(double, Array*, Qubit*) Type - auto instOpQISFunctionType = LLVM::LLVMFunctionType::get( - LLVM::LLVMVoidType::get(context), - {paramType, qubitArrayType, qubitIndexType}); - // Get function pointer to ctrl operation FlatSymbolRefAttr instSymbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( @@ -802,8 +796,9 @@ class OneTargetOneParamRewrite : public ConvertOpToLLVMPattern { // The remaining scenarios are best handled with the // invokeRotationWithControlQubits function. - Value ctrlOpPointer = rewriter.create( - loc, LLVM::LLVMPointerType::get(instOpQISFunctionType), instSymbolRef); + Value ctrlOpPointer = LLVM::AddressOfOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), + instSymbolRef); // Get symbol for // void invokeRotationWithControlQubits(double param, const std::size_t @@ -814,14 +809,15 @@ class OneTargetOneParamRewrite : public ConvertOpToLLVMPattern { cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::NVQIRInvokeRotationWithControlBits, LLVM::LLVMVoidType::get(context), - {paramType, i64Type, LLVM::LLVMPointerType::get(i64Type), - LLVM::LLVMPointerType::get(instOpQISFunctionType)}, + {paramType, i64Type, cudaq::opt::factory::getPointerType(context), + cudaq::opt::factory::getPointerType(context)}, parentModule, true); // Create an integer array where the kth element is N if the kth // control operand is a veq, and 0 otherwise. 
Value isArrayAndLengthArr = cudaq::opt::factory::packIsArrayAndLengthArray( - loc, rewriter, parentModule, numControls, adaptor.getControls()); + loc, rewriter, parentModule, numControls, adaptor.getControls(), + instOp.getControls()); funcArgs.push_back( cudaq::opt::factory::genLlvmI64Constant(loc, rewriter, numControls)); @@ -831,8 +827,16 @@ class OneTargetOneParamRewrite : public ConvertOpToLLVMPattern { funcArgs.append(adaptor.getTargets().begin(), adaptor.getTargets().end()); // Call our utility function. - rewriter.replaceOpWithNewOp( + // For vararg calls, we need to set the var_callee_type attribute. + TypeAttr varCalleeType1; + if (auto fn = parentModule.template lookupSymbol( + applyMultiControlFunction.getLeafReference())) { + varCalleeType1 = TypeAttr::get(fn.getFunctionType()); + } + auto callOp1 = rewriter.replaceOpWithNewOp( instOp, TypeRange{}, applyMultiControlFunction, funcArgs); + if (varCalleeType1) + callOp1.setVarCalleeTypeAttr(varCalleeType1); return success(); } @@ -856,9 +860,9 @@ class OneTargetTwoParamRewrite : public ConvertOpToLLVMPattern { auto qirFunctionName = std::string(cudaq::opt::QIRQISPrefix) + instName; SmallVector tmpArgTypes; - auto qubitIndexType = cudaq::opt::getQubitType(context); + auto qubitIndexType = cudaq::cg::getLLVMQubitType(context); - auto paramType = FloatType::getF64(context); + auto paramType = rewriter.getF64Type(); tmpArgTypes.push_back(paramType); tmpArgTypes.push_back(paramType); tmpArgTypes.push_back(qubitIndexType); @@ -870,14 +874,14 @@ class OneTargetTwoParamRewrite : public ConvertOpToLLVMPattern { SmallVector funcArgs; auto castToDouble = [&](Value v) { if (v.getType().getIntOrFloatBitWidth() < 64) - v = rewriter.create(loc, rewriter.getF64Type(), v); + v = arith::ExtFOp::create(rewriter, loc, rewriter.getF64Type(), v); return v; }; Value v = adaptor.getOperands()[0]; - v = instOp.getIsAdj() ? rewriter.create(loc, v) : v; + v = instOp.getIsAdj() ? 
arith::NegFOp::create(rewriter, loc, v) : v; funcArgs.push_back(castToDouble(v)); v = adaptor.getOperands()[1]; - v = instOp.getIsAdj() ? rewriter.create(loc, v) : v; + v = instOp.getIsAdj() ? arith::NegFOp::create(rewriter, loc, v) : v; funcArgs.push_back(castToDouble(v)); // TODO: What about the control qubits? @@ -915,20 +919,20 @@ class OneTargetThreeParamRewrite : public ConvertOpToLLVMPattern { std::string qirQisPrefix = cudaq::opt::QIRQISPrefix; auto qirFunctionName = qirQisPrefix + instName; - auto qubitIndexType = cudaq::opt::getQubitType(context); - auto qubitArrayType = cudaq::opt::getArrayType(context); - auto paramType = FloatType::getF64(context); + auto qubitIndexType = cudaq::cg::getLLVMQubitType(context); + auto qubitArrayType = cudaq::cg::getLLVMArrayType(context); + auto paramType = rewriter.getF64Type(); SmallVector funcArgs; auto castToDouble = [&](Value v) { if (v.getType().getIntOrFloatBitWidth() < 64) - v = rewriter.create(loc, rewriter.getF64Type(), v); + v = arith::ExtFOp::create(rewriter, loc, rewriter.getF64Type(), v); return v; }; // 3 parameters for (int i = 0; i < 3; i++) { Value val = instOp.getIsAdj() - ? rewriter.create(loc, instOperands[i]) + ? arith::NegFOp::create(rewriter, loc, instOperands[i]) : instOperands[i]; funcArgs.push_back(castToDouble(val)); } @@ -951,10 +955,6 @@ class OneTargetThreeParamRewrite : public ConvertOpToLLVMPattern { qirFunctionName += "__ctl"; // __quantum__qis__u3__ctl(double, double, double, Array*, Qubit*) Type - auto instOpQISFunctionType = LLVM::LLVMFunctionType::get( - LLVM::LLVMVoidType::get(context), - {paramType, paramType, paramType, qubitArrayType, qubitIndexType}); - // Get function pointer to ctrl operation FlatSymbolRefAttr instSymbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( @@ -983,8 +983,9 @@ class OneTargetThreeParamRewrite : public ConvertOpToLLVMPattern { // The remaining scenarios are best handled with the // invokeU3RotationWithControlQubits function. 
- Value ctrlOpPointer = rewriter.create( - loc, LLVM::LLVMPointerType::get(instOpQISFunctionType), instSymbolRef); + Value ctrlOpPointer = LLVM::AddressOfOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), + instSymbolRef); // Get symbol for void invokeU3RotationWithControlQubits(double theta, // double phi, double lambda, const std::size_t numControlOperands, i64* @@ -996,14 +997,15 @@ class OneTargetThreeParamRewrite : public ConvertOpToLLVMPattern { cudaq::opt::NVQIRInvokeU3RotationWithControlBits, LLVM::LLVMVoidType::get(context), {paramType, paramType, paramType, i64Type, - LLVM::LLVMPointerType::get(i64Type), - LLVM::LLVMPointerType::get(instOpQISFunctionType)}, + cudaq::opt::factory::getPointerType(context), + cudaq::opt::factory::getPointerType(context)}, parentModule, true); // Create an integer array where the kth element is N if the kth // control operand is a veq, and 0 otherwise. Value isArrayAndLengthArr = cudaq::opt::factory::packIsArrayAndLengthArray( - loc, rewriter, parentModule, numControls, adaptor.getControls()); + loc, rewriter, parentModule, numControls, adaptor.getControls(), + instOp.getControls()); funcArgs.push_back( cudaq::opt::factory::genLlvmI64Constant(loc, rewriter, numControls)); @@ -1013,8 +1015,16 @@ class OneTargetThreeParamRewrite : public ConvertOpToLLVMPattern { funcArgs.append(adaptor.getTargets().begin(), adaptor.getTargets().end()); // Call our utility function. - rewriter.replaceOpWithNewOp( + // For vararg calls, we need to set the var_callee_type attribute. 
+ TypeAttr varCalleeType2; + if (auto fn = parentModule.template lookupSymbol( + applyMultiControlFunction.getLeafReference())) { + varCalleeType2 = TypeAttr::get(fn.getFunctionType()); + } + auto callOp2 = rewriter.replaceOpWithNewOp( instOp, TypeRange{}, applyMultiControlFunction, funcArgs); + if (varCalleeType2) + callOp2.setVarCalleeTypeAttr(varCalleeType2); return success(); } @@ -1041,7 +1051,7 @@ class TwoTargetRewrite : public ConvertOpWithControls { auto context = parentModule->getContext(); auto qirFunctionName = std::string(cudaq::opt::QIRQISPrefix) + instName; - auto qubitIndexType = cudaq::opt::getQubitType(context); + auto qubitIndexType = cudaq::cg::getLLVMQubitType(context); SmallVector tmpArgTypes = {qubitIndexType, qubitIndexType}; FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( @@ -1082,7 +1092,7 @@ class MeasureRewrite : public ConvertOpToLLVMPattern { std::string qFunctionName = cudaq::opt::QIRMeasure; Attribute regName = measure.getRegisterNameAttr(); - std::vector funcTypes{cudaq::opt::getQubitType(context)}; + std::vector funcTypes{cudaq::cg::getLLVMQubitType(context)}; std::vector args{adaptor.getOperands().front()}; bool appendName; @@ -1090,7 +1100,7 @@ class MeasureRewrite : public ConvertOpToLLVMPattern { // Change the function name qFunctionName += "__to__register"; // Append a string type argument - funcTypes.push_back(LLVM::LLVMPointerType::get(rewriter.getI8Type())); + funcTypes.push_back(cudaq::opt::factory::getPointerType(context)); appendName = true; } else { // If no register name is supplied, make one up. 
Zero pad the counter so @@ -1126,28 +1136,30 @@ class MeasureRewrite : public ConvertOpToLLVMPattern { rewriter.restoreInsertionPoint(insertPoint); // Get the string address and bit cast - auto regNameRef = rewriter.create( - loc, cudaq::opt::factory::getPointerType(regNameGlobal.getType()), + auto regNameRef = LLVM::AddressOfOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), regNameGlobal.getSymName()); - auto castedRegNameRef = rewriter.create( - loc, cudaq::opt::factory::getPointerType(context), regNameRef); + auto castedRegNameRef = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), + regNameRef); // Append to the args list if (appendName) args.push_back(castedRegNameRef); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( - qFunctionName, cudaq::opt::getResultType(context), + qFunctionName, cudaq::cg::getLLVMResultType(context), llvm::ArrayRef(funcTypes), parentModule); - auto callOp = rewriter.create( - loc, cudaq::opt::getResultType(context), symbolRef, ValueRange{args}); + auto callOp = LLVM::CallOp::create(rewriter, loc, + cudaq::cg::getLLVMResultType(context), + symbolRef, ArrayRef(args)); if (regName) callOp->setAttr("registerName", regName); auto i1Ty = rewriter.getI1Type(); - auto i1PtrTy = LLVM::LLVMPointerType::get(i1Ty); + auto i1PtrTy = cudaq::opt::factory::getPointerType(context); auto cast = - rewriter.create(loc, i1PtrTy, callOp.getResult()); + LLVM::BitcastOp::create(rewriter, loc, i1PtrTy, callOp.getResult()); rewriter.replaceOpWithNewOp(measure, i1Ty, cast); return success(); @@ -1168,10 +1180,10 @@ class GetVeqSizeOpRewrite : public OpConversionPattern { auto symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qFunctionName, rewriter.getI64Type(), - {cudaq::opt::getArrayType(context)}, parentModule); + {cudaq::cg::getLLVMArrayType(context)}, parentModule); - auto c = rewriter.create(loc, rewriter.getI64Type(), - symbolRef, adaptor.getOperands()); + 
auto c = LLVM::CallOp::create(rewriter, loc, rewriter.getI64Type(), + symbolRef, adaptor.getOperands()); vecsize->getResult(0).replaceAllUsesWith(c->getResult(0)); rewriter.eraseOp(vecsize); return success(); @@ -1200,8 +1212,9 @@ class ReturnBitRewrite : public OpConversionPattern { // If we are returning a llvm.ptr then we've really // been asked to return a bit, set that up here - if (ret.getNumOperands() == 1 && adaptor.getOperands().front().getType() == - cudaq::opt::getResultType(context)) { + if (ret.getNumOperands() == 1 && + adaptor.getOperands().front().getType() == + cudaq::cg::getLLVMResultType(context)) { // Bitcast the produced value, which corresponds to the value in // ret.operands()[0], from llvm.ptr to llvm.ptr. There is a @@ -1210,13 +1223,13 @@ class ReturnBitRewrite : public OpConversionPattern { // be a call to __quantum__qis__mz(Qubit*) and that in the LLVM dialect, // functions always have a single result, this should be fine. If things // change, we will need to update this. - auto bitcast = rewriter.create( - loc, LLVM::LLVMPointerType::get(rewriter.getI1Type()), + auto bitcast = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), adaptor.getOperands().front()); // Load the bool - auto loadBit = rewriter.create(loc, rewriter.getI1Type(), - bitcast.getResult()); + auto loadBit = LLVM::LoadOp::create(rewriter, loc, rewriter.getI1Type(), + bitcast.getResult()); // Replace all uses of the llvm.ptr with the i1, which includes // the return op. Do not replace its use in the bitcast. 
@@ -1255,7 +1268,7 @@ class CustomUnitaryOpRewrite Value wrapQubitInArray(Location &loc, ConversionPatternRewriter &rewriter, ModuleOp parentModule, Value v) const { auto context = rewriter.getContext(); - auto qirArrayTy = cudaq::opt::getArrayType(context); + auto qirArrayTy = cudaq::cg::getLLVMArrayType(context); auto ptrTy = cudaq::opt::factory::getPointerType(context); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRArrayCreateArray, qirArrayTy, @@ -1264,21 +1277,22 @@ class CustomUnitaryOpRewrite cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRArrayGetElementPtr1d, ptrTy, {qirArrayTy, rewriter.getIntegerType(64)}, parentModule); - Value zero = rewriter.create(loc, 0, 64); - Value one = rewriter.create(loc, 1, 64); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + Value one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); // FIXME: 8 bytes is assumed to be the sizeof(char*) on the target machine. - Value eight = rewriter.create(loc, 8, 32); - if (v.getType() != cudaq::opt::getQubitType(context)) + Value eight = arith::ConstantIntOp::create(rewriter, loc, 8, 32); + if (v.getType() != cudaq::cg::getLLVMQubitType(context)) return v; - auto createCall = rewriter.create( - loc, qirArrayTy, symbolRef, ArrayRef{eight, one}); + auto createCall = LLVM::CallOp::create(rewriter, loc, qirArrayTy, symbolRef, + ArrayRef{eight, one}); auto result = createCall.getResult(); - auto call = rewriter.create(loc, ptrTy, getSymbolRef, - ArrayRef{result, zero}); - Value pointer = rewriter.create( - loc, cudaq::opt::factory::getPointerType(ptrTy), call.getResult()); - auto cast = rewriter.create(loc, ptrTy, v); - rewriter.create(loc, cast, pointer); + auto call = LLVM::CallOp::create(rewriter, loc, ptrTy, getSymbolRef, + ArrayRef{result, zero}); + Value pointer = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(ptrTy), + call.getResult()); + auto cast = 
LLVM::BitcastOp::create(rewriter, loc, ptrTy, v); + LLVM::StoreOp::create(rewriter, loc, cast, pointer); return result; } @@ -1294,8 +1308,8 @@ class CustomUnitaryOpRewrite if (numParameters) op.emitOpError("Parameterized custom operations not yet supported."); - auto arrType = cudaq::opt::getArrayType(context); - auto qirArrayTy = cudaq::opt::getArrayType(context); + auto arrType = cudaq::cg::getLLVMArrayType(context); + auto qirArrayTy = cudaq::cg::getLLVMArrayType(context); FlatSymbolRefAttr concatFunc = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRArrayConcatArray, arrType, {arrType, arrType}, @@ -1306,8 +1320,8 @@ class CustomUnitaryOpRewrite adaptor.getTargets().front()); for (auto oper : adaptor.getTargets().drop_front(1)) { auto backArr = wrapQubitInArray(loc, rewriter, parentModule, oper); - auto glue = rewriter.create( - loc, qirArrayTy, concatFunc, ArrayRef{targetArr, backArr}); + auto glue = LLVM::CallOp::create(rewriter, loc, qirArrayTy, concatFunc, + ArrayRef{targetArr, backArr}); targetArr = glue.getResult(); } @@ -1316,25 +1330,24 @@ class CustomUnitaryOpRewrite Value controlArr; if (controls.empty()) { // make an empty array - Value zero = rewriter.create(loc, 0, 64); - Value zero32 = rewriter.create(loc, 8, 32); + Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + Value zero32 = arith::ConstantIntOp::create(rewriter, loc, 8, 32); FlatSymbolRefAttr symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( cudaq::opt::QIRArrayCreateArray, - cudaq::opt::getArrayType(context), + cudaq::cg::getLLVMArrayType(context), {rewriter.getI32Type(), rewriter.getI64Type()}, parentModule); - controlArr = rewriter - .create( - loc, TypeRange{cudaq::opt::getArrayType(context)}, - symbolRef, ValueRange{zero32, zero}) + controlArr = LLVM::CallOp::create( + rewriter, loc, cudaq::cg::getLLVMArrayType(context), + symbolRef, ArrayRef{zero32, zero}) .getResult(); } else { controlArr = wrapQubitInArray(loc, rewriter, parentModule, 
adaptor.getControls().front()); for (auto oper : adaptor.getControls().drop_front(1)) { auto backArr = wrapQubitInArray(loc, rewriter, parentModule, oper); - auto glue = rewriter.create( - loc, qirArrayTy, concatFunc, ArrayRef{controlArr, backArr}); + auto glue = LLVM::CallOp::create(rewriter, loc, qirArrayTy, concatFunc, + ArrayRef{controlArr, backArr}); controlArr = glue.getResult(); } } @@ -1369,22 +1382,21 @@ class CustomUnitaryOpRewrite // Shift back to the function rewriter.restoreInsertionPoint(insertPoint); // Get the string address and bit cast - auto opNameRef = rewriter.create( - loc, cudaq::opt::factory::getPointerType(opNameGlobal.getType()), + auto opNameRef = LLVM::AddressOfOp::create( + rewriter, loc, + cudaq::opt::factory::getPointerType(opNameGlobal.getType()), opNameGlobal.getSymName()); - auto castedOpNameRef = rewriter.create( - loc, cudaq::opt::factory::getPointerType(context), opNameRef); + auto castedOpNameRef = LLVM::BitcastOp::create( + rewriter, loc, cudaq::opt::factory::getPointerType(context), opNameRef); if (!globalOp) return op.emitOpError("global not found for custom op"); - auto complex64Ty = - typeConverter->convertType(ComplexType::get(rewriter.getF64Type())); - auto complex64PtrTy = LLVM::LLVMPointerType::get(complex64Ty); + auto complex64PtrTy = cudaq::opt::factory::getPointerType(context); Type type = typeConverter->convertType(globalOp.getType()); - auto addrOp = rewriter.create(loc, type, generatorName); + auto addrOp = LLVM::AddressOfOp::create(rewriter, loc, type, generatorName); auto unitaryData = - rewriter.create(loc, complex64PtrTy, addrOp); + LLVM::BitcastOp::create(rewriter, loc, complex64PtrTy, addrOp); StringRef qirFunctionName = op.isAdj() ? 
cudaq::opt::QIRCustomAdjOp : cudaq::opt::QIRCustomOp; @@ -1392,14 +1404,14 @@ class CustomUnitaryOpRewrite FlatSymbolRefAttr customSymbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( qirFunctionName, LLVM::LLVMVoidType::get(context), - {complex64PtrTy, cudaq::opt::getArrayType(context), - cudaq::opt::getArrayType(context), - LLVM::LLVMPointerType::get(rewriter.getI8Type())}, + {complex64PtrTy, cudaq::cg::getLLVMArrayType(context), + cudaq::cg::getLLVMArrayType(context), + cudaq::opt::factory::getPointerType(context)}, parentModule); rewriter.replaceOpWithNewOp( op, TypeRange{}, customSymbolRef, - ValueRange{unitaryData, controlArr, targetArr, castedOpNameRef}); + ArrayRef{unitaryData, controlArr, targetArr, castedOpNameRef}); return success(); } diff --git a/lib/Optimizer/CodeGen/RemoveMeasurements.cpp b/lib/Optimizer/CodeGen/RemoveMeasurements.cpp index 056276f50e2..e3719db1efc 100644 --- a/lib/Optimizer/CodeGen/RemoveMeasurements.cpp +++ b/lib/Optimizer/CodeGen/RemoveMeasurements.cpp @@ -10,18 +10,17 @@ #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" -#define DEBUG_TYPE "qir-remove-measurements" - namespace cudaq::opt { #define GEN_PASS_DEF_REMOVEMEASUREMENTS #include "cudaq/Optimizer/CodeGen/Passes.h.inc" } // namespace cudaq::opt +#define DEBUG_TYPE "qir-remove-measurements" + using namespace mlir; namespace { @@ -32,9 +31,9 @@ class EraseMeasurements : public OpRewritePattern { LogicalResult matchAndRewrite(LLVM::CallOp call, PatternRewriter &rewriter) const override { if (auto callee = call.getCallee()) { - if (callee->equals(cudaq::opt::QIRMeasureBody) || - callee->equals(cudaq::opt::QIRRecordOutput) || - callee->equals(cudaq::opt::QIRArrayRecordOutput)) { + if (*callee == 
cudaq::opt::QIRMeasureBody || + *callee == cudaq::opt::QIRRecordOutput || + *callee == cudaq::opt::QIRArrayRecordOutput) { rewriter.eraseOp(call); return success(); } @@ -58,7 +57,7 @@ struct RemoveMeasurementsPass RewritePatternSet patterns(context); patterns.insert(context); LLVM_DEBUG(llvm::dbgs() << "Before measurement erasure:\n" << *op); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After measurement erasure:\n" << *op); } diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp index 128ba8f64ef..790ff9e6ac8 100644 --- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp +++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp @@ -12,19 +12,17 @@ #include "cudaq/Optimizer/CodeGen/Passes.h" #include "cudaq/Optimizer/CodeGen/QIRAttributeNames.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" #include "llvm/ADT/TypeSwitch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" -#define DEBUG_TYPE "return-to-output-log" - namespace cudaq::opt { #define GEN_PASS_DEF_RETURNTOOUTPUTLOG #include "cudaq/Optimizer/CodeGen/Passes.h.inc" } // namespace cudaq::opt +#define DEBUG_TYPE "return-to-output-log" + using namespace mlir; namespace { @@ -58,9 +56,9 @@ class ReturnRewrite : public OpRewritePattern { labelStr = prefix->str(); Value label = makeLabel(loc, rewriter, labelStr); if (intTy.getWidth() == 1) { - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRBoolRecordOutput, - ArrayRef{val, label}); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRBoolRecordOutput, + ArrayRef{val, label}); return; } // Integer: convert to (signed) i64. 
The decoder *must* lop off any @@ -68,14 +66,15 @@ class ReturnRewrite : public OpRewritePattern { // bits by examining the real integer type. Value castVal = val; if (intTy.getWidth() < 64) - castVal = rewriter.create( - loc, rewriter.getI64Type(), val, cudaq::cc::CastOpMode::Signed); + castVal = + cudaq::cc::CastOp::create(rewriter, loc, rewriter.getI64Type(), + val, cudaq::cc::CastOpMode::Signed); else if (intTy.getWidth() > 64) - castVal = rewriter.create( - loc, rewriter.getI64Type(), val); - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRIntegerRecordOutput, - ArrayRef{castVal, label}); + castVal = cudaq::cc::CastOp::create(rewriter, loc, + rewriter.getI64Type(), val); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRIntegerRecordOutput, + ArrayRef{castVal, label}); }) .Case([&](FloatType floatTy) { int width = floatTy.getWidth(); @@ -86,11 +85,11 @@ class ReturnRewrite : public OpRewritePattern { // Floating point: convert it to double, whatever it actually is. 
Value castVal = val; if (floatTy != rewriter.getF64Type()) - castVal = rewriter.create( - loc, rewriter.getF64Type(), val); - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRDoubleRecordOutput, - ArrayRef{castVal, label}); + castVal = cudaq::cc::CastOp::create(rewriter, loc, + rewriter.getF64Type(), val); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRDoubleRecordOutput, + ArrayRef{castVal, label}); }) .Case([&](cudaq::cc::StructType structTy) { auto labelStr = translateType(structTy); @@ -98,15 +97,15 @@ class ReturnRewrite : public OpRewritePattern { labelStr = prefix->str(); Value label = makeLabel(loc, rewriter, labelStr); std::int32_t sz = structTy.getNumMembers(); - Value size = rewriter.create(loc, sz, 64); - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRTupleRecordOutput, - ArrayRef{size, label}); + Value size = arith::ConstantIntOp::create(rewriter, loc, sz, 64); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRTupleRecordOutput, + ArrayRef{size, label}); std::string preStr = prefix ? prefix->str() : std::string{}; for (std::int32_t i = 0; i < sz; ++i) { std::string offset = preStr + std::string(".") + std::to_string(i); - Value w = rewriter.create( - loc, structTy.getMember(i), val, + Value w = cudaq::cc::ExtractValueOp::create( + rewriter, loc, structTy.getMember(i), val, ArrayRef{i}); genOutputLog(loc, rewriter, w, offset, allowDynamic); } @@ -115,16 +114,16 @@ class ReturnRewrite : public OpRewritePattern { auto labelStr = translateType(arrTy); Value label = makeLabel(loc, rewriter, labelStr); std::int32_t sz = arrTy.getSize(); - Value size = rewriter.create(loc, sz, 64); - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRArrayRecordOutput, - ArrayRef{size, label}); + Value size = arith::ConstantIntOp::create(rewriter, loc, sz, 64); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRArrayRecordOutput, + ArrayRef{size, label}); std::string preStr = prefix ? 
prefix->str() : std::string{}; for (std::int32_t i = 0; i < sz; ++i) { std::string offset = preStr + std::string("[") + std::to_string(i) + std::string("]"); - Value w = rewriter.create( - loc, arrTy.getElementType(), val, + Value w = cudaq::cc::ExtractValueOp::create( + rewriter, loc, arrTy.getElementType(), val, ArrayRef{i}); genOutputLog(loc, rewriter, w, offset, allowDynamic); } @@ -138,24 +137,42 @@ class ReturnRewrite : public OpRewritePattern { std::int32_t sz = *maybeLen; auto labelStr = translateType(vecTy, sz); Value label = makeLabel(loc, rewriter, labelStr); - Value size = rewriter.create(loc, sz, 64); - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRArrayRecordOutput, - ArrayRef{size, label}); + Value size = arith::ConstantIntOp::create(rewriter, loc, sz, 64); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRArrayRecordOutput, + ArrayRef{size, label}); std::string preStr = prefix ? prefix->str() : std::string{}; Value rawBuffer = vecInit.getBuffer(); + if (auto callOp = rawBuffer.getDefiningOp()) { + if (callOp.getCallee() == "__nvqpp_vectorCopyCtor" && + callOp.getNumOperands() >= 1) { + rawBuffer = callOp.getOperand(0); + } else if (callOp.getCallee() == "malloc") { + for (auto *user : rawBuffer.getUsers()) { + auto memcpy = dyn_cast(user); + if (memcpy && + memcpy.getCallee().starts_with("llvm.memcpy") && + memcpy.getNumOperands() >= 2 && + memcpy.getOperand(0) == rawBuffer) { + rawBuffer = memcpy.getOperand(1); + break; + } + } + } + } auto eleTy = vecTy.getElementType(); auto buffTy = cudaq::cc::PointerType::get(eleTy); auto ptrArrTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy)); Value buffer = - rewriter.create(loc, ptrArrTy, rawBuffer); + cudaq::cc::CastOp::create(rewriter, loc, ptrArrTy, rawBuffer); for (std::int32_t i = 0; i < sz; ++i) { std::string offset = preStr + std::string("[") + std::to_string(i) + std::string("]"); - auto v = rewriter.create( - loc, buffTy, buffer, ArrayRef{i}); - Value w = 
rewriter.create(loc, v); + auto v = cudaq::cc::ComputePtrOp::create( + rewriter, loc, buffTy, buffer, + ArrayRef{i}); + Value w = cudaq::cc::LoadOp::create(rewriter, loc, v); genOutputLog(loc, rewriter, w, offset, allowDynamic); } return; @@ -165,46 +182,46 @@ class ReturnRewrite : public OpRewritePattern { return; auto eleTy = vecTy.getElementType(); auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); - Value size = rewriter.create( - loc, rewriter.getI64Type(), val); + Value size = cudaq::cc::StdvecSizeOp::create( + rewriter, loc, rewriter.getI64Type(), val); Value rawData = - rewriter.create(loc, i8PtrTy, val); + cudaq::cc::StdvecDataOp::create(rewriter, loc, i8PtrTy, val); if (auto intTy = dyn_cast(eleTy)) { if (eleTy == rewriter.getI1Type()) { - rewriter.create(loc, TypeRange{}, - cudaq::opt::QIRBoolSpanRecordOutput, - ArrayRef{rawData, size}); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRBoolSpanRecordOutput, + ArrayRef{rawData, size}); } else { std::int32_t byteSize = (intTy.getWidth() + 7) / 8; Value elemSize = - rewriter.create(loc, byteSize, 32); - rewriter.create( - loc, TypeRange{}, cudaq::opt::QIRIntSpanRecordOutput, - ArrayRef{rawData, size, elemSize}); + arith::ConstantIntOp::create(rewriter, loc, byteSize, 32); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRIntSpanRecordOutput, + ArrayRef{rawData, size, elemSize}); } } else if (isa(eleTy)) { auto floatTy = cast(eleTy); std::int32_t byteSize = floatTy.getWidth() / 8; Value elemSize = - rewriter.create(loc, byteSize, 32); - rewriter.create( - loc, TypeRange{}, cudaq::opt::QIRFloatSpanRecordOutput, - ArrayRef{rawData, size, elemSize}); + arith::ConstantIntOp::create(rewriter, loc, byteSize, 32); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QIRFloatSpanRecordOutput, + ArrayRef{rawData, size, elemSize}); } else { // Unsupported element type — trap. 
LLVM_DEBUG(llvm::dbgs() << "ReturnToOutputLog -- unsupported element type: " << eleTy << "\n"); - Value one = rewriter.create(loc, 1, 64); - rewriter.create(loc, TypeRange{}, cudaq::opt::QISTrap, - ValueRange{one}); + Value one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); + func::CallOp::create(rewriter, loc, TypeRange{}, + cudaq::opt::QISTrap, ValueRange{one}); } }) .Default([&](Type) { // If we reach here, we don't know how to handle this type. - Value one = rewriter.create(loc, 1, 64); - rewriter.create(loc, TypeRange{}, cudaq::opt::QISTrap, - ValueRange{one}); + Value one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); + func::CallOp::create(rewriter, loc, TypeRange{}, cudaq::opt::QISTrap, + ValueRange{one}); }); } @@ -232,9 +249,12 @@ class ReturnRewrite : public OpRewritePattern { return {std::string("array<") + translateType(arrTy.getElementType()) + std::string(" x ") + std::to_string(size) + std::string(">")}; } - if (auto arrTy = dyn_cast(ty)) + if (auto arrTy = dyn_cast(ty)) { + if (!vecSz) + return {"error"}; return {std::string("array<") + translateType(arrTy.getElementType()) + std::string(" x ") + std::to_string(*vecSz) + std::string(">")}; + } return {"error"}; } @@ -242,10 +262,10 @@ class ReturnRewrite : public OpRewritePattern { StringRef label) { auto strLitTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get( rewriter.getContext(), rewriter.getI8Type(), label.size() + 1)); - Value lit = rewriter.create( - loc, strLitTy, rewriter.getStringAttr(label)); + Value lit = cudaq::cc::CreateStringLiteralOp::create( + rewriter, loc, strLitTy, rewriter.getStringAttr(label)); auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); - return rewriter.create(loc, i8PtrTy, lit); + return cudaq::cc::CastOp::create(rewriter, loc, i8PtrTy, lit); } bool allowDynamic; @@ -287,7 +307,7 @@ struct ReturnToOutputLogPass RewritePatternSet patterns(ctx); patterns.insert(ctx, allowDynamicResult); LLVM_DEBUG(llvm::dbgs() << "Before return to 
output logging:\n" << module); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After return to output logging:\n" << module); } diff --git a/lib/Optimizer/CodeGen/TranslateToIQMJson.cpp b/lib/Optimizer/CodeGen/TranslateToIQMJson.cpp index 023ca43709a..37add457bbb 100644 --- a/lib/Optimizer/CodeGen/TranslateToIQMJson.cpp +++ b/lib/Optimizer/CodeGen/TranslateToIQMJson.cpp @@ -15,9 +15,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatAdapters.h" -#include -#include -#include using namespace mlir; @@ -117,15 +114,15 @@ static LogicalResult emitOperation(nlohmann::json &json, // Propagate the name of this qubit into the operation output values. emitter.getOrAssignName( - optor->getResult(0), + optor.getControls()[0], emitter.getOrAssignName(optor.getControls()[0]).str()); - emitter.getOrAssignName(optor->getResult(1), + emitter.getOrAssignName(optor.getTarget(0), emitter.getOrAssignName(optor.getTarget(0)).str()); } else { json["name"] = "prx"; if (optor.getParameters().size() != 2) - optor.emitError("IQM prx gate expects exactly two parameters."); + optor.emitError("IQM phased_rx gate expects exactly two parameters."); auto parameter0 = cudaq::getParameterValueAsDouble(optor.getParameters()[0]); @@ -139,7 +136,7 @@ static LogicalResult emitOperation(nlohmann::json &json, json["args"]["phase_t"] = convertToFullTurns(*parameter1); // Propagate the name of this qubit into the operation output values. - emitter.getOrAssignName(optor->getResult(0), + emitter.getOrAssignName(optor.getTarget(0), emitter.getOrAssignName(optor.getTarget(0)).str()); } @@ -200,9 +197,9 @@ static LogicalResult emitOperation(nlohmann::json &json, .Case([](auto) { return success(); }) .Default([&](Operation *) -> LogicalResult { // Allow LLVM and cc dialect ops (for storing measure results). 
- if (op.getName().getDialectNamespace().equals("llvm") || - op.getName().getDialectNamespace().equals("cc") || - op.getName().getDialectNamespace().equals("arith")) + if (op.getName().getDialectNamespace() == "llvm" || + op.getName().getDialectNamespace() == "cc" || + op.getName().getDialectNamespace() == "arith") return success(); return op.emitOpError() << "unable to translate op to IQM Json " << op.getName().getIdentifier().str(); diff --git a/lib/Optimizer/CodeGen/TranslateToOpenQASM.cpp b/lib/Optimizer/CodeGen/TranslateToOpenQASM.cpp index 9b60034fa15..ed842e6a445 100644 --- a/lib/Optimizer/CodeGen/TranslateToOpenQASM.cpp +++ b/lib/Optimizer/CodeGen/TranslateToOpenQASM.cpp @@ -6,20 +6,17 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ +#include "PassDetails.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" #include "cudaq/Optimizer/Builder/RuntimeNames.h" #include "cudaq/Optimizer/CodeGen/Emitter.h" #include "cudaq/Optimizer/CodeGen/OpenQASMEmitter.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "mlir/Analysis/CallGraph.h" using namespace mlir; -using namespace cudaq; //===----------------------------------------------------------------------===// // Helper functions @@ -52,13 +49,14 @@ static LogicalResult translateOperatorName(quake::OperatorInterface optor, return success(); } -static LogicalResult printParameters(Emitter &emitter, ValueRange parameters) { +static LogicalResult printParameters(cudaq::Emitter &emitter, + ValueRange parameters) { if (parameters.empty()) return success(); emitter.os << '('; auto isFailure = false; llvm::interleaveComma(parameters, emitter.os, [&](Value value) { - auto parameter = 
getParameterValueAsDouble(value); + auto parameter = cudaq::getParameterValueAsDouble(value); if (!parameter.has_value()) { isFailure = true; return; @@ -70,8 +68,8 @@ static LogicalResult printParameters(Emitter &emitter, ValueRange parameters) { return failure(isFailure); } -static StringRef printClassicalAllocation(Emitter &emitter, Value bitOrVector, - size_t size) { +static StringRef printClassicalAllocation(cudaq::Emitter &emitter, + Value bitOrVector, size_t size) { auto name = emitter.createName(); emitter.os << llvm::formatv("creg {0}[{1}];\n", name, size); if (size == 1) @@ -83,10 +81,11 @@ static StringRef printClassicalAllocation(Emitter &emitter, Value bitOrVector, // Emitters functions //===----------------------------------------------------------------------===// -static LogicalResult emitOperation(Emitter &emitter, Operation &op); +static LogicalResult emitOperation(cudaq::Emitter &emitter, Operation &op); -static LogicalResult emitEntryPoint(Emitter &emitter, func::FuncOp kernel) { - Emitter::Scope scope(emitter, /*isEntryPoint=*/true); +static LogicalResult emitEntryPoint(cudaq::Emitter &emitter, + func::FuncOp kernel) { + cudaq::Emitter::Scope scope(emitter, /*isEntryPoint=*/true); for (Operation &op : kernel.getOps()) { if (failed(emitOperation(emitter, op))) return failure(); @@ -94,7 +93,7 @@ static LogicalResult emitEntryPoint(Emitter &emitter, func::FuncOp kernel) { return success(); } -static LogicalResult emitOperation(Emitter &emitter, ModuleOp moduleOp) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, ModuleOp moduleOp) { func::FuncOp entryPoint = nullptr; emitter.os << "// Code generated by NVIDIA's nvq++ compiler\n"; emitter.os << "OPENQASM 2.0;\n\n"; @@ -149,7 +148,8 @@ static LogicalResult emitOperation(Emitter &emitter, ModuleOp moduleOp) { return emitEntryPoint(emitter, entryPoint); } -static LogicalResult emitOperation(Emitter &emitter, quake::AllocaOp allocaOp) { +static LogicalResult emitOperation(cudaq::Emitter 
&emitter, + quake::AllocaOp allocaOp) { Value refOrVeq = allocaOp.getRefOrVec(); auto name = emitter.createName(); auto size = 1; @@ -165,7 +165,7 @@ static LogicalResult emitOperation(Emitter &emitter, quake::AllocaOp allocaOp) { return success(); } -static LogicalResult emitOperation(Emitter &emitter, quake::ApplyOp op) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, quake::ApplyOp op) { // In Quake's reference semantics form, kernels only return classical types. // Thus, we check whether the numbers of results is zero or not. if (op.getNumResults() > 0) @@ -203,7 +203,7 @@ static inline StringRef formatFunctionName(StringRef quakeName) { return quakeName.drop_while([](char C) { return C == '_'; }); } -static LogicalResult emitOperation(Emitter &emitter, func::FuncOp op) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, func::FuncOp op) { if (op.isPrivate()) return success(); @@ -214,7 +214,7 @@ static LogicalResult emitOperation(Emitter &emitter, func::FuncOp op) { // empty `__qpu__` helper), which have the prefix and are kept so that any // call sites remain valid. if (!op.isExternal() && op.front().without_terminator().empty() && - !op.getName().starts_with(runtime::cudaqGenPrefixName)) + !op.getName().starts_with(cudaq::runtime::cudaqGenPrefixName)) return success(); // In Quake's reference semantics form, kernels only return classical types. 
@@ -232,7 +232,7 @@ static LogicalResult emitOperation(Emitter &emitter, func::FuncOp op) { parameters.push_back(arg); } - Emitter::Scope scope(emitter); + cudaq::Emitter::Scope scope(emitter); emitter.os << "gate " << formatFunctionName(op.getName()); if (!parameters.empty()) { emitter.os << '('; @@ -260,12 +260,13 @@ static LogicalResult emitOperation(Emitter &emitter, func::FuncOp op) { return success(); } -static LogicalResult emitOperation(Emitter &emitter, quake::ExtractRefOp op) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, + quake::ExtractRefOp op) { std::optional index = std::nullopt; if (op.hasConstantIndex()) index = op.getConstantIndex(); else - index = getIndexValueAsInt(op.getIndex()); + index = cudaq::getIndexValueAsInt(op.getIndex()); auto veqName = emitter.getOrAssignName(op.getVeq()); auto qrefName = llvm::formatv("{0}[{1}]", veqName, *index); @@ -273,7 +274,8 @@ static LogicalResult emitOperation(Emitter &emitter, quake::ExtractRefOp op) { return success(); } -static LogicalResult emitOperation(Emitter &emitter, func::CallOp callOp) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, + func::CallOp callOp) { StringRef funcName = formatFunctionName(callOp.getCallee()); emitter.os << funcName; emitter.os << ' '; @@ -284,7 +286,7 @@ static LogicalResult emitOperation(Emitter &emitter, func::CallOp callOp) { return success(); } -static LogicalResult emitOperation(Emitter &emitter, +static LogicalResult emitOperation(cudaq::Emitter &emitter, quake::OperatorInterface optor) { // Handle adjoint for T and S StringRef name = ""; @@ -318,7 +320,7 @@ static LogicalResult emitOperation(Emitter &emitter, return success(); } -static LogicalResult emitOperation(Emitter &emitter, quake::MzOp op) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, quake::MzOp op) { if (op.getTargets().size() > 1) return op.emitError( "cannot translate measurements with more than one target"); @@ -335,28 +337,29 @@ static LogicalResult 
emitOperation(Emitter &emitter, quake::MzOp op) { return success(); } -static LogicalResult emitOperation(Emitter &emitter, quake::ResetOp op) { +static LogicalResult emitOperation(cudaq::Emitter &emitter, quake::ResetOp op) { emitter.os << "reset " << emitter.getOrAssignName(op.getTargets()) << ";"; return success(); } -static LogicalResult emitOperation(Emitter &emitter, Operation &op) { - using namespace quake; +static LogicalResult emitOperation(cudaq::Emitter &emitter, Operation &op) { return llvm::TypeSwitch(&op) // MLIR .Case([&](auto op) { return emitOperation(emitter, op); }) .Case([&](auto op) { return emitOperation(emitter, op); }) .Case([&](auto op) { return emitOperation(emitter, op); }) // Quake - .Case([&](auto op) { return emitOperation(emitter, op); }) - .Case([&](auto op) { return emitOperation(emitter, op); }) - .Case([&](auto op) { return emitOperation(emitter, op); }) - .Case( + .Case([&](auto op) { return emitOperation(emitter, op); }) + .Case( + [&](auto op) { return emitOperation(emitter, op); }) + .Case( + [&](auto op) { return emitOperation(emitter, op); }) + .Case( [&](auto optor) { return emitOperation(emitter, optor); }) - .Case([&](auto op) { return emitOperation(emitter, op); }) - .Case([&](auto op) { return emitOperation(emitter, op); }) + .Case([&](auto op) { return emitOperation(emitter, op); }) + .Case([&](auto op) { return emitOperation(emitter, op); }) // Ignore - .Case([&](auto op) { return success(); }) + .Case([&](auto op) { return success(); }) .Case([&](auto op) { return success(); }) .Case([&](auto op) { return success(); }) .Case([&](auto op) { return success(); }) @@ -365,13 +368,13 @@ static LogicalResult emitOperation(Emitter &emitter, Operation &op) { .Case([&](auto op) { return success(); }) .Case([&](auto op) { return success(); }) .Default([&](Operation *) -> LogicalResult { - if (op.getName().getDialectNamespace().equals("llvm")) + if (op.getName().getDialectNamespace() == "llvm") return success(); return 
op.emitOpError("unable to translate op to OpenQASM 2.0"); }); } LogicalResult cudaq::translateToOpenQASM(Operation *op, raw_ostream &os) { - Emitter emitter(os); + cudaq::Emitter emitter(os); return emitOperation(emitter, *op); } diff --git a/lib/Optimizer/CodeGen/WireSetsToProfileQIR.cpp b/lib/Optimizer/CodeGen/WireSetsToProfileQIR.cpp index 8f7370a9947..88a9318a920 100644 --- a/lib/Optimizer/CodeGen/WireSetsToProfileQIR.cpp +++ b/lib/Optimizer/CodeGen/WireSetsToProfileQIR.cpp @@ -15,13 +15,9 @@ #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h" #include "cudaq/Optimizer/CodeGen/QuakeToExecMgr.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "nlohmann/json.hpp" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassOptions.h" #include "mlir/Transforms/DialectConversion.h" @@ -65,10 +61,10 @@ struct QuakeTypeConverter : public TypeConverter { QuakeTypeConverter() { addConversion([](Type ty) { return ty; }); addConversion([](quake::WireType ty) { - return cudaq::opt::getQubitType(ty.getContext()); + return cudaq::cg::getQubitType(ty.getContext()); }); addConversion([](quake::MeasureType ty) { - return cudaq::opt::getResultType(ty.getContext()); + return cudaq::cg::getResultType(ty.getContext()); }); } }; @@ -124,8 +120,8 @@ struct GeneralRewrite : OpConversionPattern { if (funcName.ends_with(qis_ctl_suffix) && adaptor.getControls().size() == 1 && adaptor.getTargets().size() == 1) { auto *ctx = rewriter.getContext(); - auto qbTy = cudaq::opt::getQubitType(ctx); - auto arrTy = cudaq::opt::getArrayType(ctx); + auto qbTy = cudaq::cg::getQubitType(ctx); + auto arrTy = cudaq::cg::getArrayType(ctx); SmallVector argTys = {arrTy, qbTy}; ModuleOp mod = qop->template 
getParentOfType(); FlatSymbolRefAttr qisFuncSymbol; @@ -133,20 +129,20 @@ struct GeneralRewrite : OpConversionPattern { auto fTy = f.getFunctionType(); auto fSym = f.getSymNameAttr(); qisFuncSymbol = FlatSymbolRefAttr::get(ctx, funcName); - Value fVal = rewriter.create(loc, fTy, fSym); + Value fVal = func::ConstantOp::create(rewriter, loc, fTy, fSym); auto ptrI8Ty = cudaq::cc::PointerType::get(rewriter.getI8Type()); Value fPtrVal = - rewriter.create(loc, ptrI8Ty, fVal); - Value one = rewriter.create(loc, 1, 64); + cudaq::cc::FuncToPtrOp::create(rewriter, loc, ptrI8Ty, fVal); + Value one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); SmallVector callParamVals{one, fPtrVal, *adaptor.getControls().begin(), *adaptor.getTargets().begin()}; SmallVector qubits(adaptor.getControls().begin(), adaptor.getControls().end()); qubits.append(adaptor.getTargets().begin(), adaptor.getTargets().end()); - rewriter.create(loc, std::nullopt, - cudaq::opt::NVQIRInvokeWithControlBits, - callParamVals); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::NVQIRInvokeWithControlBits, + callParamVals); rewriter.replaceOp(qop, qubits); return success(); } @@ -155,8 +151,8 @@ struct GeneralRewrite : OpConversionPattern { SmallVector qubits(adaptor.getControls().begin(), adaptor.getControls().end()); qubits.append(adaptor.getTargets().begin(), adaptor.getTargets().end()); - rewriter.create(loc, std::nullopt, funcName, - adaptor.getOperands()); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, funcName, + adaptor.getOperands()); rewriter.replaceOp(qop, qubits); return success(); } @@ -173,12 +169,12 @@ struct BorrowWireRewrite : OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto id = borrowWire.getIdentity(); auto loc = borrowWire.getLoc(); - Value idCon = rewriter.create(loc, id, 64); + Value idCon = arith::ConstantIntOp::create(rewriter, loc, id, 64); auto imTy = cudaq::cc::PointerType::get(NoneType::get(rewriter.getContext())); - 
idCon = rewriter.create(loc, imTy, idCon); + idCon = cudaq::cc::CastOp::create(rewriter, loc, imTy, idCon); rewriter.replaceOpWithNewOp( - borrowWire, cudaq::opt::getQubitType(rewriter.getContext()), idCon); + borrowWire, cudaq::cg::getQubitType(rewriter.getContext()), idCon); return success(); } }; @@ -192,8 +188,8 @@ struct ResetRewrite : OpConversionPattern { SmallVector qubits{adaptor.getTargets()}; auto loc = reset.getLoc(); std::string funcName = toQisBodyName(std::string("reset")); - rewriter.create(loc, std::nullopt, funcName, - adaptor.getOperands()); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, funcName, + adaptor.getOperands()); rewriter.replaceOp(reset, qubits); return success(); } @@ -205,8 +201,8 @@ struct BranchRewrite : OpConversionPattern { LogicalResult matchAndRewrite(cf::BranchOp branchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto qubitTy = cudaq::opt::getQubitType(rewriter.getContext()); - rewriter.startRootUpdate(branchOp); + auto qubitTy = cudaq::cg::getQubitType(rewriter.getContext()); + rewriter.startOpModification(branchOp); if (branchOp.getSuccessor()) for (auto arg : branchOp.getSuccessor()->getArguments()) if (isa(arg.getType())) @@ -214,7 +210,7 @@ struct BranchRewrite : OpConversionPattern { for (auto operand : branchOp.getOperands()) if (isa(operand.getType())) operand.setType(qubitTy); - rewriter.finalizeRootUpdate(branchOp); + rewriter.finalizeOpModification(branchOp); return success(); } }; @@ -225,8 +221,8 @@ struct CondBranchRewrite : OpConversionPattern { LogicalResult matchAndRewrite(cf::CondBranchOp branchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto qubitTy = cudaq::opt::getQubitType(rewriter.getContext()); - rewriter.startRootUpdate(branchOp); + auto qubitTy = cudaq::cg::getQubitType(rewriter.getContext()); + rewriter.startOpModification(branchOp); for (auto suc : branchOp.getSuccessors()) for (auto arg : suc->getArguments()) if 
(isa(arg.getType())) @@ -234,7 +230,7 @@ struct CondBranchRewrite : OpConversionPattern { for (auto operand : branchOp.getOperands()) if (isa(operand.getType())) operand.setType(qubitTy); - rewriter.finalizeRootUpdate(branchOp); + rewriter.finalizeOpModification(branchOp); return success(); } }; @@ -283,15 +279,15 @@ struct MzRewrite : OpConversionPattern { // FIXME: Must use sequentially assigned result ids std::string funcName = toQisBodyName(std::string("mz")); auto loc = meas.getLoc(); - Value idCon = rewriter.create(loc, resultCount++, 64); + Value idCon = + arith::ConstantIntOp::create(rewriter, loc, resultCount++, 64); auto imTy = cudaq::cc::PointerType::get(NoneType::get(rewriter.getContext())); - idCon = rewriter.create(loc, imTy, idCon); - Value resultVal = rewriter.create( - loc, cudaq::opt::getResultType(rewriter.getContext()), idCon); - rewriter.create( - loc, std::nullopt, funcName, - ValueRange{adaptor.getTargets()[0], resultVal}); + idCon = cudaq::cc::CastOp::create(rewriter, loc, imTy, idCon); + Value resultVal = cudaq::cc::CastOp::create( + rewriter, loc, cudaq::cg::getResultType(rewriter.getContext()), idCon); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, funcName, + ValueRange{adaptor.getTargets()[0], resultVal}); rewriter.replaceOp(meas, ValueRange{resultVal, adaptor.getTargets()[0]}); auto regName = meas.getRegisterName(); @@ -306,15 +302,15 @@ struct MzRewrite : OpConversionPattern { auto arrI8Ty = mlir::LLVM::LLVMArrayType::get(rewriter.getI8Type(), regName->size() + 1); auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); - Value nameVal = rewriter.create( - loc, ptrArrTy, nameObj.getName()); + Value nameVal = cudaq::cc::AddressOfOp::create(rewriter, loc, ptrArrTy, + nameObj.getName()); auto cstrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); Value nameValCStr = - rewriter.create(loc, cstrTy, nameVal); + cudaq::cc::CastOp::create(rewriter, loc, cstrTy, nameVal); - rewriter.create(loc, std::nullopt, - 
cudaq::opt::QIRRecordOutput, - ValueRange{resultVal, nameValCStr}); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::QIRRecordOutput, + ValueRange{resultVal, nameValCStr}); } // Populate resultQubitVals[] @@ -361,15 +357,15 @@ struct DiscriminateRewrite : OpConversionPattern { auto arrI8Ty = mlir::LLVM::LLVMArrayType::get(rewriter.getI8Type(), iter->second.size() + 1); auto ptrArrTy = cudaq::cc::PointerType::get(arrI8Ty); - Value nameVal = rewriter.create(loc, ptrArrTy, - nameObj.getName()); + Value nameVal = cudaq::cc::AddressOfOp::create(rewriter, loc, ptrArrTy, + nameObj.getName()); auto cstrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); Value nameValCStr = - rewriter.create(loc, cstrTy, nameVal); + cudaq::cc::CastOp::create(rewriter, loc, cstrTy, nameVal); - rewriter.create( - loc, std::nullopt, cudaq::opt::QIRRecordOutput, - ValueRange{adaptor.getMeasurement(), nameValCStr}); + func::CallOp::create(rewriter, loc, mlir::TypeRange{}, + cudaq::opt::QIRRecordOutput, + ValueRange{adaptor.getMeasurement(), nameValCStr}); if (isAdaptiveProfile) { std::string funcName = toQisBodyName(std::string("read_result")); rewriter.replaceOpWithNewOp( @@ -377,7 +373,7 @@ struct DiscriminateRewrite : OpConversionPattern { ValueRange{adaptor.getMeasurement()}); } else { Value undef = - rewriter.create(loc, rewriter.getI1Type()); + cudaq::cc::UndefOp::create(rewriter, loc, rewriter.getI1Type()); rewriter.replaceOp(disc, undef); } return success(); @@ -477,7 +473,7 @@ struct WireSetToProfileQIRPrepPass auto loc = builder.getUnknownLoc(); auto createNewDecl = [&](const std::string &name, FunctionType ty) { - auto func = builder.create(loc, name, ty); + auto func = func::FuncOp::create(builder, loc, name, ty); func.setPrivate(); }; auto addNewDecl = [&](std::string &&suffix, FunctionType ty) { @@ -497,7 +493,7 @@ struct WireSetToProfileQIRPrepPass LLVM_DEBUG(llvm::dbgs() << "Module before prep:\n"; op.dump()); // Insert declarations for all the functions 
we *may* be using. - auto qbTy = cudaq::opt::getQubitType(ctx); + auto qbTy = cudaq::cg::getQubitType(ctx); auto targ1Ty = FunctionType::get(ctx, TypeRange{qbTy}, TypeRange{}); auto targ1CtrlTy = FunctionType::get(ctx, TypeRange{qbTy, qbTy}, TypeRange{}); @@ -539,7 +535,7 @@ struct WireSetToProfileQIRPrepPass addDecls("swap", targ2Ty, targ2CtrlTy); addBodyDecl("cnot", targ2Ty); - auto resTy = cudaq::opt::getResultType(ctx); + auto resTy = cudaq::cg::getResultType(ctx); auto measTy = FunctionType::get(ctx, TypeRange{qbTy, resTy}, TypeRange{}); addBodyDecl("mz", measTy); auto readResTy = FunctionType::get(ctx, TypeRange{resTy}, @@ -608,9 +604,8 @@ struct WireSetToProfileQIRPostPass callableRegion->getParentOfType(); if (auto reqQubits = - parentFuncOp - ->getAttr(cudaq::opt::qir0_1::RequiredQubitsAttrName) - .dyn_cast_or_null()) { + dyn_cast_if_present(parentFuncOp->getAttr( + cudaq::opt::qir0_1::RequiredQubitsAttrName))) { std::uint32_t thisFuncReqQubits = 0; if (!reqQubits.strref().getAsInteger(10, thisFuncReqQubits)) { auto thisFuncHighestIdentity = thisFuncReqQubits - 1; @@ -622,9 +617,8 @@ struct WireSetToProfileQIRPostPass } if (auto reqResults = - parentFuncOp - ->getAttr(cudaq::opt::qir0_1::RequiredResultsAttrName) - .dyn_cast_or_null()) { + dyn_cast_if_present(parentFuncOp->getAttr( + cudaq::opt::qir0_1::RequiredResultsAttrName))) { std::uint32_t thisFuncReqResults = 0; if (!reqResults.strref().getAsInteger(10, thisFuncReqResults)) { auto thisFuncHighestResult = thisFuncReqResults - 1; diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp index f6def3c59e0..36dc7517212 100644 --- a/lib/Optimizer/Dialect/CC/CCOps.cpp +++ b/lib/Optimizer/Dialect/CC/CCOps.cpp @@ -50,7 +50,7 @@ std::optional cudaq::opt::factory::getDoubleIfConstant(Value value) { Value cudaq::cc::getByteSizeOfType(OpBuilder &builder, Location loc, Type ty, bool useSizeOf) { auto createInt = [&](std::int32_t byteWidth) -> Value { - return builder.create(loc, 
byteWidth, 64); + return arith::ConstantIntOp::create(builder, loc, byteWidth, 64); }; // Handle primitive types with constant sizes. @@ -91,8 +91,8 @@ Value cudaq::cc::getByteSizeOfType(OpBuilder &builder, Location loc, Type ty, return createInt(byteWidth); } if (useSizeOf) - return builder.create(loc, builder.getI64Type(), - strTy); + return cudaq::cc::SizeOfOp::create(builder, loc, builder.getI64Type(), + strTy); return {}; }) .Case([&](cudaq::cc::ArrayType arrTy) -> Value { @@ -103,8 +103,8 @@ Value cudaq::cc::getByteSizeOfType(OpBuilder &builder, Location loc, Type ty, if (!v) return {}; auto scale = createInt(arrTy.getSize()); - return builder.create(loc, builder.getI64Type(), v, - scale); + return arith::MulIOp::create(builder, loc, builder.getI64Type(), v, + scale); }) .Case([&](cudaq::cc::SpanLikeType) -> Value { // Uniformly on the device size: {ptr, i64} @@ -179,7 +179,7 @@ struct FuseAllocLength : public OpRewritePattern { Type oldTy = alloca.getElementType(); auto arrTy = cudaq::cc::ArrayType::get(context, oldTy, *size); Type origTy = alloca.getType(); - auto newAlloc = rewriter.create(loc, arrTy); + auto newAlloc = cudaq::cc::AllocaOp::create(rewriter, loc, arrTy); rewriter.replaceOpWithNewOp(alloca, origTy, newAlloc); return success(); @@ -206,19 +206,40 @@ LogicalResult cudaq::cc::AllocaOp::verify() { // CastOp //===----------------------------------------------------------------------===// -OpFoldResult cudaq::cc::CastOp::fold(FoldAdaptor adaptor) { - // If cast is a nop, just forward the argument to the uses. - if (getType() == getValue().getType()) - return getValue(); - if (auto optConst = adaptor.getValue()) { +namespace { +/// This pattern folds casts of (some) constants into new constant ops. This is +/// meant to eliminate cast operations when result values are clearly +/// computable. 
+struct FoldCastOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(cudaq::cc::CastOp kast, + PatternRewriter &rewriter) const override { + // If cast is a nop, just forward the argument to the uses. + auto ty = kast.getType(); + if (ty == kast.getValue().getType()) { + Value val = kast.getValue(); + rewriter.replaceOp(kast, val); + return success(); + } + + Operation *defOp = kast.getValue().getDefiningOp(); + if (!defOp) + return failure(); + + Attribute optConst; + if (!matchPattern(kast.getValue(), m_Constant(&optConst))) + return failure(); + // Replace a constant + cast with a new constant of an updated type. - auto ty = getType(); - OpBuilder builder(*this); - auto fltTy = builder.getF32Type(); - auto dblTy = builder.getF64Type(); - auto loc = getLoc(); + auto fltTy = rewriter.getF32Type(); + auto dblTy = rewriter.getF64Type(); + auto loc = kast.getLoc(); + auto truncate = [&](std::int64_t val) -> std::int64_t { - auto srcTy = getValue().getType(); + auto srcTy = kast.getValue().getType(); + if (!srcTy.isIntOrFloat()) + return val; auto srcWidth = srcTy.getIntOrFloatBitWidth(); // Zero-extend to get the original integer value. if (srcWidth < 64) @@ -231,42 +252,51 @@ OpFoldResult cudaq::cc::CastOp::fold(FoldAdaptor adaptor) { if (isa(ty)) { auto width = ty.getIntOrFloatBitWidth(); - if (getZint()) + if (kast.getZint()) val = truncate(val); if (width == 1) { + // C++ semantics. 0 is false. All other values are true. 
bool v = val != 0; - return builder.create(loc, v, width) - .getResult(); + auto c = arith::ConstantIntOp::create(rewriter, loc, v, width); + rewriter.replaceOp(kast, c); + return success(); } - return builder.create(loc, val, width) - .getResult(); - - } else if (ty == fltTy) { - if (getZint()) { + auto c = arith::ConstantIntOp::create(rewriter, loc, val, width); + rewriter.replaceOp(kast, c); + return success(); + } + if (ty == fltTy) { + if (kast.getZint()) { val = truncate(val); APFloat fval(static_cast(static_cast(val))); - return builder.create(loc, fval, fltTy) - .getResult(); + auto c = arith::ConstantFloatOp::create(rewriter, loc, fltTy, fval); + rewriter.replaceOp(kast, c); + return success(); } - if (getSint()) { + if (kast.getSint()) { APFloat fval(static_cast(val)); - return builder.create(loc, fval, fltTy) - .getResult(); + auto c = arith::ConstantFloatOp::create(rewriter, loc, fltTy, fval); + rewriter.replaceOp(kast, c); + return success(); } - } else if (ty == dblTy) { - if (getZint()) { + } + if (ty == dblTy) { + if (kast.getZint()) { val = truncate(val); APFloat fval(static_cast(static_cast(val))); - return builder.create(loc, fval, dblTy) - .getResult(); + auto c = arith::ConstantFloatOp::create(rewriter, loc, dblTy, fval); + rewriter.replaceOp(kast, c); + return success(); } - if (getSint()) { + if (kast.getSint()) { APFloat fval(static_cast(val)); - return builder.create(loc, fval, dblTy) - .getResult(); + auto c = arith::ConstantFloatOp::create(rewriter, loc, dblTy, fval); + rewriter.replaceOp(kast, c); + return success(); } } + return failure(); } // %5 = arith.constant ... 
: F1 @@ -278,27 +308,32 @@ OpFoldResult cudaq::cc::CastOp::fold(FoldAdaptor adaptor) { if (ty == fltTy) { float f = val.convertToDouble(); APFloat fval(f); - return builder.create(loc, fval, fltTy) - .getResult(); + auto c = arith::ConstantFloatOp::create(rewriter, loc, fltTy, fval); + rewriter.replaceOp(kast, c); + return success(); } if (ty == dblTy) { APFloat fval{val.convertToDouble()}; - return builder.create(loc, fval, dblTy) - .getResult(); + auto c = arith::ConstantFloatOp::create(rewriter, loc, dblTy, fval); + rewriter.replaceOp(kast, c); + return success(); } if (isa(ty)) { auto width = ty.getIntOrFloatBitWidth(); - if (getZint()) { + if (kast.getZint()) { std::uint64_t v = val.convertToDouble(); - return builder.create(loc, v, width) - .getResult(); + auto c = arith::ConstantIntOp::create(rewriter, loc, v, width); + rewriter.replaceOp(kast, c); + return success(); } - if (getSint()) { + if (kast.getSint()) { std::int64_t v = val.convertToDouble(); - return builder.create(loc, v, width) - .getResult(); + auto c = arith::ConstantIntOp::create(rewriter, loc, v, width); + rewriter.replaceOp(kast, c); + return success(); } } + return failure(); } // %5 = complex.constant ... : complex @@ -306,6 +341,8 @@ OpFoldResult cudaq::cc::CastOp::fold(FoldAdaptor adaptor) { // ──────────────────────────────────────────── // %6 = complex.constant ... 
: complex if (auto attr = dyn_cast(optConst)) { + if (!isa(ty)) + return failure(); auto eleTy = cast(ty).getElementType(); auto reFp = dyn_cast(attr[0]); auto imFp = dyn_cast(attr[1]); @@ -313,24 +350,35 @@ OpFoldResult cudaq::cc::CastOp::fold(FoldAdaptor adaptor) { if (eleTy == fltTy) { float reVal = reFp.getValue().convertToDouble(); float imVal = imFp.getValue().convertToDouble(); - auto rePart = builder.getFloatAttr(eleTy, APFloat{reVal}); - auto imPart = builder.getFloatAttr(eleTy, APFloat{imVal}); - auto cv = builder.getArrayAttr({rePart, imPart}); - return builder.create(loc, ty, cv).getResult(); + auto rePart = rewriter.getFloatAttr(eleTy, APFloat{reVal}); + auto imPart = rewriter.getFloatAttr(eleTy, APFloat{imVal}); + auto cv = rewriter.getArrayAttr({rePart, imPart}); + auto c = + complex::ConstantOp::create(rewriter, loc, ty, cv).getResult(); + rewriter.replaceOp(kast, c); + return success(); } if (eleTy == dblTy) { double reVal = reFp.getValue().convertToDouble(); double imVal = imFp.getValue().convertToDouble(); - auto rePart = builder.getFloatAttr(eleTy, APFloat{reVal}); - auto imPart = builder.getFloatAttr(eleTy, APFloat{imVal}); - auto cv = builder.getArrayAttr({rePart, imPart}); - return builder.create(loc, ty, cv).getResult(); + auto rePart = rewriter.getFloatAttr(eleTy, APFloat{reVal}); + auto imPart = rewriter.getFloatAttr(eleTy, APFloat{imVal}); + auto cv = rewriter.getArrayAttr({rePart, imPart}); + auto c = + complex::ConstantOp::create(rewriter, loc, ty, cv).getResult(); + rewriter.replaceOp(kast, c); + return success(); } } + // Might be a complex integer? Ignore for now. + return failure(); } + + // this is not a constant we try to fold. 
+ return failure(); } - return nullptr; -} +}; +} // namespace LogicalResult cudaq::cc::CastOp::verify() { auto inTy = getValue().getType(); @@ -536,7 +584,7 @@ struct FuseComplexRe : public OpRewritePattern { if (comcon) { FloatType fltTy = reop.getType(); APFloat reVal = cast(comcon.getValue()[0]).getValue(); - rewriter.replaceOpWithNewOp(reop, reVal, fltTy); + rewriter.replaceOpWithNewOp(reop, fltTy, reVal); return success(); } return failure(); @@ -551,7 +599,7 @@ struct FuseComplexIm : public OpRewritePattern { if (comcon) { FloatType fltTy = imop.getType(); APFloat imVal = cast(comcon.getValue()[1]).getValue(); - rewriter.replaceOpWithNewOp(imop, imVal, fltTy); + rewriter.replaceOpWithNewOp(imop, fltTy, imVal); return success(); } return failure(); @@ -567,7 +615,7 @@ getArbitraryCustomCanonicalizationPatterns(RewritePatternSet &patterns, void cudaq::cc::CastOp::getCanonicalizationPatterns(RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); + patterns.add(context); getArbitraryCustomCanonicalizationPatterns(patterns, context); } @@ -613,7 +661,7 @@ void printInterleavedIndices(OpAsmPrinter &printer, B computePtrOp, if (Value val = dyn_cast(cst)) printer.printOperand(val); else - printer << cst.get().getInt(); + printer << cast(cst).getInt(); }); } @@ -700,7 +748,8 @@ void destructureIndices(Type currType, ArrayRef indices, dynamicIndices.push_back(val); } else { rawConstantIndices.push_back( - iter.template get()); + iter.template dyn_cast< + cudaq::cc::InterleavedArgumentConstantIndex>()); } currType = @@ -737,51 +786,66 @@ void cudaq::cc::ComputePtrOp::build(OpBuilder &builder, OperationState &result, result.addOperands(dynamicIndices); } -OpFoldResult cudaq::cc::ComputePtrOp::fold(FoldAdaptor adaptor) { - if (getDynamicIndices().empty()) - return nullptr; - // Params is a list of possible substitutions (Attributes) the length of the - // SSA arguments. Skip the first one, which is the base pointer argument. 
- auto paramIter = adaptor.getOperands().begin(); - ++paramIter; - - auto dynamicIndexIter = getDynamicIndices().begin(); - SmallVector newConstantIndices; - SmallVector newIndices; - bool changed = false; - - // Build lists of raw constants and SSA values with the SSA values that have - // substituions omitted and properly interleaved in as constants in the first - // list. - for (auto index : getRawConstantIndices()) { - if (index != kDynamicIndex) { - newConstantIndices.push_back(index); - continue; - } - if (auto newVal = dyn_cast_if_present(*paramIter)) { - newConstantIndices.push_back(newVal.getInt()); - changed = true; - } else { - newConstantIndices.push_back(kDynamicIndex); - newIndices.push_back(*dynamicIndexIter); - } - ++dynamicIndexIter; +namespace { +struct FoldComputePtrOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(cudaq::cc::ComputePtrOp ptrOp, + PatternRewriter &rewriter) const override { + if (ptrOp.getDynamicIndices().empty()) + return failure(); + + // Params is a list of possible substitutions (Attributes) the length of the + // SSA arguments. Skip the first one, which is the base pointer argument. + auto paramIter = ptrOp.getOperands().begin(); ++paramIter; - } - // If any new constants were found, update the cc.compute_ptr in place, adding - // the new constants and dropping any unneeded SSA arguments on the floor. 
- if (changed) { - assert(newConstantIndices.size() == getRawConstantIndices().size()); - assert(newIndices.size() < getDynamicIndices().size()); - getDynamicIndicesMutable().assign(newIndices); - setRawConstantIndices(newConstantIndices); - return Value{*this}; + auto dynamicIndexIter = ptrOp.getDynamicIndices().begin(); + SmallVector newConstantIndices; + SmallVector newIndices; + bool changed = false; + + // Build lists of raw constants and SSA values with the SSA values that have + // substituions omitted and properly interleaved in as constants in the + // first list. + for (auto index : ptrOp.getRawConstantIndices()) { + if (index != cudaq::cc::ComputePtrOp::kDynamicIndex) { + newConstantIndices.push_back(index); + continue; + } + + Attribute konstant; + bool handleNonConstant = true; + if (matchPattern(*paramIter, m_Constant(&konstant))) + if (auto newVal = dyn_cast_if_present(konstant)) { + newConstantIndices.push_back(newVal.getInt()); + changed = true; + handleNonConstant = false; + } + if (handleNonConstant) { + newConstantIndices.push_back(cudaq::cc::ComputePtrOp::kDynamicIndex); + newIndices.push_back(*dynamicIndexIter); + } + ++dynamicIndexIter; + ++paramIter; + } + + // If any new constants were found, update the cc.compute_ptr in place, + // adding the new constants and dropping any unneeded SSA arguments on the + // floor. + if (!changed) + return failure(); + + assert(newConstantIndices.size() == ptrOp.getRawConstantIndices().size()); + assert(newIndices.size() < ptrOp.getDynamicIndices().size()); + rewriter.modifyOpInPlace(ptrOp, [&]() { + ptrOp.getDynamicIndicesMutable().assign(newIndices); + ptrOp.setRawConstantIndices(newConstantIndices); + }); + return success(); } - return nullptr; -} +}; -namespace { /// If two (or more) `cc.compute_ptr` are chained then they can be fused into a /// single `cc.compute_ptr`. 
struct FuseAddressArithmetic @@ -876,8 +940,8 @@ struct FuseAddressArithmetic auto eleTy = cast(ptrTy.getElementType()); auto subTy = eleTy.getElementType(); auto simpleTy = cudaq::cc::PointerType::get(subTy); - auto simple = rewriter.create( - ptrOp.getLoc(), simpleTy, ptrOp.getBase()); + auto simple = cudaq::cc::CastOp::create(rewriter, ptrOp.getLoc(), + simpleTy, ptrOp.getBase()); // Collect indices. auto iter = ptrOp.getDynamicIndices().begin(); @@ -901,7 +965,7 @@ struct FuseAddressArithmetic void cudaq::cc::ComputePtrOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); + patterns.add(context); } std::optional @@ -969,50 +1033,66 @@ LogicalResult cudaq::cc::ExtractValueOp::verify() { return success(); } -OpFoldResult cudaq::cc::ExtractValueOp::fold(FoldAdaptor adaptor) { - if (indicesAreConstant()) - return nullptr; +namespace { +struct FoldExtractOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(cudaq::cc::ExtractValueOp extval, + PatternRewriter &rewriter) const override { + if (extval.indicesAreConstant()) + return failure(); - // Params is a list of possible substitutions (Attributes) the length of the - // SSA arguments. Skip the first one, which is the base pointer argument. - auto paramIter = adaptor.getOperands().begin(); - ++paramIter; + // Params is a list of possible substitutions (Attributes) the length of the + // SSA arguments. Skip the first one, which is the base pointer argument. 
+ auto paramIter = extval.getOperands().begin(); + ++paramIter; - auto dynamicIndexIter = getDynamicIndices().begin(); - SmallVector newConstantIndices; - SmallVector newIndices; - bool changed = false; + auto dynamicIndexIter = extval.getDynamicIndices().begin(); + SmallVector newConstantIndices; + SmallVector newIndices; + bool changed = false; + + // Build lists of raw constants and SSA values with the SSA values that have + // substituions omitted and properly interleaved in as constants in the + // first list. + for (auto index : extval.getRawConstantIndices()) { + if (index != cudaq::cc::ExtractValueOp::kDynamicIndex) { + newConstantIndices.push_back(index); + continue; + } - // Build lists of raw constants and SSA values with the SSA values that have - // substituions omitted and properly interleaved in as constants in the first - // list. - for (auto index : getRawConstantIndices()) { - if (index != kDynamicIndex) { - newConstantIndices.push_back(index); - continue; - } - if (auto newVal = dyn_cast_if_present(*paramIter)) { - newConstantIndices.push_back(newVal.getInt()); - changed = true; - } else { - newConstantIndices.push_back(kDynamicIndex); - newIndices.push_back(*dynamicIndexIter); + Attribute konstant; + bool handleNonConstant = true; + if (matchPattern(*paramIter, m_Constant(&konstant))) + if (auto newVal = dyn_cast_if_present(konstant)) { + newConstantIndices.push_back(newVal.getInt()); + changed = true; + handleNonConstant = false; + } + if (handleNonConstant) { + newConstantIndices.push_back(cudaq::cc::ExtractValueOp::kDynamicIndex); + newIndices.push_back(*dynamicIndexIter); + } + ++dynamicIndexIter; + ++paramIter; } - ++dynamicIndexIter; - ++paramIter; - } - // If any new constants were found, update the cc.compute_ptr in place, adding - // the new constants and dropping any unneeded SSA arguments on the floor. 
- if (changed) { - assert(newConstantIndices.size() == getRawConstantIndices().size()); - assert(newIndices.size() < getDynamicIndices().size()); - getDynamicIndicesMutable().assign(newIndices); - setRawConstantIndices(newConstantIndices); - return Value{*this}; + // If any new constants were found, update the cc.compute_ptr in place, + // adding the new constants and dropping any unneeded SSA arguments on the + // floor. + if (!changed) + return failure(); + + assert(newConstantIndices.size() == extval.getRawConstantIndices().size()); + assert(newIndices.size() < extval.getDynamicIndices().size()); + rewriter.modifyOpInPlace(extval, [&]() { + extval.getDynamicIndicesMutable().assign(newIndices); + extval.setRawConstantIndices(newConstantIndices); + }); + return success(); } - return nullptr; -} +}; +} // namespace static ParseResult parseExtractValueIndices( OpAsmParser &parser, @@ -1083,16 +1163,16 @@ struct FuseWithConstantArray if (auto intTy = dyn_cast(extval.getType())) { std::int32_t i = extval.getRawConstantIndices()[0]; auto cval = cast(conarr.getConstantValues()[i]).getInt(); - rewriter.replaceOpWithNewOp(extval, cval, - intTy); + rewriter.replaceOpWithNewOp(extval, intTy, + cval); return success(); } if (auto fltTy = dyn_cast(extval.getType())) { std::int32_t i = extval.getRawConstantIndices()[0]; auto cval = cast(conarr.getConstantValues()[i]).getValue(); - rewriter.replaceOpWithNewOp(extval, cval, - fltTy); + rewriter.replaceOpWithNewOp(extval, fltTy, + cval); return success(); } @@ -1111,7 +1191,7 @@ struct FuseWithConstantArray void cudaq::cc::ExtractValueOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); + patterns.add(context); } //===----------------------------------------------------------------------===// @@ -1368,8 +1448,8 @@ struct ForwardStdvecInitSize if (auto arrTy = dyn_cast(init.getBuffer().getType())) if (!arrTy.isUnknownSize()) { - rewriter.replaceOpWithNewOp( - size, 
arrTy.getSize(), ty); + rewriter.replaceOpWithNewOp(size, ty, + arrTy.getSize()); return success(); } } @@ -1387,9 +1467,6 @@ void cudaq::cc::StdvecSizeOp::getCanonicalizationPatterns( // LoopOp //===----------------------------------------------------------------------===// -// Override the default. -Region &cudaq::cc::LoopOp::getLoopBody() { return getBodyRegion(); } - // The basic block of the step region must end in a continue op, which need not // be pretty printed if the loop has no block arguments. This ensures the step // block is properly terminated. @@ -1401,7 +1478,7 @@ static void ensureStepTerminator(OpBuilder &builder, OperationState &result, auto addContinue = [&]() { OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToEnd(block); - builder.create(result.location); + cudaq::cc::ContinueOp::create(builder, result.location); }; if (block->empty()) { addContinue(); @@ -1629,69 +1706,82 @@ bool cudaq::cc::LoopOp::hasBreakInBody() { } void cudaq::cc::LoopOp::getSuccessorRegions( - std::optional index, ArrayRef operands, - SmallVectorImpl ®ions) { - if (!index) { + RegionBranchPoint point, SmallVectorImpl ®ions) { + if (point.isParent()) { // loop op, successor is either the WHILE region, or the DO region if loop // is post conditional. if (isPostConditional()) - regions.push_back( - RegionSuccessor(&getBodyRegion(), getDoEntryArguments())); + regions.emplace_back(&getBodyRegion(), getDoEntryArguments()); else - regions.push_back( - RegionSuccessor(&getWhileRegion(), getWhileArguments())); + regions.emplace_back(&getWhileRegion(), getWhileArguments()); return; } - switch (index.value()) { - case 0: - // WHILE region, successors are the DO region and either the owning loop op - // (if no else region is present) or the else region. 
- regions.push_back(RegionSuccessor(&getBodyRegion(), getDoEntryArguments())); + + Operation *pred = point.getTerminatorPredecessorOrNull(); + assert(pred && "must have a terminator"); + Region *region = pred->getParentRegion(); + assert(region && "must have a region"); + if (region == &getWhileRegion()) { + // WHILE region, successors are the owning loop op and the DO region. + regions.emplace_back(&getBodyRegion(), getDoEntryArguments()); if (hasPythonElse()) - regions.push_back( - RegionSuccessor(&getElseRegion(), getElseEntryArguments())); + regions.emplace_back(&getElseRegion(), getElseEntryArguments()); else - regions.push_back(RegionSuccessor(getResults())); - break; - case 1: + regions.emplace_back(getOperation(), getResults()); + } else if (region == &getBodyRegion()) { // DO region, successor is STEP region (2) if present, or WHILE region (0) // if STEP is absent. if (hasStep()) - regions.push_back(RegionSuccessor(&getStepRegion(), getStepArguments())); + regions.emplace_back(&getStepRegion(), getStepArguments()); else - regions.push_back( - RegionSuccessor(&getWhileRegion(), getWhileArguments())); + regions.emplace_back(&getWhileRegion(), getWhileArguments()); // If the body contains a break, then the loop op is also a successor. if (hasBreakInBody()) - regions.push_back(RegionSuccessor(getResults())); - break; - case 2: + regions.emplace_back(getOperation(), getResults()); + } else if (region == &getStepRegion()) { // STEP region, if present, WHILE region is always successor. if (hasStep()) - regions.push_back( - RegionSuccessor(&getWhileRegion(), getWhileArguments())); - break; - case 3: + regions.emplace_back(&getWhileRegion(), getWhileArguments()); + } else if (region == &getElseRegion()) { // ELSE region, successors are the owning loop op. 
if (hasPythonElse()) - regions.push_back(RegionSuccessor(getResults())); - break; + regions.emplace_back(getOperation(), getResults()); + } else { + emitOpError("unhandled region"); } } OperandRange -cudaq::cc::LoopOp::getSuccessorEntryOperands(std::optional index) { - assert(index && "invalid index region"); - switch (*index) { - case 0: - if (!isPostConditional()) - return getInitialArgs(); - break; - case 1: - if (isPostConditional()) - return getInitialArgs(); - break; - } +cudaq::cc::LoopOp::getEntrySuccessorOperands(RegionSuccessor successor) { + // If the successor is the 'while' region (Region #0), pass the initial args. + if (successor.getSuccessor() == &getWhileRegion()) + return getInitialArgs(); + + auto *region = successor.getSuccessor(); + if (region == &getWhileRegion() && !isPostConditional()) + return getInitialArgs(); + if (region == &getBodyRegion() && isPostConditional()) + return getInitialArgs(); + + // Otherwise, no operands are passed from the parent. + return {nullptr, 0}; +} + +SmallVector cudaq::cc::LoopOp::getLoopRegions() { + return {&getWhileRegion(), &getBodyRegion(), &getStepRegion()}; +} + +OperandRange +cudaq::cc::LoopOp::getEntrySuccessorOperands(RegionBranchPoint point) { + llvm::errs() << "getEntrySuccessorOperands: " << point << "\n"; + assert(!point.isParent() && "invalid index region"); + Operation *pred = point.getTerminatorPredecessorOrNull(); + assert(pred && "must have a terminator"); + Region *region = pred->getParentRegion(); + if (region == &getWhileRegion() && !isPostConditional()) + return getInitialArgs(); + if (region == &getBodyRegion() && isPostConditional()) + return getInitialArgs(); return {nullptr, 0}; } @@ -1845,7 +1935,7 @@ static void ensureScopeRegionTerminator(OpBuilder &builder, } OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToEnd(block); - builder.create(result.location); + cudaq::cc::ContinueOp::create(builder, result.location); } ParseResult cudaq::cc::ScopeOp::parse(OpAsmParser 
&parser, @@ -1865,13 +1955,12 @@ void cudaq::cc::ScopeOp::getRegionInvocationBounds( ArrayRef attrs, SmallVectorImpl &bounds) {} void cudaq::cc::ScopeOp::getSuccessorRegions( - std::optional index, ArrayRef operands, - SmallVectorImpl ®ions) { - if (!index) { - regions.push_back(RegionSuccessor(&getRegion())); + RegionBranchPoint point, SmallVectorImpl ®ions) { + if (point.isParent()) { + regions.emplace_back(&getRegion()); return; } - regions.push_back(RegionSuccessor(getResults())); + regions.emplace_back(getOperation(), getResults()); } // If quantumAllocs, then just look for any allocate memory effect. Otherwise, @@ -1942,7 +2031,7 @@ struct EraseScopeWhenNotNeeded : public OpRewritePattern { succBlock = rewriter.createBlock( splitBlock, scope.getResultTypes(), SmallVector(scope.getNumResults(), loc)); - rewriter.create(loc, splitBlock); + cf::BranchOp::create(rewriter, loc, splitBlock); } // Inline the cc.scope's region into the parent and create a branch to the // new successor block. @@ -1951,13 +2040,13 @@ struct EraseScopeWhenNotNeeded : public OpRewritePattern { auto *initTerminator = initRegion.back().getTerminator(); auto initTerminatorOperands = initTerminator->getOperands(); rewriter.setInsertionPointToEnd(&initRegion.back()); - rewriter.create(loc, succBlock, initTerminatorOperands); + cf::BranchOp::create(rewriter, loc, succBlock, initTerminatorOperands); rewriter.eraseOp(initTerminator); rewriter.inlineRegionBefore(initRegion, succBlock); // Replace the cc.scope with a branch to the newly inlined region's entry // block. 
rewriter.setInsertionPointToEnd(scopeBlock); - rewriter.create(loc, initBlock, ValueRange{}); + cf::BranchOp::create(rewriter, loc, initBlock, ValueRange{}); rewriter.replaceOp(scope, succBlock->getArguments()); return success(); } @@ -2045,7 +2134,7 @@ static void ensureIfRegionTerminator(OpBuilder &builder, OperationState &result, } OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToEnd(block); - builder.create(result.location); + cudaq::cc::ContinueOp::create(builder, result.location); } ParseResult cudaq::cc::IfOp::parse(OpAsmParser &parser, @@ -2113,16 +2202,31 @@ void cudaq::cc::IfOp::getRegionInvocationBounds( } void cudaq::cc::IfOp::getSuccessorRegions( - std::optional index, ArrayRef operands, - SmallVectorImpl ®ions) { - if (index) { - regions.push_back(RegionSuccessor(getResults())); + RegionBranchPoint point, SmallVectorImpl ®ions) { + if (point.isParent()) { + regions.emplace_back(&getThenRegion()); + if (!getElseRegion().empty()) + regions.emplace_back(&getElseRegion()); + } else { + regions.emplace_back(getOperation(), getResults()); + } +} + +void cudaq::cc::IfOp::getEntrySuccessorRegions( + ArrayRef operands, SmallVectorImpl ®ions) { + FoldAdaptor adaptor(operands); + auto boolAttr = dyn_cast_or_null(adaptor.getCondition()); + if (!boolAttr) + return; + if (boolAttr.getValue()) { + regions.emplace_back(&getThenRegion()); + return; + } + if (!getElseRegion().empty()) { + regions.emplace_back(&getElseRegion()); return; } - // TODO: can constant fold if the condition is a constant here. 
- regions.push_back(RegionSuccessor(&getThenRegion())); - if (!getElseRegion().empty()) - regions.push_back(RegionSuccessor(&getElseRegion())); + regions.emplace_back(getOperation(), getResults()); } template @@ -2136,7 +2240,7 @@ LogicalResult cudaq::cc::verifyConvergentLinearTypesInRegions(Operation *op) { if (!regionOp) return failure(); SmallVector successors; - regionOp.getSuccessorRegions(std::nullopt, {}, successors); + regionOp.getSuccessorRegions(RegionBranchPoint::parent(), successors); // For each region successor, determine the number of distinct linear-typed // definitions in the region. long linearMax = -1; @@ -2168,18 +2272,78 @@ struct KillRegionIfConstant : public OpRewritePattern { // This rewrite will determine if the condition is constant. If it is, then it // will elide the true or false region completely, depending on the constant's - // value. + // value. For cc.if ops with results, it inlines the surviving region and + // replaces the results with the cc.continue operands. LogicalResult matchAndRewrite(cudaq::cc::IfOp ifOp, PatternRewriter &rewriter) const override { auto cond = ifOp.getCondition(); - if (!ifOp.getResults().empty()) - return failure(); auto con = cond.getDefiningOp(); if (!con) return failure(); auto val = con.value(); auto loc = ifOp.getLoc(); - auto truth = rewriter.create(loc, 1, 1); + + // Handle cc.if with results by inlining the surviving region. + if (!ifOp.getResults().empty()) { + Region *survivingRegion = nullptr; + if (val) { + // Condition is true: use then region. + survivingRegion = &ifOp.getThenRegion(); + } else { + // Condition is false: use else region if it exists. + if (ifOp.getElseRegion().empty()) { + // No else region and condition is false - this shouldn't happen for + // a well-formed cc.if with results, but handle it gracefully. + return failure(); + } + survivingRegion = &ifOp.getElseRegion(); + } + + // The surviving region should have a single block ending in cc.continue. 
+ if (survivingRegion->empty()) + return failure(); + + // Collect results from all cc.continue ops and inline the region. + // For a proper cc.if with results, there should be exactly one path + // through each region ending in cc.continue. + SmallVector results; + Block &entryBlock = survivingRegion->front(); + + // Find the terminator cc.continue to get the result values. + // We need to walk all blocks because there might be nested control flow. + for (Block &block : *survivingRegion) { + if (auto contOp = + dyn_cast(block.getTerminator())) { + // For single-block regions, just grab the operands. + if (survivingRegion->hasOneBlock()) { + results = llvm::to_vector(contOp.getOperands()); + rewriter.eraseOp(contOp); + break; + } + } + } + + // If we couldn't find a simple single-block case, fall back to creating + // a new cc.if with only the surviving region. + if (results.empty() || results.size() != ifOp.getNumResults()) { + auto truth = arith::ConstantIntOp::create(rewriter, loc, 1, 1); + rewriter.replaceOpWithNewOp( + ifOp, ifOp.getResultTypes(), truth, + [&](OpBuilder &, Location, Region ®ion) { + region.takeBody(*survivingRegion); + }); + return success(); + } + + // Inline the surviving region's block before the cc.if, replacing + // block arguments with the cc.if's linear args. + rewriter.inlineBlockBefore(&entryBlock, ifOp, ifOp.getLinearArgs()); + rewriter.replaceOp(ifOp, results); + return success(); + } + + // Original logic for cc.if without results. + auto truth = arith::ConstantIntOp::create(rewriter, loc, 1, 1); Region *newRegion = nullptr; if (val) { // The else block, if any, is dead. 
@@ -2194,7 +2358,7 @@ struct KillRegionIfConstant : public OpRewritePattern { OpBuilder::InsertionGuard guard(rewriter); Block *block = new Block(); rewriter.setInsertionPointToEnd(block); - rewriter.create(loc); + cudaq::cc::ContinueOp::create(rewriter, loc); newRegion->push_back(block); } } @@ -2381,8 +2545,8 @@ LogicalResult cudaq::cc::ConditionOp::verify() { return success(); } -MutableOperandRange cudaq::cc::ConditionOp::getMutableSuccessorOperands( - std::optional index) { +MutableOperandRange +cudaq::cc::ConditionOp::getMutableSuccessorOperands(RegionSuccessor point) { return getResultsMutable(); } @@ -2522,8 +2686,8 @@ struct FoldTrivialOffsetOf : public OpRewritePattern { PatternRewriter &rewriter) const override { // If there are no offsets, the offset is 0. if (offOp.getConstantIndices().empty()) { - rewriter.replaceOpWithNewOp(offOp, 0, - offOp.getType()); + rewriter.replaceOpWithNewOp(offOp, offOp.getType(), + 0); return success(); } @@ -2531,8 +2695,8 @@ struct FoldTrivialOffsetOf : public OpRewritePattern { if (std::all_of(offOp.getConstantIndices().begin(), offOp.getConstantIndices().end(), [](std::int32_t i) { return i == 0; })) { - rewriter.replaceOpWithNewOp(offOp, 0, - offOp.getType()); + rewriter.replaceOpWithNewOp(offOp, offOp.getType(), + 0); return success(); } @@ -2600,8 +2764,8 @@ struct ConstArrayConvertToKnownSize std::size_t size = connie.getConstantValuesAttr().size(); auto *ctx = rewriter.getContext(); auto newTy = cudaq::cc::ArrayType::get(ctx, arrTy.getElementType(), size); - auto ca = rewriter.create( - connie.getLoc(), newTy, connie.getConstantValuesAttr()); + auto ca = cudaq::cc::ConstantArrayOp::create( + rewriter, connie.getLoc(), newTy, connie.getConstantValuesAttr()); rewriter.replaceOpWithNewOp(connie, arrTy, ca); return success(); } @@ -2689,10 +2853,10 @@ struct ReplaceConstantSizes : public OpRewritePattern { auto sizeOpSz = sizeOp.getType().getIntOrFloatBitWidth(); auto loc = sizeOp.getLoc(); if (sizeOpSz < vSz) - v = 
rewriter.create(loc, sizeOp.getType(), v); + v = cudaq::cc::CastOp::create(rewriter, loc, sizeOp.getType(), v); else - v = rewriter.create( - loc, sizeOp.getType(), v, cudaq::cc::CastOpMode::Unsigned); + v = cudaq::cc::CastOp::create(rewriter, loc, sizeOp.getType(), v, + cudaq::cc::CastOpMode::Unsigned); } rewriter.replaceOp(sizeOp, v); return success(); diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp index 75be57ad612..77fed739128 100644 --- a/lib/Optimizer/Dialect/CC/CCTypes.cpp +++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp @@ -85,25 +85,17 @@ void cc::StructType::print(AsmPrinter &printer) const { printer << '>'; } -unsigned +llvm::TypeSize cc::StructType::getTypeSizeInBits(const DataLayout &dataLayout, DataLayoutEntryListRef params) const { - return static_cast(getBitSize()); + return llvm::TypeSize::getFixed(getBitSize()); } -unsigned cc::StructType::getABIAlignment(const DataLayout &dataLayout, +uint64_t cc::StructType::getABIAlignment(const DataLayout &dataLayout, DataLayoutEntryListRef params) const { return getAlignment(); } -unsigned -cc::StructType::getPreferredAlignment(const DataLayout &dataLayout, - DataLayoutEntryListRef params) const { - // No distinction between ABI and preferred alignments for now. Clang just - // gives us an alignment value. 
- return getAlignment(); -} - LogicalResult cc::StructType::verify(llvm::function_ref emitError, mlir::StringAttr, llvm::ArrayRef members, diff --git a/lib/Optimizer/Dialect/CC/CMakeLists.txt b/lib/Optimizer/Dialect/CC/CMakeLists.txt index ee725ba8913..6cd7b3c9f69 100644 --- a/lib/Optimizer/Dialect/CC/CMakeLists.txt +++ b/lib/Optimizer/Dialect/CC/CMakeLists.txt @@ -16,8 +16,9 @@ add_cudaq_dialect_library(CCDialect CCOpsIncGen CCTypesIncGen - LINK_LIBS + LINK_LIBS PUBLIC MLIRComplexDialect + MLIRControlFlowDialect MLIRFuncDialect MLIRLLVMDialect MLIRIR diff --git a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc index a2d45bc0d21..75eafaf8f73 100644 --- a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc +++ b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. 
* * * * This source code and the accompanying materials are made available under * @@ -24,8 +24,8 @@ struct AdjustAdjointExpPauliPattern : OpRewritePattern { return failure(); SmallVector negp; if (!pauli.getParameters().empty()) - negp.push_back(rewriter.create(pauli.getLoc(), - pauli.getParameters()[0])); + negp.push_back(arith::NegFOp::create(rewriter, pauli.getLoc(), + pauli.getParameters()[0])); rewriter.replaceOpWithNewOp( pauli, pauli.getResultTypes(), UnitAttr{}, negp, pauli.getControls(), pauli.getTargets(), pauli.getNegatedQubitControlsAttr(), @@ -94,8 +94,8 @@ struct ForwardConstantVeqSizePattern if (!veqTy.hasSpecifiedSize()) return failure(); auto resTy = veqSize.getType(); - rewriter.replaceOpWithNewOp(veqSize, veqTy.getSize(), - resTy); + rewriter.replaceOpWithNewOp(veqSize, resTy, + veqTy.getSize()); return success(); } }; @@ -122,8 +122,8 @@ struct FuseConstantToAllocaPattern : public OpRewritePattern { return failure(); auto loc = alloc.getLoc(); auto resTy = alloc.getType(); - auto newAlloc = rewriter.create( - loc, static_cast(*intCon)); + auto newAlloc = quake::AllocaOp::create(rewriter, loc, + static_cast(*intCon)); rewriter.replaceOpWithNewOp(alloc, resTy, newAlloc); return success(); } @@ -216,8 +216,8 @@ static Value createCast(PatternRewriter &rewriter, Location loc, Value inVal) { auto i64Ty = rewriter.getI64Type(); assert(inVal.getType() != rewriter.getIndexType() && "use of index type is deprecated"); - return rewriter.create(loc, i64Ty, inVal, - cudaq::cc::CastOpMode::Unsigned); + return cudaq::cc::CastOp::create(rewriter, loc, i64Ty, inVal, + cudaq::cc::CastOpMode::Unsigned); } class ExtractRefFromSubVeqPattern @@ -251,18 +251,18 @@ public: auto loc = extract.getLoc(); auto low = [&]() -> Value { if (subveq.hasConstantLowerBound()) - return rewriter.create( - loc, subveq.getConstantLowerBound(), 64); + return arith::ConstantIntOp::create(rewriter, loc, + subveq.getConstantLowerBound(), 64); return subveq.getLower(); }(); if 
(extract.hasConstantIndex()) { - Value cv = rewriter.create( - loc, extract.getConstantIndex(), low.getType()); - offset = rewriter.create(loc, cv, low); + Value cv = arith::ConstantIntOp::create(rewriter, loc, low.getType(), + extract.getConstantIndex()); + offset = arith::AddIOp::create(rewriter, loc, cv, low); } else { auto cast1 = createCast(rewriter, loc, extract.getIndex()); auto cast2 = createCast(rewriter, loc, low); - offset = rewriter.create(loc, cast1, cast2); + offset = arith::AddIOp::create(rewriter, loc, cast1, cast2); } rewriter.replaceOpWithNewOp(extract, subveq.getVeq(), offset); @@ -338,7 +338,8 @@ struct ConcatSizePattern : public OpRewritePattern { if (*arity) { // Get each member for IR legalization. for (auto [i, memTy] : llvm::enumerate(stqTy.getMembers())) { - auto mem = rewriter.create(loc, memTy, opnd, i); + auto mem = + quake::GetMemberOp::create(rewriter, loc, memTy, opnd, i); targets.push_back(mem); } } @@ -354,7 +355,7 @@ struct ConcatSizePattern : public OpRewritePattern { // Leans into the relax_size canonicalization pattern. 
auto newTy = quake::VeqType::get(ctx, sum); - Value newOp = rewriter.create(loc, newTy, targets); + Value newOp = quake::ConcatOp::create(rewriter, loc, newTy, targets); auto noSizeTy = quake::VeqType::getUnsized(ctx); rewriter.replaceOpWithNewOp(concat, noSizeTy, newOp); return success(); @@ -449,8 +450,9 @@ struct ForwardAllocaTypePattern auto targ = initState.getTargets(); if (auto targTy = dyn_cast(targ.getType())) if (targTy.hasSpecifiedSize()) { - auto newInit = rewriter.create( - initState.getLoc(), targTy, targ, initState.getState()); + auto newInit = quake::InitializeStateOp::create( + rewriter, initState.getLoc(), targTy, targ, + initState.getState()); rewriter.replaceOpWithNewOp(initState, isTy, newInit); return success(); @@ -493,9 +495,9 @@ struct FixUnspecifiedSubveqPattern : public OpRewritePattern { subveq.getConstantUpperBound() - subveq.getConstantLowerBound() + 1u; auto szVecTy = quake::VeqType::get(ctx, size); auto loc = subveq.getLoc(); - auto subv = rewriter.create( - loc, szVecTy, subveq.getVeq(), subveq.getLower(), subveq.getUpper(), - subveq.getRawLower(), subveq.getRawUpper()); + auto subv = quake::SubVeqOp::create( + rewriter, loc, szVecTy, subveq.getVeq(), subveq.getLower(), + subveq.getUpper(), subveq.getRawLower(), subveq.getRawUpper()); rewriter.replaceOpWithNewOp(subveq, veqTy, subv); return success(); } @@ -606,8 +608,8 @@ public: // Lambda to create a Value for the lower bound of `s`. auto lofunc = [&](quake::SubVeqOp s) -> Value { if (s.hasConstantLowerBound()) - return rewriter.create( - loc, s.getConstantLowerBound(), 64); + return arith::ConstantIntOp::create(rewriter, loc, + s.getConstantLowerBound(), 64); return s.getLower(); }; auto priorlo = lofunc(prior); @@ -616,15 +618,15 @@ public: // Lambda for creating the upper bound Value. 
auto svup = [&]() -> Value { if (subveq.hasConstantUpperBound()) - return rewriter.create( - loc, subveq.getConstantUpperBound(), 64); + return arith::ConstantIntOp::create(rewriter, loc, + subveq.getConstantUpperBound(), 64); return subveq.getUpper(); }(); auto cast1 = createCast(rewriter, loc, priorlo); auto cast2 = createCast(rewriter, loc, svlo); auto cast3 = createCast(rewriter, loc, svup); - Value sum1 = rewriter.create(loc, cast1, cast2); - Value sum2 = rewriter.create(loc, cast1, cast3); + Value sum1 = arith::AddIOp::create(rewriter, loc, cast1, cast2); + Value sum2 = arith::AddIOp::create(rewriter, loc, cast1, cast3); auto veqTy = subveq.getType(); rewriter.replaceOpWithNewOp(subveq, veqTy, prior.getVeq(), sum1, sum2); @@ -648,8 +650,8 @@ struct FoldInitStateSizePattern : public OpRewritePattern { dyn_cast(initState.getTargets().getType())) if (veqTy.hasSpecifiedSize()) { std::size_t numQubits = veqTy.getSize(); - rewriter.replaceOpWithNewOp(veqSize, numQubits, - veqSize.getType()); + rewriter.replaceOpWithNewOp( + veqSize, veqSize.getType(), numQubits); return success(); } return failure(); @@ -700,12 +702,12 @@ struct MergeRotationPattern : public OpRewritePattern { auto adjAttr = rotate.getIsAdjAttr(); auto newAngle = [&]() -> Value { if (input.isAdj() == rotate.isAdj()) - return rewriter.create(loc, angle1, angle2); + return arith::AddFOp::create(rewriter, loc, angle1, angle2); // One is adjoint, so it should be subtracted from the other. 
if (input.isAdj()) - return rewriter.create(loc, angle2, angle1); + return arith::SubFOp::create(rewriter, loc, angle2, angle1); adjAttr = input.getIsAdjAttr(); - return rewriter.create(loc, angle1, angle2); + return arith::SubFOp::create(rewriter, loc, angle1, angle2); }(); rewriter.replaceOpWithNewOp(rotate, rotate.getResultTypes(), adjAttr, ValueRange{newAngle}, ValueRange{}, @@ -726,7 +728,7 @@ struct ForwardRelaxedSizePattern : public OpRewritePattern { PatternRewriter &rewriter) const override { auto inpVec = relax.getInputVec(); bool replaced = false; - rewriter.replaceOpWithIf(relax, inpVec, [&](OpOperand &use) { + rewriter.replaceUsesWithIf(relax, inpVec, [&](OpOperand &use) { bool res = false; if (Operation *user = use.getOwner()) res = isQuakeOperation(user) && !isa(user); diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp index bc5a2865773..c3e85b6b0de 100644 --- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp +++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp @@ -127,16 +127,18 @@ Value quake::createConstantAlloca(PatternRewriter &builder, Location loc, auto newAlloca = [&]() { if (isa(result.getType()) && cast(result.getType()).hasSpecifiedSize()) { - return builder.create( - loc, cast(result.getType()).getSize()); + return quake::AllocaOp::create( + builder, loc, cast(result.getType()).getSize()); } auto constOp = cast(args[0].getDefiningOp()); - return builder.create( - loc, static_cast( - cast(constOp.getValue()).getInt())); + return quake::AllocaOp::create( + builder, loc, + static_cast( + cast(constOp.getValue()).getInt())); }(); - return builder.create( - loc, quake::VeqType::getUnsized(builder.getContext()), newAlloca); + return quake::RelaxSizeOp::create( + builder, loc, quake::VeqType::getUnsized(builder.getContext()), + newAlloca); } LogicalResult quake::AllocaOp::verify() { @@ -1175,15 +1177,16 @@ using EffectsVectorImpl = /// reference or value form. 
A operation with modeless effects is not removed /// when its result(s) is (are) unused. [[maybe_unused]] inline static void -getModelessEffectsImpl(EffectsVectorImpl &effects, ValueRange controls, - ValueRange targets) { - for (auto v : controls) - effects.emplace_back(MemoryEffects::Read::get(), v, +getModelessEffectsImpl(EffectsVectorImpl &effects, + MutableArrayRef controls, + MutableArrayRef targets) { + for (OpOperand &v : controls) + effects.emplace_back(MemoryEffects::Read::get(), &v, SideEffects::DefaultResource::get()); - for (auto v : targets) { - effects.emplace_back(MemoryEffects::Read::get(), v, + for (OpOperand &v : targets) { + effects.emplace_back(MemoryEffects::Read::get(), &v, SideEffects::DefaultResource::get()); - effects.emplace_back(MemoryEffects::Write::get(), v, + effects.emplace_back(MemoryEffects::Write::get(), &v, SideEffects::DefaultResource::get()); } } @@ -1195,36 +1198,37 @@ getModelessEffectsImpl(EffectsVectorImpl &effects, ValueRange controls, /// have both a read and write effect. If the operand is in value form, the /// operation introduces no effects on that operand. 
inline static void getModedEffectsImpl(EffectsVectorImpl &effects, - ValueRange controls, - ValueRange targets) { - for (auto v : controls) - if (isa(v.getType())) - effects.emplace_back(MemoryEffects::Read::get(), v, + MutableArrayRef controls, + MutableArrayRef targets) { + for (OpOperand &v : controls) + if (isa(v.get().getType())) + effects.emplace_back(MemoryEffects::Read::get(), &v, SideEffects::DefaultResource::get()); - for (auto v : targets) - if (isa(v.getType())) { - effects.emplace_back(MemoryEffects::Read::get(), v, + for (OpOperand &v : targets) + if (isa(v.get().getType())) { + effects.emplace_back(MemoryEffects::Read::get(), &v, SideEffects::DefaultResource::get()); - effects.emplace_back(MemoryEffects::Write::get(), v, + effects.emplace_back(MemoryEffects::Write::get(), &v, SideEffects::DefaultResource::get()); } } /// Quake reset has modeless effects. void quake::getResetEffectsImpl(EffectsVectorImpl &effects, - ValueRange targets) { + MutableArrayRef targets) { getModedEffectsImpl(effects, {}, targets); } /// Quake measurement operations have moded effects. void quake::getMeasurementEffectsImpl(EffectsVectorImpl &effects, - ValueRange targets) { + MutableArrayRef targets) { getModedEffectsImpl(effects, {}, targets); } /// Quake quantum operators have moded effects. 
void quake::getOperatorEffectsImpl(EffectsVectorImpl &effects, - ValueRange controls, ValueRange targets) { + MutableArrayRef controls, + MutableArrayRef targets) { getModedEffectsImpl(effects, controls, targets); } @@ -1262,7 +1266,5 @@ VERIFY_OPS(INSTANTIATE_LINEAR_TYPE_VERIFY) // Generated logic //===----------------------------------------------------------------------===// -using namespace cudaq; - #define GET_OP_CLASSES #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.cpp.inc" diff --git a/lib/Optimizer/Transforms/AddDeallocs.cpp b/lib/Optimizer/Transforms/AddDeallocs.cpp index f2dad559e0d..9ad8071a8c4 100644 --- a/lib/Optimizer/Transforms/AddDeallocs.cpp +++ b/lib/Optimizer/Transforms/AddDeallocs.cpp @@ -7,14 +7,17 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "mlir/IR/Dominance.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +namespace cudaq::opt { +#define GEN_PASS_DEF_QUAKEADDDEALLOCS +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt + #define DEBUG_TYPE "add-deallocs" using namespace mlir; @@ -135,7 +138,7 @@ inline void generateDeallocsForSet(PatternRewriter &rewriter, dyn_cast(*a->getUsers().begin())) v = initState; } - rewriter.create(a->getLoc(), v); + quake::DeallocOp::create(rewriter, a->getLoc(), v); } } @@ -144,7 +147,6 @@ template LogicalResult addDeallocations(OP wrapper, PatternRewriter &rewriter, const DeallocationAnalysisInfo &infoMap, const DominanceInfo &domInfo) { - rewriter.startRootUpdate(wrapper); llvm::DenseSet allocs; for (auto &[op, done] : infoMap.allocMap) if ((op->getParentOp() == wrapper.getOperation()) && !done) @@ -158,48 +160,49 @@ LogicalResult addDeallocations(OP wrapper, PatternRewriter &rewriter, 
LLVM_DEBUG(llvm::dbgs() << "adding deallocations to " << wrapper.getOperation() << '\n'); - // 1) Create an exit block to stick dealloc operations in. - auto *exitBlock = new Block; - exitBlock->addArguments( - wrapper.getResultTypes(), - SmallVector{wrapper.getNumResults(), wrapper.getLoc()}); - wrapper.getRegion().push_back(exitBlock); - - // 2) Update all the RET ops (at top level) to branches to the exit block - // when it is correct to do so. Otherwise, add the subset of deallocations - // inline before each RET op. - auto entireSetDominates = [&](RET ret) { - for (auto *alloc : allocs) - if (!domInfo.dominates(alloc, ret)) - return false; - return true; - }; - for (Block &block : wrapper.getRegion()) - for (Operation &op : block) - if (auto ret = dyn_cast(op)) { - if (entireSetDominates(ret)) { - // Replace the RET op with a branch to the shared deallocation block. - rewriter.setInsertionPoint(ret); - rewriter.replaceOpWithNewOp(ret, exitBlock, - ret.getOperands()); - } else { - // Collect only the subset that dominates this RET op. Insert the - // deallocations directly in front of the RET op. - llvm::DenseSet subset; - for (auto *alloc : allocs) - if (domInfo.dominates(alloc, ret)) - subset.insert(alloc); - rewriter.setInsertionPoint(ret); - generateDeallocsForSet(rewriter, subset); + rewriter.modifyOpInPlace(wrapper, [&]() { + // 1) Create an exit block to stick dealloc operations in. + auto *exitBlock = new Block; + exitBlock->addArguments( + wrapper.getResultTypes(), + SmallVector{wrapper.getNumResults(), wrapper.getLoc()}); + wrapper.getRegion().push_back(exitBlock); + + // 2) Update all the RET ops (at top level) to branches to the exit block + // when it is correct to do so. Otherwise, add the subset of deallocations + // inline before each RET op. 
+ auto entireSetDominates = [&](RET ret) { + for (auto *alloc : allocs) + if (!domInfo.dominates(alloc, ret)) + return false; + return true; + }; + for (Block &block : wrapper.getRegion()) + for (Operation &op : block) + if (auto ret = dyn_cast(op)) { + if (entireSetDominates(ret)) { + // Replace the RET op with a branch to the shared deallocation + // block. + rewriter.setInsertionPoint(ret); + rewriter.replaceOpWithNewOp(ret, exitBlock, + ret.getOperands()); + } else { + // Collect only the subset that dominates this RET op. Insert the + // deallocations directly in front of the RET op. + llvm::DenseSet subset; + for (auto *alloc : allocs) + if (domInfo.dominates(alloc, ret)) + subset.insert(alloc); + rewriter.setInsertionPoint(ret); + generateDeallocsForSet(rewriter, subset); + } } - } - - // 3) Create the deallocations. - rewriter.setInsertionPointToEnd(exitBlock); - generateDeallocsForSet(rewriter, allocs); - rewriter.create(wrapper.getLoc(), exitBlock->getArguments()); - rewriter.finalizeRootUpdate(wrapper); + // 3) Create the deallocations. + rewriter.setInsertionPointToEnd(exitBlock); + generateDeallocsForSet(rewriter, allocs); + RET::create(rewriter, wrapper.getLoc(), exitBlock->getArguments()); + }); LLVM_DEBUG(llvm::dbgs() << "updated " << wrapper.getOperation() << '\n'); return success(); } @@ -243,7 +246,7 @@ using ScopeDeallocPattern = /// dealloc ops along non-trivial control paths in the presence of global jumps. /// DeallocationAnalysis will flag any unwinding jumps as errors. 
class QuakeAddDeallocsPass - : public cudaq::opt::QuakeAddDeallocsBase { + : public cudaq::opt::impl::QuakeAddDeallocsBase { public: void runOnOperation() override { func::FuncOp funcOp = getOperation(); diff --git a/lib/Optimizer/Transforms/AddMeasurements.cpp b/lib/Optimizer/Transforms/AddMeasurements.cpp index 1b71702ae1a..e3bda7eec30 100644 --- a/lib/Optimizer/Transforms/AddMeasurements.cpp +++ b/lib/Optimizer/Transforms/AddMeasurements.cpp @@ -8,8 +8,6 @@ #include "PassDetails.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -83,26 +81,26 @@ addMeasurements(func::FuncOp funcOp, SmallVector &allocations, // Replace every func.return in the function with a branch to the new block. for (auto returnOp : returnsToReplace) { OpBuilder builder(returnOp); - builder.create(returnOp.getLoc(), newBlock, - returnOp.getOperands()); + cf::BranchOp::create(builder, returnOp.getLoc(), newBlock, + returnOp.getOperands()); returnOp.erase(); } // Set insertion point to the new block and add measurements builder.setInsertionPointToEnd(newBlock); auto measTy = quake::MeasureType::get(builder.getContext()); - for (auto &[index, alloca] : llvm::enumerate(allocations)) { + for (auto [index, alloca] : llvm::enumerate(allocations)) { if (isa(alloca->getResult(0).getType())) { auto stdvecTy = cudaq::cc::StdvecType::get(measTy); - builder.create(loc, stdvecTy, - ValueRange{alloca->getResult(0)}); + quake::MzOp::create(builder, loc, stdvecTy, + ValueRange{alloca->getResult(0)}); } else { - builder.create(loc, measTy, alloca->getResult(0)); + quake::MzOp::create(builder, loc, measTy, alloca->getResult(0)); } } // Add the final return using block arguments - builder.create(loc, newBlock->getArguments()); + func::ReturnOp::create(builder, loc, newBlock->getArguments()); return success(); } @@ 
-122,8 +120,8 @@ struct AddMeasurementsPass /// NOTE: Having a conditional on a measurement indicates that a measurement /// is present, however, it does not guarantee that all the allocated qubits /// are measured. - if (auto boolAttr = func->getAttr("qubitMeasurementFeedback") - .dyn_cast_or_null()) { + if (auto boolAttr = dyn_cast_if_present( + func->getAttr("qubitMeasurementFeedback"))) { if (boolAttr.getValue()) return; } diff --git a/lib/Optimizer/Transforms/AddMetadata.cpp b/lib/Optimizer/Transforms/AddMetadata.cpp index 873552f2245..bb356a952ac 100644 --- a/lib/Optimizer/Transforms/AddMetadata.cpp +++ b/lib/Optimizer/Transforms/AddMetadata.cpp @@ -8,22 +8,22 @@ #include "cudaq/Optimizer/Transforms/AddMetadata.h" #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Dominance.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" -using namespace mlir; +namespace cudaq::opt { +#define GEN_PASS_DEF_QUAKEADDMETADATA +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt #define DEBUG_TYPE "add-metadata" +using namespace mlir; + static cudaq::cc::AllocaOp seekAllocaFrom(Value v); static cudaq::cc::AllocaOp seekAllocaFrom(Operation *op) { @@ -155,7 +155,7 @@ namespace { /// This pass will analyze Quake functions and attach metadata (as an MLIR /// function attribute) for specific features. 
class QuakeAddMetadataPass - : public cudaq::opt::QuakeAddMetadataBase { + : public cudaq::opt::impl::QuakeAddMetadataBase { public: QuakeAddMetadataPass() = default; diff --git a/lib/Optimizer/Transforms/AggressiveInlining.cpp b/lib/Optimizer/Transforms/AggressiveInlining.cpp index 802477ae976..bf7066c2bbf 100644 --- a/lib/Optimizer/Transforms/AggressiveInlining.cpp +++ b/lib/Optimizer/Transforms/AggressiveInlining.cpp @@ -39,7 +39,7 @@ getConversionMap(ModuleOp module) { cudaq::runtime::mangledNameMap)) { for (auto namedAttr : mangledNameMap) { auto key = namedAttr.getName(); - auto val = namedAttr.getValue().cast().getValue(); + auto val = cast(namedAttr.getValue()).getValue(); result.insert({val, key}); } return result; @@ -68,9 +68,9 @@ class RewriteCall : public OpRewritePattern { auto loc = call.getLoc(); auto funcTy = call.getCalleeType(); cudaq::opt::factory::getOrAddFunc(loc, directName, funcTy, module); - rewriter.startRootUpdate(call); - call.setCalleeAttr(SymbolRefAttr::get(ctx, directName)); - rewriter.finalizeRootUpdate(call); + rewriter.modifyOpInPlace(call, [&]() { + call.setCalleeAttr(SymbolRefAttr::get(ctx, directName)); + }); LLVM_DEBUG(llvm::dbgs() << "Rewriting " << directName << '\n'); return success(); } @@ -93,7 +93,7 @@ class ConvertToDirectCalls LLVM_DEBUG(llvm::dbgs() << "Processing: " << module << '\n'); RewritePatternSet patterns(ctx); patterns.insert(ctx, *indirectMapOpt, module); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); } } diff --git a/lib/Optimizer/Transforms/ApplyControlNegations.cpp b/lib/Optimizer/Transforms/ApplyControlNegations.cpp index 1d0885f5fe1..78915b17e2c 100644 --- a/lib/Optimizer/Transforms/ApplyControlNegations.cpp +++ b/lib/Optimizer/Transforms/ApplyControlNegations.cpp @@ -8,8 +8,6 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include 
"cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "mlir/IR/PatternMatch.h" @@ -40,29 +38,26 @@ class ReplaceNegativeControl : public OpRewritePattern { for (auto negationIter : llvm::enumerate(negations.value())) if (negationIter.value()) - rewriter.create( - loc, ValueRange(), - ValueRange{op.getControls()[negationIter.index()]}); + quake::XOp::create(rewriter, loc, ValueRange(), + ValueRange{op.getControls()[negationIter.index()]}); if constexpr (std::is_same_v) { - rewriter.create( - loc, TypeRange{}, op.getIsAdjAttr(), op.getParameters(), + quake::ExpPauliOp::create( + rewriter, loc, TypeRange{}, op.getIsAdjAttr(), op.getParameters(), op.getControls(), op.getTargets(), op.getNegatedQubitControlsAttr(), op.getPauli(), op.getPauliLiteralAttr()); } else if constexpr (std::is_same_v) { - rewriter.create(loc, op.getGeneratorAttr(), op.getIsAdj(), - op.getParameters(), op.getControls(), - op.getTargets()); + Op::create(rewriter, loc, op.getGeneratorAttr(), op.getIsAdj(), + op.getParameters(), op.getControls(), op.getTargets()); } else { - rewriter.create(loc, op.getIsAdj(), op.getParameters(), - op.getControls(), op.getTargets()); + Op::create(rewriter, loc, op.getIsAdj(), op.getParameters(), + op.getControls(), op.getTargets()); } for (auto negationIter : llvm::enumerate(negations.value())) if (negationIter.value()) - rewriter.create( - loc, ValueRange(), - ValueRange{op.getControls()[negationIter.index()]}); + quake::XOp::create(rewriter, loc, ValueRange(), + ValueRange{op.getControls()[negationIter.index()]}); rewriter.eraseOp(op); return success(); diff --git a/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp b/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp index c428bfdec2a..4676d4588a3 100644 --- a/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp +++ b/lib/Optimizer/Transforms/ApplyOpSpecialization.cpp @@ -130,8 +130,8 @@ struct 
ApplyOpAnalysis { auto *ctx = newFunc.getContext(); OpBuilder builder(ctx); builder.setInsertionPoint(&newFunc.front().front()); - auto relax = builder.create( - newFunc.getLoc(), quake::VeqType::getUnsized(ctx), + auto relax = quake::RelaxSizeOp::create( + builder, newFunc.getLoc(), quake::VeqType::getUnsized(ctx), newFunc.front().getArgument(pos)); newFunc.front().getArgument(pos).replaceAllUsesExcept( relax.getResult(), relax.getOperation()); @@ -143,10 +143,10 @@ struct ApplyOpAnalysis { entry.push_front(c); module.push_back(newFunc); OpBuilder builder(apply); - auto newApply = builder.create( - apply.getLoc(), apply.getResultTypes(), - SymbolRefAttr::get(ctx, calleeName), apply.getIndirectCallee(), - apply.getIsAdj(), apply.getControls(), preservedArgs); + auto newApply = quake::ApplyOp::create( + builder, apply.getLoc(), apply.getResultTypes(), + SymbolRefAttr::get(ctx, calleeName), apply.getIsAdj(), + apply.getControls(), preservedArgs); apply->replaceAllUsesWith(newApply.getResults()); apply->dropAllReferences(); apply->erase(); @@ -318,8 +318,8 @@ struct ApplyOpPattern : public OpRewritePattern { auto unsizedVeqTy = quake::VeqType::getUnsized(ctx); SmallVector newArgs; if (!apply.getControls().empty()) { - auto consOp = rewriter.create( - apply.getLoc(), unsizedVeqTy, apply.getControls()); + auto consOp = quake::ConcatOp::create(rewriter, apply.getLoc(), + unsizedVeqTy, apply.getControls()); newArgs.push_back(consOp); } for (auto [v, toTy] : @@ -328,8 +328,8 @@ struct ApplyOpPattern : public OpRewritePattern { continue; Value arg = v; if (arg.getType() != toTy) - arg = - rewriter.create(apply.getLoc(), unsizedVeqTy, arg); + arg = quake::ConcatOp::create(rewriter, apply.getLoc(), unsizedVeqTy, + arg); newArgs.emplace_back(arg); } LLVM_DEBUG(llvm::dbgs() << "replacing: " << apply << '\n'); @@ -378,7 +378,7 @@ class ApplySpecializationPass auto *ctx = module.getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if 
(failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); ApplyOpAnalysis analysis(module, constantPropagation); @@ -494,7 +494,7 @@ class ApplySpecializationPass // This is a quantum op. It should be updated with an additional control // argument, `newCond`. - auto arrAttr = op->getAttr(segmentSizes).cast(); + auto arrAttr = cast(op->getAttr(segmentSizes)); SmallVector arrRef{arrAttr.asArrayRef().begin(), arrAttr.asArrayRef().end()}; SmallVector operands(op->getOperands().begin(), @@ -518,9 +518,10 @@ class ApplySpecializationPass SmallVector newControls = {newCond}; newControls.append(apply.getControls().begin(), apply.getControls().end()); - auto newApply = builder.create( - apply.getLoc(), apply.getResultTypes(), apply.getCalleeAttr(), - apply.getIsAdjAttr(), newControls, apply.getActuals()); + auto newApply = quake::ApplyOp::create( + builder, apply.getLoc(), apply.getResultTypes(), + apply.getCalleeAttr(), apply.getIsAdjAttr(), newControls, + apply.getActuals()); apply->replaceAllUsesWith(newApply.getResults()); apply->erase(); } else if (isQuantumKernelCall(op)) { @@ -614,7 +615,7 @@ class ApplySpecializationPass static Value createIntConstant(OpBuilder &builder, Location loc, Type ty, std::int64_t val) { auto attr = builder.getIntegerAttr(ty, val); - return builder.create(loc, attr, ty); + return arith::ConstantOp::create(builder, loc, ty, attr); } /// Clone the LoopOp, \p loop, and return a new LoopOp that runs the loop @@ -640,31 +641,32 @@ class ApplySpecializationPass auto zero = createIntConstant(builder, loc, newStepVal.getType(), 0); if (!stepIsAnAddOp) { // Negate the step value when arith.subi. 
- newStepVal = builder.create(loc, zero, newStepVal); + newStepVal = arith::SubIOp::create(builder, loc, zero, newStepVal); } - Value iters = builder.create( - loc, newTermVal, loop.getInitialArgs()[loopComponents->induction]); + Value iters = + arith::SubIOp::create(builder, loc, newTermVal, + loop.getInitialArgs()[loopComponents->induction]); auto cmpOp = cast(loopComponents->compareOp); auto pred = cmpOp.getPredicate(); auto one = createIntConstant(builder, loc, iters.getType(), 1); if (cudaq::opt::isSemiOpenPredicate(pred)) { - Value negStepCond = builder.create( - loc, arith::CmpIPredicate::slt, newStepVal, zero); + Value negStepCond = arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::slt, newStepVal, zero); auto negOne = createIntConstant(builder, loc, iters.getType(), -1); - Value adj = builder.create(loc, iters.getType(), - negStepCond, one, negOne); - iters = builder.create(loc, iters, adj); + Value adj = arith::SelectOp::create(builder, loc, iters.getType(), + negStepCond, one, negOne); + iters = arith::AddIOp::create(builder, loc, iters, adj); } - iters = builder.create(loc, iters, newStepVal); - iters = builder.create(loc, iters, newStepVal); - Value noLoopCond = builder.create( - loc, arith::CmpIPredicate::sgt, iters, zero); - iters = builder.create(loc, iters.getType(), noLoopCond, - iters, zero); - Value lastIter = builder.create(loc, iters, one); - Value nStep = builder.create(loc, lastIter, newStepVal); - Value newInitVal = - builder.create(loc, loopComponents->initialValue, nStep); + iters = arith::AddIOp::create(builder, loc, iters, newStepVal); + iters = arith::DivSIOp::create(builder, loc, iters, newStepVal); + Value noLoopCond = arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::sgt, iters, zero); + iters = arith::SelectOp::create(builder, loc, iters.getType(), noLoopCond, + iters, zero); + Value lastIter = arith::SubIOp::create(builder, loc, iters, one); + Value nStep = arith::MulIOp::create(builder, loc, lastIter, 
newStepVal); + Value newInitVal = arith::AddIOp::create( + builder, loc, loopComponents->initialValue, nStep); // Create the list of input arguments to loop. We're going to add an // argument to the end that is the number of iterations left to execute. @@ -679,8 +681,9 @@ class ApplySpecializationPass // through the new argument. In the stepRegion, decrement the new argument // by 1 and convert the original step expression to be a negative step. IRRewriter rewriter(builder); - return rewriter.create( - loc, ValueRange{inputs}.getTypes(), inputs, /*postCondition=*/false, + return cudaq::cc::LoopOp::create( + rewriter, loc, ValueRange{inputs}.getTypes(), inputs, + /*postCondition=*/false, [&](OpBuilder &builder, Location loc, Region ®ion) { IRMapping dummyMap; loop.getWhileRegion().cloneInto(®ion, dummyMap); @@ -694,8 +697,8 @@ class ApplySpecializationPass Value trip = block.getArguments().back(); args.push_back(trip); auto zero = createIntConstant(builder, loc, trip.getType(), 0); - auto newCond = rewriter.create( - loc, arith::CmpIPredicate::sgt, trip, zero); + auto newCond = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, trip, zero); rewriter.replaceOpWithNewOp(condOp, newCond, args); }, @@ -725,15 +728,15 @@ class ApplySpecializationPass auto *stepOp = contOp.getOperand(0).getDefiningOp(); auto newBump = [&]() -> Value { if (stepIsAnAddOp) - return rewriter.create( - loc, stepOp->getOperand(commuteTheAddOp ? 1 : 0), + return arith::SubIOp::create( + rewriter, loc, stepOp->getOperand(commuteTheAddOp ? 1 : 0), stepOp->getOperand(commuteTheAddOp ? 
0 : 1)); - return rewriter.create(loc, stepOp->getOperands()); + return arith::AddIOp::create(rewriter, loc, stepOp->getOperands()); }(); args[loopComponents->induction] = newBump; auto one = createIntConstant(rewriter, loc, iters.getType(), 1); - args.push_back(rewriter.create( - loc, entry.getArguments().back(), one)); + args.push_back(arith::SubIOp::create( + rewriter, loc, entry.getArguments().back(), one)); rewriter.replaceOpWithNewOp(contOp, args); }); } @@ -787,9 +790,10 @@ class ApplySpecializationPass mlir::UnitAttr newIsAdj = applyOp.getIsAdj() ? mlir::UnitAttr{} : mlir::UnitAttr::get(builder.getContext()); - builder.create( - applyOp.getLoc(), applyOp.getResultTypes(), applyOp.getCalleeAttr(), - newIsAdj, applyOp.getControls(), applyOp.getActuals()); + quake::ApplyOp::create(builder, applyOp.getLoc(), + applyOp.getResultTypes(), + applyOp.getCalleeAttr(), newIsAdj, + applyOp.getControls(), applyOp.getActuals()); applyOp->erase(); continue; } @@ -797,13 +801,13 @@ class ApplySpecializationPass bool opWasNegated = false; IRMapping mapper; LLVM_DEBUG(llvm::dbgs() << "moving quantum op: " << *op << ".\n"); - auto arrAttr = op->getAttr(segmentSizes).cast(); + auto arrAttr = cast(op->getAttr(segmentSizes)); // Walk over any floating-point parameters to `op` and negate them. 
for (auto iter = op->getOperands().begin(), endIter = op->getOperands().begin() + arrAttr[0]; iter != endIter; ++iter) { Value val = *iter; - Value neg = builder.create(loc, val.getType(), val); + Value neg = arith::NegFOp::create(builder, loc, val.getType(), val); mapper.map(val, neg); opWasNegated = true; } @@ -845,7 +849,7 @@ class ApplySpecializationPass auto *ctx = module.getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx, constantPropagation); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After apply specialization:\n" << module << "\n\n"); diff --git a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp index c8643a9d854..091a73b776d 100644 --- a/lib/Optimizer/Transforms/ArgumentSynthesis.cpp +++ b/lib/Optimizer/Transforms/ArgumentSynthesis.cpp @@ -122,10 +122,10 @@ class ArgumentSynthesisPass OpBuilder builder{ctx}; Block *splitBlock = entry.splitBlock(entry.begin()); builder.setInsertionPointToEnd(&entry); - builder.create(func.getLoc(), &subst.getBody().front()); + cf::BranchOp::create(builder, func.getLoc(), &subst.getBody().front()); Operation *lastOp = &subst.getBody().front().back(); builder.setInsertionPointToEnd(&subst.getBody().front()); - builder.create(func.getLoc(), splitBlock); + cf::BranchOp::create(builder, func.getLoc(), splitBlock); func.getBlocks().splice(Region::iterator{splitBlock}, subst.getBody().getBlocks()); if (lastOp && lastOp->getResult(0).getType() == @@ -152,8 +152,8 @@ class ArgumentSynthesisPass // substituted. Erasing the arguments changes the calling semantics and // breaks all calls to `func`. This practice is unnecessary and highly // discouraged. 
- if (changeSemantics) - func.eraseArguments(replacedArgs); + if (changeSemantics && failed(func.eraseArguments(replacedArgs))) + func->emitWarning("could not erase function arguments"); } } }; diff --git a/lib/Optimizer/Transforms/BasisConversion.cpp b/lib/Optimizer/Transforms/BasisConversion.cpp index 1515aecd7a1..15289f59960 100644 --- a/lib/Optimizer/Transforms/BasisConversion.cpp +++ b/lib/Optimizer/Transforms/BasisConversion.cpp @@ -7,10 +7,8 @@ ******************************************************************************/ #include "DecompositionPatterns.h" +#include "PassDetails.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" -#include "cudaq/Optimizer/Dialect/CC/CCDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Threading.h" @@ -18,16 +16,13 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/DialectConversion.h" -using namespace mlir; - -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// namespace cudaq::opt { #define GEN_PASS_DEF_BASISCONVERSION #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt +using namespace mlir; + namespace { //===----------------------------------------------------------------------===// diff --git a/lib/Optimizer/Transforms/CableRoughIn.cpp b/lib/Optimizer/Transforms/CableRoughIn.cpp index 598f94d8091..b32e02937e4 100644 --- a/lib/Optimizer/Transforms/CableRoughIn.cpp +++ b/lib/Optimizer/Transforms/CableRoughIn.cpp @@ -8,10 +8,7 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include 
"mlir/Dialect/Complex/IR/Complex.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -80,7 +77,7 @@ class CallPattern : public OpRewritePattern { for (auto arg : call.getOperands()) { Type argTy = arg.getType(); if (argTy == refTy) { - newArgs.push_back(rewriter.create(loc, wireTy, arg)); + newArgs.push_back(quake::UnwrapOp::create(rewriter, loc, wireTy, arg)); resultTys.push_back(wireTy); continue; } @@ -103,9 +100,9 @@ class CallPattern : public OpRewritePattern { SmallVector unwraps; for (auto carg : concat.getTargets()) unwraps.push_back( - rewriter.create(loc, wireTy, carg)); + quake::UnwrapOp::create(rewriter, loc, wireTy, carg)); newArgs.push_back( - rewriter.create(loc, cableTy, unwraps)); + quake::BundleCableOp::create(rewriter, loc, cableTy, unwraps)); resultTys.push_back(cableTy); continue; } @@ -121,7 +118,7 @@ class CallPattern : public OpRewritePattern { auto strArgTy = strArg.getType(); if (isa(strArgTy)) { unwraps.push_back( - rewriter.create(loc, wireTy, strArg)); + quake::UnwrapOp::create(rewriter, loc, wireTy, strArg)); cableSize++; continue; } @@ -142,7 +139,7 @@ class CallPattern : public OpRewritePattern { cableSize += concat.getTargets().size(); for (auto carg : concat.getTargets()) unwraps.push_back( - rewriter.create(loc, wireTy, carg)); + quake::UnwrapOp::create(rewriter, loc, wireTy, carg)); continue; } LLVM_DEBUG(llvm::dbgs() << strArg << " is not supported.\n"); @@ -150,7 +147,7 @@ class CallPattern : public OpRewritePattern { } auto cableTy = quake::CableType::get(ctx, cableSize); newArgs.push_back( - rewriter.create(loc, cableTy, unwraps)); + quake::BundleCableOp::create(rewriter, loc, cableTy, unwraps)); resultTys.push_back(cableTy); continue; } @@ -159,8 +156,8 @@ class CallPattern : public OpRewritePattern { } // Create a quake.call_by_ref operation. 
- auto callByRef = rewriter.create( - loc, resultTys, call.getCalleeAttr(), newArgs); + auto callByRef = quake::CallByRefOp::create( + rewriter, loc, call.getCalleeAttr(), resultTys, newArgs); // Wrap the wires and cables. std::size_t i = origCoarity; @@ -169,7 +166,7 @@ class CallPattern : public OpRewritePattern { for (auto arg : call.getOperands()) { Type argTy = arg.getType(); if (argTy == refTy) { - rewriter.create(loc, results[i++], arg); + quake::WrapOp::create(rewriter, loc, results[i++], arg); continue; } if (isa(argTy)) { @@ -181,11 +178,11 @@ class CallPattern : public OpRewritePattern { SmallVector wireTys(cableSize); std::fill(wireTys.begin(), wireTys.end(), wireTy); auto split = - rewriter.create(loc, wireTys, results[i++]); + quake::SplitCableOp::create(rewriter, loc, wireTys, results[i++]); SmallVector concatTargs{concat.getTargets().begin(), concat.getTargets().end()}; for (auto [j, wire] : llvm::enumerate(split.getResults())) - rewriter.create(loc, wire, concatTargs[j]); + quake::WrapOp::create(rewriter, loc, wire, concatTargs[j]); } if (isa(argTy)) { auto mkStruq = arg.getDefiningOp(); @@ -194,14 +191,14 @@ class CallPattern : public OpRewritePattern { SmallVector wireTys(cableSize); std::fill(wireTys.begin(), wireTys.end(), wireTy); auto split = - rewriter.create(loc, wireTys, results[i++]); + quake::SplitCableOp::create(rewriter, loc, wireTys, results[i++]); std::size_t j = 0; SmallVector splitResults{split.getResults().begin(), split.getResults().end()}; for (auto strArg : mkStruq.getVeqs()) { auto strArgTy = strArg.getType(); if (isa(strArgTy)) { - rewriter.create(loc, splitResults[j++], strArg); + quake::WrapOp::create(rewriter, loc, splitResults[j++], strArg); continue; } if (isa(strArgTy)) { @@ -211,8 +208,8 @@ class CallPattern : public OpRewritePattern { SmallVector concatTargs{concat.getTargets().begin(), concat.getTargets().end()}; for (std::size_t k = 0, K = concatTargs.size(); k < K; ++k) - rewriter.create(loc, splitResults[j++], - 
concatTargs[k]); + quake::WrapOp::create(rewriter, loc, splitResults[j++], + concatTargs[k]); continue; } LLVM_DEBUG(llvm::dbgs() << strArg << " is not supported.\n"); @@ -240,7 +237,7 @@ class CableRoughInPass patterns.insert(ctx); quake::ExtractRefOp::getCanonicalizationPatterns(patterns, ctx); quake::GetMemberOp::getCanonicalizationPatterns(patterns, ctx); - if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) + if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) signalPassFailure(); } }; diff --git a/lib/Optimizer/Transforms/ClassicalOptimization.cpp b/lib/Optimizer/Transforms/ClassicalOptimization.cpp index accc8b09b60..a17b68ad31c 100644 --- a/lib/Optimizer/Transforms/ClassicalOptimization.cpp +++ b/lib/Optimizer/Transforms/ClassicalOptimization.cpp @@ -81,7 +81,8 @@ class ClassicalOptimizationPass simplifyRegions(rewriter, op->getRegions()); } progress = 0; - (void)applyPatternsAndFoldGreedily(op, frozen); + if (failed(applyPatternsGreedily(op, frozen))) + break; } while (progress); } diff --git a/lib/Optimizer/Transforms/CombineMeasurements.cpp b/lib/Optimizer/Transforms/CombineMeasurements.cpp index 5065c8aa6b6..4d66e687dbc 100644 --- a/lib/Optimizer/Transforms/CombineMeasurements.cpp +++ b/lib/Optimizer/Transforms/CombineMeasurements.cpp @@ -9,14 +9,10 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" #include "cudaq/Optimizer/CodeGen/QIRAttributeNames.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "nlohmann/json.hpp" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -248,8 +244,8 @@ class CombineMeasurementsPass RewritePatternSet patterns(ctx); 
patterns.insert( ctx, analysis); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) { + if (failed( + applyPatternsGreedily(func.getOperation(), std::move(patterns)))) { func.emitOpError("Combining measurements failed"); signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp b/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp index 29cfc8a3e9b..4ce925310a1 100644 --- a/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp +++ b/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp @@ -7,8 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -57,10 +55,11 @@ class AllocaPat : public OpRewritePattern { return success(); } if (isa(alloc.getType())) { - Value lo = rewriter.create( - alloc.getLoc(), os.first, rewriter.getI64Type()); - Value hi = rewriter.create( - alloc.getLoc(), os.first + os.second - 1, rewriter.getI64Type()); + Value lo = arith::ConstantIntOp::create( + rewriter, alloc.getLoc(), rewriter.getI64Type(), os.first); + Value hi = arith::ConstantIntOp::create(rewriter, alloc.getLoc(), + rewriter.getI64Type(), + os.first + os.second - 1); // trying to print alloc after the replace gives a segfault LLVM_DEBUG(llvm::dbgs() << "replace " << alloc); [[maybe_unused]] Value subveq = @@ -76,15 +75,15 @@ class AllocaPat : public OpRewritePattern { for (auto m : sty.getMembers()) { auto v = [&]() -> Value { if (isa(m)) { - auto result = rewriter.create( - loc, analysis.newAlloc, inner); + auto result = quake::ExtractRefOp::create( + rewriter, loc, analysis.newAlloc, inner); inner++; return result; } assert(cast(m).hasSpecifiedSize()); std::size_t dist = inner + cast(m).getSize() - 1; - auto result = 
rewriter.create( - loc, m, analysis.newAlloc, inner, dist); + auto result = quake::SubVeqOp::create( + rewriter, loc, m, analysis.newAlloc, inner, dist); inner = dist + 1; return result; }(); @@ -145,7 +144,7 @@ class CombineQuantumAllocationsPass OpBuilder rewriter(ctx); rewriter.setInsertionPointToStart(entryBlock); auto veqTy = quake::VeqType::get(ctx, currentOffset); - analysis.newAlloc = rewriter.create(loc, veqTy); + analysis.newAlloc = quake::AllocaOp::create(rewriter, loc, veqTy); // 3. Greedily replace the uses of the original alloca ops with uses of // partitions of the new alloca op. Replace subveq of subveq with a single @@ -158,8 +157,8 @@ class CombineQuantumAllocationsPass quake::GetMemberOp::getCanonicalizationPatterns(patterns, ctx); quake::SubVeqOp::getCanonicalizationPatterns(patterns, ctx); quake::ConcatOp::getCanonicalizationPatterns(patterns, ctx); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) { + if (failed(applyPatternsGreedily(func.getOperation(), + std::move(patterns)))) { func.emitOpError("combining alloca, subveq, and extract ops failed"); signalPassFailure(); } @@ -172,8 +171,8 @@ class CombineQuantumAllocationsPass for (auto &block : func.getRegion()) { if (block.hasNoSuccessors()) { rewriter.setInsertionPoint(block.getTerminator()); - rewriter.create(analysis.newAlloc.getLoc(), - analysis.newAlloc); + quake::DeallocOp::create(rewriter, analysis.newAlloc.getLoc(), + analysis.newAlloc); } } } diff --git a/lib/Optimizer/Transforms/ConstantPropagation.cpp b/lib/Optimizer/Transforms/ConstantPropagation.cpp index fd5fe25f2aa..58cab26148e 100644 --- a/lib/Optimizer/Transforms/ConstantPropagation.cpp +++ b/lib/Optimizer/Transforms/ConstantPropagation.cpp @@ -8,10 +8,7 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include 
"mlir/Dialect/Complex/IR/Complex.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" @@ -114,8 +111,8 @@ class ForwardConstSubArray : public OpRewritePattern { } Type loadTy = loadSpan.getType(); auto arrayAttr = cast(attr); - Value newConArr = rewriter.create( - loadSpan.getLoc(), ty, arrayAttr); + Value newConArr = cudaq::cc::ConstantArrayOp::create( + rewriter, loadSpan.getLoc(), ty, arrayAttr); rewriter.replaceOpWithNewOp(loadSpan, loadTy, newConArr); return success(); @@ -193,24 +190,24 @@ class ForwardSingleDimensionData : public OpRewritePattern { auto loc = loadSpanEle.getLoc(); if (isa(loadTy)) { auto stringAttr = cast(attr); - auto lit = rewriter.create( - loc, cudaq::cc::PointerType::get(ty), stringAttr); - auto len = rewriter.create( - loc, stringAttr.getValue().size() + 1, 64); + auto lit = cudaq::cc::CreateStringLiteralOp::create( + rewriter, loc, cudaq::cc::PointerType::get(ty), stringAttr); + auto len = arith::ConstantIntOp::create( + rewriter, loc, stringAttr.getValue().size() + 1, 64); rewriter.replaceOpWithNewOp(loadSpanEle, loadTy, lit, len); return success(); } if (auto intTy = dyn_cast(loadTy)) { auto intAttr = cast(attr); - rewriter.replaceOpWithNewOp( - loadSpanEle, intAttr.getInt(), intTy); + rewriter.replaceOpWithNewOp(loadSpanEle, intTy, + intAttr.getInt()); return success(); } if (auto floatTy = dyn_cast(loadTy)) { auto floatAttr = cast(attr); - rewriter.replaceOpWithNewOp( - loadSpanEle, floatAttr.getValue(), floatTy); + rewriter.replaceOpWithNewOp(loadSpanEle, floatTy, + floatAttr.getValue()); return success(); } return failure(); @@ -231,8 +228,8 @@ class ConstantPropagationPass LLVM_DEBUG(llvm::dbgs() << "Before constant prop:\n" << func << '\n'); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) { + if (failed( + applyPatternsGreedily(func.getOperation(), std::move(patterns)))) { signalPassFailure(); return; } diff --git 
a/lib/Optimizer/Transforms/DeadStoreRemoval.cpp b/lib/Optimizer/Transforms/DeadStoreRemoval.cpp index 64d158cb518..dbc47a51afe 100644 --- a/lib/Optimizer/Transforms/DeadStoreRemoval.cpp +++ b/lib/Optimizer/Transforms/DeadStoreRemoval.cpp @@ -18,7 +18,7 @@ namespace cudaq::opt { #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt -#define DEBUG_TYPE "dsr" +#define DEBUG_TYPE "dead-store-removal" using namespace mlir; @@ -98,7 +98,7 @@ class DSRPass : public cudaq::opt::impl::DeadStoreRemovalBase { auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After erasure:\n" << *op << "\n\n"); } diff --git a/lib/Optimizer/Transforms/Decomposition.cpp b/lib/Optimizer/Transforms/Decomposition.cpp index cff76e3b32d..a94c239ce77 100644 --- a/lib/Optimizer/Transforms/Decomposition.cpp +++ b/lib/Optimizer/Transforms/Decomposition.cpp @@ -7,9 +7,8 @@ ******************************************************************************/ #include "DecompositionPatterns.h" +#include "PassDetails.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/PatternMatch.h" @@ -17,16 +16,13 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -using namespace mlir; - -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// namespace cudaq::opt { #define GEN_PASS_DEF_DECOMPOSITION #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt +using namespace mlir; + namespace 
{ //===----------------------------------------------------------------------===// @@ -102,7 +98,7 @@ struct Decomposition // Process kernels in parallel LogicalResult rewriteResult = failableParallelForEach( module.getContext(), kernels, [&](Operation *op) { - LogicalResult converged = applyPatternsAndFoldGreedily(op, patterns); + LogicalResult converged = applyPatternsGreedily(op, patterns); // Decomposition is best-effort. Non-convergence is only a pass // failure if the user asked for convergence. diff --git a/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp b/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp index bde2025353e..1d8c3b734d9 100644 --- a/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp @@ -7,34 +7,23 @@ ******************************************************************************/ #include "DecompositionPatterns.h" -#include "cudaq/Optimizer/Dialect/CC/CCDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" -#include "mlir/Dialect/Math/IR/Math.h" +#include "PassDetails.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringMap.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" #include -#include -#include -#include -#include -#include -#include #include #include #include -#include using namespace mlir; namespace { -//===----------------------------------------------------------------------===// // ConversionTarget and OperatorInfo, parsed from target basis strings such as // ["x", "x(1)", "z"] -//===----------------------------------------------------------------------===// - struct OperatorInfo { StringRef name; std::size_t numControls; @@ -136,8 +125,6 @@ struct hash { }; } // namespace std -namespace { - // Computes a hash of the given unordered set using the 
hashes of the elements // in the set. template @@ -150,6 +137,7 @@ std::size_t computeSetHash(const std::unordered_set &set) { return llvm::hash_combine_range(hashes.begin(), hashes.end()); } +namespace { //===----------------------------------------------------------------------===// // Decomposition Graph for Pattern Selection //===----------------------------------------------------------------------===// @@ -357,7 +345,6 @@ class DecompositionGraph { std::unordered_map> patternSelectionCache; }; - } // namespace std::unique_ptr diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.cpp b/lib/Optimizer/Transforms/DecompositionPatterns.cpp index ed3ef43a874..1f5a1ca7614 100644 --- a/lib/Optimizer/Transforms/DecompositionPatterns.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatterns.cpp @@ -6,6 +6,15 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ +#include "DecompositionPatterns.h" +#include "PassDetails.h" +#include "cudaq/Optimizer/Builder/Factory.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/TypeName.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Rewrite/FrozenRewritePatternSet.h" + /** * This file contains the decomposition patterns that match single gates and * decompose them into a sequence of other gates. @@ -22,27 +31,10 @@ * macro can be used for this purpose instead. 
*/ -#include "DecompositionPatterns.h" -#include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Rewrite/FrozenRewritePatternSet.h" -#include -#include -#include -#include -#include -#include -#include - using namespace mlir; LLVM_INSTANTIATE_REGISTRY(cudaq::DecompositionPatternTypeRegistry) -namespace { - //===----------------------------------------------------------------------===// // Helpers //===----------------------------------------------------------------------===// @@ -55,23 +47,24 @@ inline Value createConstant(Location loc, double value, Type type, inline Value createConstant(Location loc, std::size_t value, PatternRewriter &rewriter) { - return rewriter.create(loc, value, 64); + return arith::ConstantIntOp::create(rewriter, loc, value, 64); } inline Value createDivF(Location loc, Value numerator, double denominator, PatternRewriter &rewriter) { auto denominatorValue = createConstant(loc, denominator, numerator.getType(), rewriter); - return rewriter.create(loc, numerator, denominatorValue); + return arith::DivFOp::create(rewriter, loc, numerator, denominatorValue); } /// @brief Returns true if \p op contains any `ControlType` operands. inline bool containsControlTypes(quake::OperatorInterface op) { return llvm::any_of(op.getControls(), [](const Value &v) { - return v.getType().isa(); + return isa(v.getType()); }); } +namespace { /// @brief This is a wrapper class for `PatternRewriter::create<>()` for /// `QuakeOperator`s. If the controls and targets are `quake::WireType`, then /// this wrapper class's methods update the controls and targets in the `create` @@ -85,7 +78,7 @@ class QuakeOperatorCreator { /// builder for cases when you have one input ValueRange. 
SmallVector getResultType(ValueRange operands) { std::size_t numOutputWires = llvm::count_if(operands, [](const Value &v) { - return v.getType().isa(); + return isa(v.getType()); }); return SmallVector(numOutputWires, @@ -98,9 +91,9 @@ class QuakeOperatorCreator { std::size_t numOutputWires = llvm::count_if( operands1, - [](const Value &v) { return v.getType().isa(); }) + + [](const Value &v) { return isa(v.getType()); }) + llvm::count_if(operands2, [](const Value &v) { - return v.getType().isa(); + return isa(v.getType()); }); return SmallVector(numOutputWires, @@ -112,7 +105,7 @@ class QuakeOperatorCreator { void selectWiresAndReplaceUses(Operation *op, ValueRange newValues) { SmallVector newWireValues; for (const auto &v : newValues) - if (v.getType().isa()) + if (isa(v.getType())) newWireValues.push_back(v); assert(op->getResults().size() == newWireValues.size() && "incorrect number of output wires provided"); @@ -125,9 +118,9 @@ class QuakeOperatorCreator { Value target) { SmallVector newWireValues; for (const auto &v : controls) - if (v.getType().isa()) + if (isa(v.getType())) newWireValues.push_back(v); - if (target.getType().isa()) + if (isa(target.getType())) newWireValues.push_back(target); assert(op->getResults().size() == newWireValues.size() && "incorrect number of output wires provided"); @@ -137,13 +130,12 @@ class QuakeOperatorCreator { template OpTy create(Location location, Value &target) { OpTy op; - op = rewriter.create(location, getResultType(target), false, - ValueRange{}, ValueRange{}, target, - DenseBoolArrayAttr{}); + op = OpTy::create(rewriter, location, getResultType(target), false, + ValueRange{}, ValueRange{}, target, DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); - if (target.getType().isa() && resultIt != resultWiresEnd) + if (isa(target.getType()) && resultIt != resultWiresEnd) target = *resultIt; return op; } @@ -151,13 +143,12 @@ class 
QuakeOperatorCreator { template OpTy create(Location location, bool is_adj, Value &target) { OpTy op; - op = rewriter.create(location, getResultType(target), is_adj, - ValueRange{}, ValueRange{}, target, - DenseBoolArrayAttr{}); + op = OpTy::create(rewriter, location, getResultType(target), is_adj, + ValueRange{}, ValueRange{}, target, DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); - if (target.getType().isa() && resultIt != resultWiresEnd) + if (isa(target.getType()) && resultIt != resultWiresEnd) target = *resultIt; return op; } @@ -165,15 +156,14 @@ class QuakeOperatorCreator { template OpTy create(Location location, Value &control, Value &target) { OpTy op; - op = rewriter.create(location, getResultType(control, target), false, - ValueRange{}, control, target, - DenseBoolArrayAttr{}); + op = OpTy::create(rewriter, location, getResultType(control, target), false, + ValueRange{}, control, target, DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); - if (control.getType().isa() && resultIt != resultWiresEnd) + if (isa(control.getType()) && resultIt != resultWiresEnd) control = *resultIt++; - if (target.getType().isa() && resultIt != resultWiresEnd) + if (isa(target.getType()) && resultIt != resultWiresEnd) target = *resultIt; return op; } @@ -182,16 +172,16 @@ class QuakeOperatorCreator { OpTy create(Location location, bool is_adj, ValueRange parameters, SmallVectorImpl &controls, Value &target) { OpTy op; - op = rewriter.create(location, getResultType(controls, target), - is_adj, parameters, controls, target, - DenseBoolArrayAttr{}); + op = OpTy::create(rewriter, location, getResultType(controls, target), + is_adj, parameters, controls, target, + DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); for (auto &c : 
controls) - if (c.getType().isa() && resultIt != resultWiresEnd) + if (isa(c.getType()) && resultIt != resultWiresEnd) c = *resultIt++; - if (target.getType().isa() && resultIt != resultWiresEnd) + if (isa(target.getType()) && resultIt != resultWiresEnd) target = *resultIt; return op; } @@ -200,16 +190,16 @@ class QuakeOperatorCreator { OpTy create(Location location, ValueRange parameters, SmallVectorImpl &controls, Value &target) { OpTy op; - op = rewriter.create(location, getResultType(controls, target), false, - parameters, controls, target, - DenseBoolArrayAttr{}); + op = + OpTy::create(rewriter, location, getResultType(controls, target), false, + parameters, controls, target, DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); for (auto &c : controls) - if (c.getType().isa() && resultIt != resultWiresEnd) + if (isa(c.getType()) && resultIt != resultWiresEnd) c = *resultIt++; - if (target.getType().isa() && resultIt != resultWiresEnd) + if (isa(target.getType()) && resultIt != resultWiresEnd) target = *resultIt; return op; } @@ -218,16 +208,16 @@ class QuakeOperatorCreator { OpTy create(Location location, SmallVectorImpl &controls, Value &target) { OpTy op; - op = rewriter.create(location, getResultType(controls, target), false, - ValueRange{}, controls, target, - DenseBoolArrayAttr{}); + op = + OpTy::create(rewriter, location, getResultType(controls, target), false, + ValueRange{}, controls, target, DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); for (auto &c : controls) - if (c.getType().isa() && resultIt != resultWiresEnd) + if (isa(c.getType()) && resultIt != resultWiresEnd) c = *resultIt++; - if (target.getType().isa() && resultIt != resultWiresEnd) + if (isa(target.getType()) && resultIt != resultWiresEnd) target = *resultIt; return op; } @@ -235,14 +225,14 @@ class QuakeOperatorCreator { 
template OpTy create(Location location, SmallVectorImpl &targets) { OpTy op; - op = rewriter.create(location, getResultType(targets), false, - ValueRange{}, ValueRange{}, targets, - DenseBoolArrayAttr{}); + op = + OpTy::create(rewriter, location, getResultType(targets), false, + ValueRange{}, ValueRange{}, targets, DenseBoolArrayAttr{}); auto resultWires = op.getWires(); auto resultIt = resultWires.begin(); auto resultWiresEnd = resultWires.end(); for (auto &t : targets) - if (t.getType().isa() && resultIt != resultWiresEnd) + if (isa(t.getType()) && resultIt != resultWiresEnd) t = *resultIt++; return op; } @@ -250,13 +240,14 @@ class QuakeOperatorCreator { private: PatternRewriter &rewriter; }; +} // namespace /// Check whether the operation has the correct number of controls. /// /// Note: This function assumes that the operation has already been tested for /// reference semantics. -LogicalResult checkNumControls(quake::OperatorInterface op, - std::size_t requiredNumControls) { +static LogicalResult checkNumControls(quake::OperatorInterface op, + std::size_t requiredNumControls) { auto opControls = op.getControls(); if (opControls.size() > requiredNumControls) return failure(); @@ -283,9 +274,9 @@ LogicalResult checkNumControls(quake::OperatorInterface op, /// /// Note: This function assumes that the operation has already been tested for /// reference semantics. 
-LogicalResult checkAndExtractControls(quake::OperatorInterface op, - MutableArrayRef controls, - PatternRewriter &rewriter) { +static LogicalResult checkAndExtractControls(quake::OperatorInterface op, + MutableArrayRef controls, + PatternRewriter &rewriter) { if (failed(checkNumControls(op, controls.size()))) return failure(); @@ -295,7 +286,7 @@ LogicalResult checkAndExtractControls(quake::OperatorInterface op, for (std::size_t i = 0, end = veq.getSize(); i < end; ++i) { Value index = createConstant(op.getLoc(), i, rewriter); Value qref = - rewriter.create(op.getLoc(), control, index); + quake::ExtractRefOp::create(rewriter, op.getLoc(), control, index); controls[controlIndex] = qref; controlIndex += 1; } @@ -317,6 +308,7 @@ LogicalResult checkAndExtractControls(quake::OperatorInterface op, /// "target2", ...) /// where "source_op" is the operation that the pattern matches and /// {"target1", "target2", ...} are the operations that the pattern may produce. +#undef REGISTER_DECOMPOSITION_PATTERN #define REGISTER_DECOMPOSITION_PATTERN(PATTERN, SOURCE_OP, ...) 
\ struct PATTERN##Type : public cudaq::DecompositionPatternType { \ using cudaq::DecompositionPatternType::DecompositionPatternType; \ @@ -345,11 +337,11 @@ LogicalResult checkAndExtractControls(quake::OperatorInterface op, // HOp decompositions //===----------------------------------------------------------------------===// +namespace { // quake.h target // ─────────────────────────────────── // quake.phased_rx(π/2, π/2) target // quake.phased_rx(π, 0) target - struct HToPhasedRxType; // forward declare the pattern type, defined in the // macro below struct HToPhasedRx @@ -407,7 +399,7 @@ struct ExpPauliDecomposition auto pauliWord = expPauliOp.getPauli(); if (expPauliOp.isAdj()) - theta = rewriter.create(loc, theta); + theta = arith::NegFOp::create(rewriter, loc, theta); std::optional optPauliWordStr; if (!pauliWord) { @@ -503,19 +495,19 @@ struct ExpPauliDecomposition SmallVector qubitSupport; for (std::size_t i = 0; i < size; i++) { - Value index = rewriter.create(loc, i, 64); - Value qubitI = rewriter.create(loc, qubits, index); + Value index = arith::ConstantIntOp::create(rewriter, loc, i, 64); + Value qubitI = quake::ExtractRefOp::create(rewriter, loc, qubits, index); if (pauliWordStr[i] != 'I') qubitSupport.push_back(qubitI); if (pauliWordStr[i] == 'Y') { APFloat d(M_PI_2); - Value param = rewriter.create( - loc, d, rewriter.getF64Type()); - rewriter.create(loc, ValueRange{param}, ValueRange{}, - ValueRange{qubitI}); + Value param = arith::ConstantFloatOp::create(rewriter, loc, + rewriter.getF64Type(), d); + quake::RxOp::create(rewriter, loc, ValueRange{param}, ValueRange{}, + ValueRange{qubitI}); } else if (pauliWordStr[i] == 'X') { - rewriter.create(loc, ValueRange{qubitI}); + quake::HOp::create(rewriter, loc, ValueRange{qubitI}); } } @@ -529,34 +521,35 @@ struct ExpPauliDecomposition std::vector> toReverse; for (std::size_t i = 0; i < qubitSupport.size() - 1; i++) { - rewriter.create(loc, ValueRange{qubitSupport[i]}, - ValueRange{qubitSupport[i + 1]}); + 
quake::XOp::create(rewriter, loc, ValueRange{qubitSupport[i]}, + ValueRange{qubitSupport[i + 1]}); toReverse.emplace_back(qubitSupport[i], qubitSupport[i + 1]); } // Note: `Rz(theta)` = `exp(-i*theta/2 Z)` - Value negTwoTheta = rewriter.create( - loc, createConstant(loc, -2.0, rewriter.getF64Type(), rewriter), theta); - rewriter.create(loc, ValueRange{negTwoTheta}, ValueRange{}, - ValueRange{qubitSupport.back()}); + Value negTwoTheta = arith::MulFOp::create( + rewriter, loc, + createConstant(loc, -2.0, rewriter.getF64Type(), rewriter), theta); + quake::RzOp::create(rewriter, loc, ValueRange{negTwoTheta}, ValueRange{}, + ValueRange{qubitSupport.back()}); std::reverse(toReverse.begin(), toReverse.end()); for (auto &[i, j] : toReverse) - rewriter.create(loc, ValueRange{i}, ValueRange{j}); + quake::XOp::create(rewriter, loc, ValueRange{i}, ValueRange{j}); for (std::size_t i = 0; i < pauliWordStr.size(); i++) { std::size_t k = pauliWordStr.size() - 1 - i; - Value index = rewriter.create(loc, k, 64); - Value qubitK = rewriter.create(loc, qubits, index); + Value index = arith::ConstantIntOp::create(rewriter, loc, k, 64); + Value qubitK = quake::ExtractRefOp::create(rewriter, loc, qubits, index); if (pauliWordStr[k] == 'Y') { APFloat d(-M_PI_2); - Value param = rewriter.create( - loc, d, rewriter.getF64Type()); - rewriter.create(loc, ValueRange{param}, ValueRange{}, - ValueRange{qubitK}); + Value param = arith::ConstantFloatOp::create(rewriter, loc, + rewriter.getF64Type(), d); + quake::RxOp::create(rewriter, loc, ValueRange{param}, ValueRange{}, + ValueRange{qubitK}); } else if (pauliWordStr[k] == 'X') { - rewriter.create(loc, ValueRange{qubitK}); + quake::HOp::create(rewriter, loc, ValueRange{qubitK}); } } @@ -633,7 +626,7 @@ struct R1AdjToR1 Location loc = op->getLoc(); Value target = op.getTarget(); Value angle = op.getParameter(); - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector 
noControls; @@ -752,7 +745,7 @@ struct SToPhasedRx SmallVector noControls; Value zero = createConstant(loc, 0.0, rewriter.getF64Type(), rewriter); Value pi_2 = createConstant(loc, M_PI_2, rewriter.getF64Type(), rewriter); - Value negPi_2 = rewriter.create(loc, pi_2); + Value negPi_2 = arith::NegFOp::create(rewriter, loc, pi_2); Value angle = op.isAdj() ? pi_2 : negPi_2; @@ -830,13 +823,13 @@ struct TToPhasedRx Value target = op.getTarget(); Value angle = createConstant(loc, -M_PI_4, rewriter.getF64Type(), rewriter); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; Value zero = createConstant(loc, 0.0, rewriter.getF64Type(), rewriter); Value pi_2 = createConstant(loc, M_PI_2, rewriter.getF64Type(), rewriter); - Value negPi_2 = rewriter.create(loc, pi_2); + Value negPi_2 = arith::NegFOp::create(rewriter, loc, pi_2); std::array parameters = {pi_2, zero}; QuakeOperatorCreator qRewriter(rewriter); @@ -1234,7 +1227,7 @@ struct ZToPhasedRx Value zero = createConstant(loc, 0.0, rewriter.getF64Type(), rewriter); Value negPi = createConstant(loc, -M_PI, rewriter.getF64Type(), rewriter); Value pi_2 = createConstant(loc, M_PI_2, rewriter.getF64Type(), rewriter); - Value negPi_2 = rewriter.create(loc, pi_2); + Value negPi_2 = arith::NegFOp::create(rewriter, loc, pi_2); std::array parameters = {pi_2, zero}; QuakeOperatorCreator qRewriter(rewriter); @@ -1289,12 +1282,12 @@ struct CR1ToCX : public cudaq::DecompositionPattern { negControl = (*negatedControls)[0]; if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; Value halfAngle = createDivF(loc, angle, 2.0, rewriter); - Value negHalfAngle = rewriter.create(loc, halfAngle); + Value negHalfAngle = arith::NegFOp::create(rewriter, loc, halfAngle); QuakeOperatorCreator qRewriter(rewriter); 
qRewriter.create(loc, /*isAdj*/ negControl, halfAngle, @@ -1334,15 +1327,15 @@ struct R1ToPhasedRx Value target = op.getTarget(); Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); Type angleType = op.getParameter().getType(); // Necessary/Helpful constants SmallVector noControls; Value zero = createConstant(loc, 0.0, angleType, rewriter); Value pi_2 = createConstant(loc, M_PI_2, angleType, rewriter); - Value negPi_2 = rewriter.create(loc, pi_2); - Value negAngle = rewriter.create(loc, angle); + Value negPi_2 = arith::NegFOp::create(rewriter, loc, pi_2); + Value negAngle = arith::NegFOp::create(rewriter, loc, angle); std::array parameters = {pi_2, zero}; QuakeOperatorCreator qRewriter(rewriter); @@ -1396,13 +1389,13 @@ struct CRxToCX : public cudaq::DecompositionPattern { Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); Type angleType = op.getParameter().getType(); // Necessary/Helpful constants SmallVector noControls; Value halfAngle = createDivF(loc, angle, 2.0, rewriter); - Value negHalfAngle = rewriter.create(loc, halfAngle); + Value negHalfAngle = arith::NegFOp::create(rewriter, loc, halfAngle); Value negPI_2 = createConstant(loc, -M_PI_2, angleType, rewriter); QuakeOperatorCreator qRewriter(rewriter); @@ -1442,7 +1435,7 @@ struct RxToPhasedRx Value target = op.getTarget(); Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); Type angleType = op.getParameter().getType(); // Necessary/Helpful constants @@ -1482,7 +1475,7 @@ struct RxAdjToRx Location loc = op->getLoc(); Value target = op.getTarget(); Value angle = op.getParameter(); - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; @@ -1530,12 
+1523,12 @@ struct CRyToCX : public cudaq::DecompositionPattern { Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; Value halfAngle = createDivF(loc, angle, 2.0, rewriter); - Value negHalfAngle = rewriter.create(loc, halfAngle); + Value negHalfAngle = arith::NegFOp::create(rewriter, loc, halfAngle); QuakeOperatorCreator qRewriter(rewriter); qRewriter.create(loc, halfAngle, noControls, target); @@ -1571,7 +1564,7 @@ struct RyToPhasedRx Value target = op.getTarget(); Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); Type angleType = op.getParameter().getType(); // Necessary/Helpful constants @@ -1611,7 +1604,7 @@ struct RyAdjToRy Location loc = op->getLoc(); Value target = op.getTarget(); Value angle = op.getParameter(); - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; @@ -1659,12 +1652,12 @@ struct CRzToCX : public cudaq::DecompositionPattern { Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; Value halfAngle = createDivF(loc, angle, 2.0, rewriter); - Value negHalfAngle = rewriter.create(loc, halfAngle); + Value negHalfAngle = arith::NegFOp::create(rewriter, loc, halfAngle); QuakeOperatorCreator qRewriter(rewriter); qRewriter.create(loc, halfAngle, noControls, target); @@ -1702,15 +1695,15 @@ struct RzToPhasedRx Value target = op.getTarget(); Value angle = op.getParameter(); if (op.isAdj()) - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); Type angleType = op.getParameter().getType(); // Necessary/Helpful constants SmallVector noControls; 
Value zero = createConstant(loc, 0.0, angleType, rewriter); Value pi_2 = createConstant(loc, M_PI_2, angleType, rewriter); - Value negPi_2 = rewriter.create(loc, pi_2); - Value negAngle = rewriter.create(loc, angle); + Value negPi_2 = arith::NegFOp::create(rewriter, loc, pi_2); + Value negAngle = arith::NegFOp::create(rewriter, loc, angle); std::array parameters = {pi_2, zero}; QuakeOperatorCreator qRewriter(rewriter); @@ -1751,7 +1744,7 @@ struct RzAdjToRz Location loc = op->getLoc(); Value target = op.getTarget(); Value angle = op.getParameter(); - angle = rewriter.create(loc, angle); + angle = arith::NegFOp::create(rewriter, loc, angle); // Necessary/Helpful constants SmallVector noControls; @@ -1796,17 +1789,17 @@ struct U3ToRotations Value lam = op.getParameters()[2]; if (op.isAdj()) { - theta = rewriter.create(loc, theta); + theta = arith::NegFOp::create(rewriter, loc, theta); // swap the 2nd and 3rd parameter for correctness std::swap(phi, lam); - phi = rewriter.create(loc, phi); - lam = rewriter.create(loc, lam); + phi = arith::NegFOp::create(rewriter, loc, phi); + lam = arith::NegFOp::create(rewriter, loc, lam); } // Necessary/Helpful constants Type angleType = op.getParameter().getType(); Value pi_2 = createConstant(loc, M_PI_2, angleType, rewriter); - Value negPi_2 = rewriter.create(loc, pi_2); + Value negPi_2 = arith::NegFOp::create(rewriter, loc, pi_2); QuakeOperatorCreator qRewriter(rewriter); qRewriter.create(loc, lam, controls, target); diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.h b/lib/Optimizer/Transforms/DecompositionPatterns.h index 20b402abd5e..1cad9d3fb9d 100644 --- a/lib/Optimizer/Transforms/DecompositionPatterns.h +++ b/lib/Optimizer/Transforms/DecompositionPatterns.h @@ -8,6 +8,7 @@ #pragma once +#define LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING 1 #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Registry.h" #include "mlir/IR/PatternMatch.h" @@ -28,8 +29,13 @@ namespace cudaq { /// system. 
Stores the pattern metadata and provides a factory method to create /// new instances of the pattern. /// +/// Register decomposition patterns using +/// CUDAQ_REGISTER_TYPE(cudaq::DecompositionPatternType, MyPatternType, +/// pattern_name) +/// where pattern_name is the same as MyPatternType().getPatternName(). class DecompositionPatternType { public: + using RegistryType = llvm::Registry; virtual ~DecompositionPatternType() = default; /// Get the source operation this pattern matches and decomposes. @@ -102,3 +108,9 @@ createBasisTarget(mlir::MLIRContext &context, using DecompositionPatternTypeRegistry = llvm::Registry; } // namespace cudaq + +/// Register a decomposition pattern type with the LLVM registry. +/// This is compiler-internal only (no cross-DSO / Python concerns). +#define REGISTER_DECOMPOSITION_PATTERN(SUBTYPE, NAME) \ + static cudaq::DecompositionPatternType::RegistryType::Add \ + decomp_reg_##NAME(#NAME, ""); diff --git a/lib/Optimizer/Transforms/DependencyAnalysis.cpp b/lib/Optimizer/Transforms/DependencyAnalysis.cpp index 580dd6d4a86..9951d8d02db 100644 --- a/lib/Optimizer/Transforms/DependencyAnalysis.cpp +++ b/lib/Optimizer/Transforms/DependencyAnalysis.cpp @@ -16,6 +16,14 @@ #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +//===----------------------------------------------------------------------===// +// Generated logic +//===----------------------------------------------------------------------===// +namespace cudaq::opt { +#define GEN_PASS_DEF_DEPENDENCYANALYSIS +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt + #define DEBUG_TYPE "dep-analysis" using namespace mlir; @@ -25,14 +33,6 @@ using namespace mlir; #define RAW_GATE_OPS GATE_OPS(RAW) #define RAW_QUANTUM_OPS QUANTUM_OPS(RAW) -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// -namespace 
cudaq::opt { -#define GEN_PASS_DEF_DEPENDENCYANALYSIS -#include "cudaq/Optimizer/Transforms/Passes.h.inc" -} // namespace cudaq::opt - namespace { // TODO: Someday, it would probably make sense to make VirtualQIDs and // PhysicalQIDs be data structures with metadata, not just integer @@ -653,8 +653,8 @@ class InitDependencyNode : public DependencyNode { assert(qubit.has_value() && "Trying to codeGen a virtual allocation " "without a physical qubit assigned!"); auto wirety = quake::WireType::get(builder.getContext()); - auto alloc = builder.create( - builder.getUnknownLoc(), wirety, + auto alloc = quake::BorrowWireOp::create( + builder, builder.getUnknownLoc(), wirety, cudaq::opt::topologyAgnosticWiresetName, qubit.value()); wire = alloc.getResult(); hasCodeGen = true; @@ -760,13 +760,13 @@ class OpDependencyNode : public DependencyNode { std::string getOpName() override { if (isa(associated)) { if (auto cstf = dyn_cast(associated)) { - auto value = cstf.getValue().cast().getValueAsDouble(); + auto value = cast(cstf.getValue()).getValueAsDouble(); return std::to_string(value); } else if (auto cstidx = dyn_cast(associated)) { - auto value = cstidx.getValue().cast().getInt(); + auto value = cast(cstidx.getValue()).getInt(); return std::to_string(value); } else if (auto cstint = dyn_cast(associated)) { - auto value = cstint.getValue().cast().getInt(); + auto value = cast(cstint.getValue()).getInt(); return std::to_string(value); } } @@ -800,9 +800,9 @@ class OpDependencyNode : public DependencyNode { auto oldOp = associated; auto operands = gatherOperands(builder); - associated = - Operation::create(oldOp->getLoc(), oldOp->getName(), - oldOp->getResultTypes(), operands, oldOp->getAttrs()); + associated = Operation::create( + oldOp->getLoc(), oldOp->getName(), oldOp->getResultTypes(), operands, + oldOp->getAttrs(), OpaqueProperties{nullptr}); associated->removeAttr("dnodeid"); builder.insert(associated); } @@ -1710,7 +1710,7 @@ class RootDependencyNode : public 
OpDependencyNode { void genOp(OpBuilder &builder) override { auto wire = dependencies[0].getValue(); auto newOp = - builder.create(builder.getUnknownLoc(), wire); + quake::ReturnWireOp::create(builder, builder.getUnknownLoc(), wire); newOp->setAttrs(associated->getAttrs()); newOp->removeAttr("dnodeid"); associated = newOp; @@ -2605,7 +2605,7 @@ class IfDependencyNode : public OpDependencyNode { } auto newIf = - builder.create(oldOp->getLoc(), results, operands); + cudaq::cc::IfOp::create(builder, oldOp->getLoc(), results, operands); auto *then_region = &newIf.getThenRegion(); then_block->codeGen(builder, then_region); @@ -3137,7 +3137,7 @@ class DependencyAnalysisEngine { // and thus should have a memoized dnode for defOp, fail if not assert(defOp->hasAttr("dnodeid") && "No dnodeid found for operation"); - auto id = defOp->getAttr("dnodeid").cast().getUInt(); + auto id = cast(defOp->getAttr("dnodeid")).getUInt(); auto dnode = perOp[id]; if (!ifStack.empty() && defOp->getParentOp() != ifStack.back() && diff --git a/lib/Optimizer/Transforms/DistributedDeviceCall.cpp b/lib/Optimizer/Transforms/DistributedDeviceCall.cpp index 1e944626f8f..279b7328a0c 100644 --- a/lib/Optimizer/Transforms/DistributedDeviceCall.cpp +++ b/lib/Optimizer/Transforms/DistributedDeviceCall.cpp @@ -13,6 +13,7 @@ #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" +#include "llvm/Support/MD5.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeSupport.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -78,10 +79,9 @@ class QIRVendorDeviceCallPat // Error code 2 is used to indicate illegal execution of unreachable // code. 
Value errorCodeTwo = - rewriter.create(devcall.getLoc(), 2, 64); - rewriter.create(devcall.getLoc(), TypeRange{}, - cudaq::opt::QISTrap, - ValueRange{errorCodeTwo}); + arith::ConstantIntOp::create(rewriter, devcall.getLoc(), 2, 64); + func::CallOp::create(rewriter, devcall.getLoc(), TypeRange{}, + cudaq::opt::QISTrap, ValueRange{errorCodeTwo}); // For return (after the trap), load from nullptr to create return value // of the same type as the device function, i.e., `return *(T*)nullptr;` // for return type `T`. @@ -90,18 +90,18 @@ class QIRVendorDeviceCallPat // function. SmallVector trapResults; for (Type resTy : devFunc.getFunctionType().getResults()) { - auto nullPtr = rewriter.create( - devcall.getLoc(), + auto nullPtr = arith::ConstantOp::create( + rewriter, devcall.getLoc(), rewriter.getZeroAttr(rewriter.getIntegerType(64))); auto ptrTy = cudaq::cc::PointerType::get(resTy); - auto castedNullPtr = rewriter.create( - devcall.getLoc(), ptrTy, nullPtr); - auto loadedVal = rewriter.create(devcall.getLoc(), - castedNullPtr); + auto castedNullPtr = cudaq::cc::CastOp::create( + rewriter, devcall.getLoc(), ptrTy, nullPtr); + auto loadedVal = cudaq::cc::LoadOp::create(rewriter, devcall.getLoc(), + castedNullPtr); trapResults.push_back(loadedVal); } - rewriter.create(devcall.getLoc(), trapResults); + func::ReturnOp::create(rewriter, devcall.getLoc(), trapResults); } // (2) Set this trap function as private and weak_odr linkage, to allow // multiple definitions across translation units without linker errors. @@ -123,7 +123,7 @@ class QIRVendorDeviceCallPat // weak_odr linkage. 
rewriter.replaceOpWithNewOp( devcall, devFunc.getFunctionType().getResults(), devFuncName, - devcall.getArgs()); + devcall.getArgs(), ArrayAttr{}, ArrayAttr{}); return success(); } @@ -167,8 +167,9 @@ class ResolveDevicePtrOpPat LogicalResult matchAndRewrite(cudaq::cc::ResolveDevicePtrOp resolve, PatternRewriter &rewriter) const override { auto loc = resolve.getLoc(); - auto call = rewriter.create( - loc, TypeRange{cudaq::cc::PointerType::get(rewriter.getI8Type())}, + auto call = func::CallOp::create( + rewriter, loc, + TypeRange{cudaq::cc::PointerType::get(rewriter.getI8Type())}, cudaq::runtime::extractDevPtr, ValueRange{resolve.getDevicePtr()}); rewriter.replaceOpWithNewOp( resolve, resolve.getResult().getType(), call.getResult(0)); @@ -202,7 +203,7 @@ class DistributedDeviceCallPass patterns.add(ctx); patterns.insert(ctx, insertTrapImplementation); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); return; } diff --git a/lib/Optimizer/Transforms/EraseNoise.cpp b/lib/Optimizer/Transforms/EraseNoise.cpp index d7f86771a66..746bb89bec9 100644 --- a/lib/Optimizer/Transforms/EraseNoise.cpp +++ b/lib/Optimizer/Transforms/EraseNoise.cpp @@ -47,7 +47,7 @@ class EraseNoisePass : public cudaq::opt::impl::EraseNoiseBase { auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After erasure:\n" << *op << "\n\n"); } diff --git a/lib/Optimizer/Transforms/EraseNopCalls.cpp b/lib/Optimizer/Transforms/EraseNopCalls.cpp index ef35056b056..d334bf75f5f 100644 --- a/lib/Optimizer/Transforms/EraseNopCalls.cpp +++ b/lib/Optimizer/Transforms/EraseNopCalls.cpp @@ -51,7 +51,7 @@ class EraseNopCallsPass auto *ctx = &getContext(); RewritePatternSet patterns(ctx); 
patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After erasure:\n" << *op << "\n\n"); } diff --git a/lib/Optimizer/Transforms/EraseVectorCopyCtor.cpp b/lib/Optimizer/Transforms/EraseVectorCopyCtor.cpp index f3daf62f7d1..e35c5709517 100644 --- a/lib/Optimizer/Transforms/EraseVectorCopyCtor.cpp +++ b/lib/Optimizer/Transforms/EraseVectorCopyCtor.cpp @@ -32,11 +32,11 @@ struct PatternAnalysis { // Transformation is: // // %36 = func.call @malloc(%35) : (i64) -> !cc.ptr -// func.call @llvm.memcpy.p0i8.p0i8.i64(%36, %34, %35, %false) : +// func.call @llvm.memcpy.p0.p0.i64(%36, %34, %35, %false) : // (!cc.ptr, !cc.ptr, i64, i1) -> () // %37 = cc.alloca i8[%35 : i64] // %38 = cc.cast %37 : (!cc.ptr>) -> !cc.ptr -// func.call @llvm.memcpy.p0i8.p0i8.i64(%38, %36, %35, %false) : +// func.call @llvm.memcpy.p0.p0.i64(%38, %36, %35, %false) : // (!cc.ptr, !cc.ptr, i64, i1) -> () // func.call @free(%36) : (!cc.ptr) -> () // ─────────────────────────────────────────────────────────────── @@ -68,11 +68,11 @@ class EraseVectorCopyCtorPattern : public OpRewritePattern { if (globalConst) { auto ip = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(analysis.copyFrom); - auto loaded = rewriter.create( - analysis.copyFrom.getLoc(), globalConst); + auto loaded = cudaq::cc::LoadOp::create( + rewriter, analysis.copyFrom.getLoc(), globalConst); rewriter.setInsertionPointAfter(analysis.copyTo); - rewriter.create(analysis.copyTo.getLoc(), loaded, - newStackSlot); + cudaq::cc::StoreOp::create(rewriter, analysis.copyTo.getLoc(), loaded, + newStackSlot); rewriter.restoreInsertionPoint(ip); } else { rewriter.replaceOpWithNewOp( @@ -126,7 +126,7 @@ class EraseVectorCopyCtorPass auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if 
(failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After erasure:\n" << *op << "\n\n"); } diff --git a/lib/Optimizer/Transforms/ExpandControlVeqs.cpp b/lib/Optimizer/Transforms/ExpandControlVeqs.cpp index 0548d181a38..ac227107651 100644 --- a/lib/Optimizer/Transforms/ExpandControlVeqs.cpp +++ b/lib/Optimizer/Transforms/ExpandControlVeqs.cpp @@ -7,7 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" @@ -54,7 +53,7 @@ class ExpandPat : public OpRewritePattern { // The veq is not added the newControls, so it will be dropped for (size_t i = 0; i < *size; ++i) { auto ext = - rewriter.create(op.getLoc(), veqVal, i); + quake::ExtractRefOp::create(rewriter, op.getLoc(), veqVal, i); newControls.push_back(ext); update = true; } diff --git a/lib/Optimizer/Transforms/ExpandMeasurements.cpp b/lib/Optimizer/Transforms/ExpandMeasurements.cpp index 17682471337..45117d5ee47 100644 --- a/lib/Optimizer/Transforms/ExpandMeasurements.cpp +++ b/lib/Optimizer/Transforms/ExpandMeasurements.cpp @@ -8,14 +8,17 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +namespace cudaq::opt { +#define GEN_PASS_DEF_EXPANDMEASUREMENTS +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt + using namespace mlir; // Only an individual qubit measurement returns a scalar token. Both @@ -42,63 +45,59 @@ class ExpandRewritePattern : public OpRewritePattern { // in. 
unsigned numQubits = 0u; for (auto v : measureOp.getTargets()) - if (v.getType().template isa()) + if (isa(v.getType())) ++numQubits; Value totalToRead = - rewriter.template create(loc, numQubits, 64); + arith::ConstantIntOp::create(rewriter, loc, numQubits, 64); auto i64Ty = rewriter.getI64Type(); for (auto v : measureOp.getTargets()) - if (v.getType().template isa()) { - Value vecSz = rewriter.template create(loc, i64Ty, v); - totalToRead = - rewriter.template create(loc, totalToRead, vecSz); + if (isa(v.getType())) { + Value vecSz = quake::VeqSizeOp::create(rewriter, loc, i64Ty, v); + totalToRead = arith::AddIOp::create(rewriter, loc, totalToRead, vecSz); } // 2. Create the buffer. auto i1Ty = rewriter.getI1Type(); auto i8Ty = rewriter.getI8Type(); - Value buff = - rewriter.template create(loc, i8Ty, totalToRead); + Value buff = cudaq::cc::AllocaOp::create(rewriter, loc, i8Ty, totalToRead); // 3. Measure each individual qubit and insert the result, in order, into // the buffer. For registers/vectors, loop over the entire set of qubits. 
- Value buffOff = rewriter.template create(loc, 0, 64); - Value one = rewriter.template create(loc, 1, 64); + Value buffOff = arith::ConstantIntOp::create(rewriter, loc, 0, 64); + Value one = arith::ConstantIntOp::create(rewriter, loc, 1, 64); auto measTy = quake::MeasureType::get(rewriter.getContext()); for (auto v : measureOp.getTargets()) { if (isa(v.getType())) { - auto meas = rewriter.template create(loc, measTy, v).getMeasOut(); - auto bit = - rewriter.template create(loc, i1Ty, meas); - Value addr = rewriter.template create( - loc, cudaq::cc::PointerType::get(i8Ty), buff, buffOff); - auto bitByte = rewriter.template create( - loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); - rewriter.template create(loc, bitByte, addr); - buffOff = rewriter.template create(loc, buffOff, one); + auto meas = A::create(rewriter, loc, measTy, v).getMeasOut(); + auto bit = quake::DiscriminateOp::create(rewriter, loc, i1Ty, meas); + Value addr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, cudaq::cc::PointerType::get(i8Ty), buff, buffOff); + auto bitByte = cudaq::cc::CastOp::create( + rewriter, loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); + cudaq::cc::StoreOp::create(rewriter, loc, bitByte, addr); + buffOff = arith::AddIOp::create(rewriter, loc, buffOff, one); } else { assert(isa(v.getType())); - Value vecSz = rewriter.template create(loc, i64Ty, v); + Value vecSz = quake::VeqSizeOp::create(rewriter, loc, i64Ty, v); cudaq::opt::factory::createInvariantLoop( rewriter, loc, vecSz, [&](OpBuilder &builder, Location loc, Region &, Block &block) { Value iv = block.getArgument(0); - Value qv = - builder.template create(loc, v, iv); - auto meas = builder.template create(loc, measTy, qv); - auto bit = builder.template create( - loc, i1Ty, meas.getMeasOut()); + Value qv = quake::ExtractRefOp::create(builder, loc, v, iv); + auto meas = A::create(builder, loc, measTy, qv); + auto bit = quake::DiscriminateOp::create(builder, loc, i1Ty, + meas.getMeasOut()); if (auto registerName = 
measureOp.getRegisterNameAttr()) meas.setRegisterName(registerName); - Value offset = - builder.template create(loc, iv, buffOff); - auto addr = builder.template create( - loc, cudaq::cc::PointerType::get(i8Ty), buff, offset); - auto bitByte = rewriter.template create( - loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); - builder.template create(loc, bitByte, addr); + Value offset = arith::AddIOp::create(builder, loc, iv, buffOff); + auto addr = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(i8Ty), buff, + offset); + auto bitByte = cudaq::cc::CastOp::create( + builder, loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); + cudaq::cc::StoreOp::create(builder, loc, bitByte, addr); }); - buffOff = rewriter.template create(loc, buffOff, vecSz); + buffOff = arith::AddIOp::create(rewriter, loc, buffOff, vecSz); } } @@ -110,7 +109,7 @@ class ExpandRewritePattern : public OpRewritePattern { auto ptrArrI1Ty = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i1Ty)); auto buffCast = - rewriter.template create(loc, ptrArrI1Ty, buff); + cudaq::cc::CastOp::create(rewriter, loc, ptrArrI1Ty, buff); rewriter.template replaceOpWithNewOp( disc, stdvecTy, buffCast, totalToRead); } @@ -136,13 +135,13 @@ class ResetRewrite : public OpRewritePattern { auto loc = resetOp.getLoc(); auto veqArg = resetOp.getTargets(); auto i64Ty = rewriter.getI64Type(); - Value vecSz = rewriter.create(loc, i64Ty, veqArg); + Value vecSz = quake::VeqSizeOp::create(rewriter, loc, i64Ty, veqArg); cudaq::opt::factory::createInvariantLoop( rewriter, loc, vecSz, [&](OpBuilder &builder, Location loc, Region &, Block &block) { Value iv = block.getArgument(0); - Value qv = builder.create(loc, veqArg, iv); - builder.create(loc, TypeRange{}, qv); + Value qv = quake::ExtractRefOp::create(builder, loc, veqArg, iv); + quake::ResetOp::create(builder, loc, TypeRange{}, qv); }); rewriter.eraseOp(resetOp); return success(); @@ -150,8 +149,9 @@ class ResetRewrite : public OpRewritePattern { }; 
class ExpandMeasurementsPass - : public cudaq::opt::ExpandMeasurementsBase { + : public cudaq::opt::impl::ExpandMeasurementsBase { public: + using Base::Base; void runOnOperation() override { auto *op = getOperation(); auto *ctx = &getContext(); diff --git a/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp b/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp index bf82726a1a4..c1da88569e5 100644 --- a/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp +++ b/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp @@ -8,7 +8,6 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -95,10 +94,10 @@ class AllocaPattern : public OpRewritePattern { SmallVector memAllocs; for (auto memTy : stqTy.getMembers()) memAllocs.emplace_back( - rewriter.create(loc, memTy).getResult()); + quake::AllocaOp::create(rewriter, loc, memTy).getResult()); // 2. Create a value of the original struq type using quake.make_struq. auto aggregate = - rewriter.create(loc, stqTy, memAllocs); + quake::MakeStruqOp::create(rewriter, loc, stqTy, memAllocs); // 3. Walk all the uses. If they are quake.get_member operations, replace // them with direct uses. for (auto *user : llvm::make_early_inc_range(allocOp->getUsers())) @@ -119,7 +118,7 @@ class AllocaPattern : public OpRewritePattern { // Split the aggregate veq into a sequence of distinct alloca of ref. for (std::size_t i = 0; i < size; ++i) - newAllocs.emplace_back(rewriter.create(loc, refTy)); + newAllocs.emplace_back(quake::AllocaOp::create(rewriter, loc, refTy)); if (usesAreConvertible(allocOp)) { // Visit all users and replace them accordingly. 
@@ -150,7 +149,7 @@ class AllocaPattern : public OpRewritePattern { rewriter.setInsertionPoint(dealloc); auto deloc = dealloc.getLoc(); for (std::size_t i = 0; i < size - 1; ++i) - rewriter.create(deloc, newAllocs[i]); + quake::DeallocOp::create(rewriter, deloc, newAllocs[i]); rewriter.replaceOpWithNewOp(dealloc, newAllocs[size - 1]); continue; @@ -215,20 +214,17 @@ class DeallocPattern : public OpRewritePattern { } auto loc = dealloc.getLoc(); - // 1. Split the aggregate alloc into a sequence of distinct dealloc of - // ref. if (auto veqTy = dyn_cast(allocTy)) { generateDeallocs(veqTy, rewriter, loc, alloc); } else if (auto stqTy = dyn_cast(allocTy)) { - // Process a struq in memberwise fashion. for (auto iter : llvm::enumerate(stqTy.getMembers())) { Type memTy = iter.value(); - auto mem = rewriter.create(loc, memTy, alloc, - iter.index()); + auto mem = quake::GetMemberOp::create(rewriter, loc, memTy, alloc, + iter.index()); if (auto veqTy = dyn_cast(memTy)) generateDeallocs(veqTy, rewriter, loc, mem); else - rewriter.create(loc, mem); + quake::DeallocOp::create(rewriter, loc, mem); } } @@ -243,8 +239,8 @@ class DeallocPattern : public OpRewritePattern { std::size_t size = veqTy.getSize(); for (std::size_t i = 0; i < size; ++i) { - Value r = rewriter.create(loc, alloc, i); - rewriter.create(loc, r); + Value r = quake::ExtractRefOp::create(rewriter, loc, alloc, i); + quake::DeallocOp::create(rewriter, loc, r); } }; }; @@ -284,7 +280,7 @@ class FactorQuantumAllocationsPass func::FuncOp func = getOperation(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) + if (failed(applyPatternsGreedily(func, std::move(patterns)))) return failure(); return success(); } @@ -294,7 +290,7 @@ class FactorQuantumAllocationsPass func::FuncOp func = getOperation(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) + if 
(failed(applyPatternsGreedily(func, std::move(patterns)))) return failure(); return success(); } diff --git a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp index a6ce7e9dab2..ca5518185e6 100644 --- a/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp +++ b/lib/Optimizer/Transforms/GenDeviceCodeLoader.cpp @@ -99,11 +99,11 @@ class GenerateDeviceCodeLoaderPass auto funcOp = dyn_cast(op); if (!funcOp) continue; - if (!funcOp.getName().startswith(cudaq::runtime::cudaqGenPrefixName)) + if (!funcOp.getName().starts_with(cudaq::runtime::cudaqGenPrefixName)) continue; if (funcOp->hasAttr(cudaq::generatorAnnotation) || funcOp.empty()) continue; - if (funcOp.getName().endswith(".entry")) + if (funcOp.getName().ends_with(".entry")) continue; auto className = funcOp.getName().drop_front(cudaq::runtime::cudaqGenPrefixLength); @@ -160,36 +160,38 @@ class GenerateDeviceCodeLoaderPass strOut << *op << '\n'; strOut << "\n}\n" << '\0'; - auto devCode = builder.create( - loc, cudaq::opt::factory::getStringType(ctx, funcCode.size()), + auto devCode = LLVM::GlobalOp::create( + builder, loc, + cudaq::opt::factory::getStringType(ctx, funcCode.size()), /*isConstant=*/true, LLVM::Linkage::Private, className.str() + "CodeHolder.extract_device_code", builder.getStringAttr(funcCode), /*alignment=*/0); - auto devName = builder.create( - loc, cudaq::opt::factory::getStringType(ctx, className.size() + 1), + auto devName = LLVM::GlobalOp::create( + builder, loc, + cudaq::opt::factory::getStringType(ctx, className.size() + 1), /*isConstant=*/true, LLVM::Linkage::Private, className.str() + "CodeHolder.extract_device_name", builder.getStringAttr(className.str() + '\0'), /*alignment=*/0); - auto initFun = builder.create( - loc, className.str() + ".init_func", + auto initFun = LLVM::LLVMFuncOp::create( + builder, loc, className.str() + ".init_func", LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), {})); auto insPt = 
builder.saveInsertionPoint(); - auto *initFunEntry = initFun.addEntryBlock(); + auto *initFunEntry = initFun.addEntryBlock(builder); builder.setInsertionPointToStart(initFunEntry); - auto devRef = builder.create( - loc, cudaq::opt::factory::getPointerType(devName.getType()), + auto devRef = LLVM::AddressOfOp::create( + builder, loc, cudaq::opt::factory::getPointerType(devName.getType()), devName.getSymName()); - auto codeRef = builder.create( - loc, cudaq::opt::factory::getPointerType(devCode.getType()), + auto codeRef = LLVM::AddressOfOp::create( + builder, loc, cudaq::opt::factory::getPointerType(devCode.getType()), devCode.getSymName()); - auto castDevRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), devRef); - auto castCodeRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), codeRef); - builder.create(loc, std::nullopt, - cudaq::runtime::deviceCodeHolderAdd, - ValueRange{castDevRef, castCodeRef}); + auto castDevRef = LLVM::BitcastOp::create( + builder, loc, cudaq::opt::factory::getPointerType(ctx), devRef); + auto castCodeRef = LLVM::BitcastOp::create( + builder, loc, cudaq::opt::factory::getPointerType(ctx), codeRef); + LLVM::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::deviceCodeHolderAdd, + ValueRange{castDevRef, castCodeRef}); auto kernName = funcOp.getSymName().str(); if (!jitTime && mangledNameMap && !mangledNameMap.empty() && @@ -198,10 +200,11 @@ class GenerateDeviceCodeLoaderPass auto getEntryRef = [&](auto kernName) -> Value { auto hostFuncNameAttr = mangledNameMap.getAs(kernName); auto hostFuncName = hostFuncNameAttr.getValue(); - if (hostFuncName.endswith("_PyKernelEntryPointRewrite")) { + if (hostFuncName.ends_with("_PyKernelEntryPointRewrite")) { // This is a Python module, so there is no kernel host entry point. 
- auto zero = builder.create(loc, 0, 64); - return builder.create(loc, ptrTy, zero); + auto zero = arith::ConstantIntOp::create( + builder, loc, builder.getIntegerType(64), 0); + return cudaq::cc::CastOp::create(builder, loc, ptrTy, zero); } auto hostFuncOp = module.lookupSymbol(hostFuncName); if (!hostFuncOp) { @@ -211,9 +214,10 @@ class GenerateDeviceCodeLoaderPass {}, module); hostFuncOp.setPrivate(); } - auto entryRef = builder.create( - loc, hostFuncOp.getFunctionType(), hostFuncOp.getSymName()); - return builder.create(loc, ptrTy, entryRef); + auto entryRef = func::ConstantOp::create(builder, loc, + hostFuncOp.getFunctionType(), + hostFuncOp.getSymName()); + return cudaq::cc::FuncToPtrOp::create(builder, loc, ptrTy, entryRef); }; auto castEntryRef = getEntryRef(kernName); @@ -223,27 +227,27 @@ class GenerateDeviceCodeLoaderPass auto nameTy = cudaq::opt::factory::getStringType(ctx, kernName.size() + 1); // The original kernel's name was already created. - auto devRef = builder.create( - loc, cudaq::opt::factory::getPointerType(nameTy), + auto devRef = LLVM::AddressOfOp::create( + builder, loc, cudaq::opt::factory::getPointerType(nameTy), kernName + "CodeHolder.extract_device_name"); - auto ccPtr = builder.create(loc, ptrTy, devRef); - builder.create(loc, std::nullopt, - cudaq::runtime::registerRunnableKernel, - ValueRange{ccPtr, castEntryRef}); + auto ccPtr = cudaq::cc::CastOp::create(builder, loc, ptrTy, devRef); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::registerRunnableKernel, + ValueRange{ccPtr, castEntryRef}); } else { - auto deviceRef = builder.create( - loc, funcOp.getFunctionType(), funcOp.getSymName()); + auto deviceRef = func::ConstantOp::create( + builder, loc, funcOp.getFunctionType(), funcOp.getSymName()); auto castDeviceRef = - builder.create(loc, ptrTy, deviceRef); + cudaq::cc::FuncToPtrOp::create(builder, loc, ptrTy, deviceRef); auto castKernNameRef = - builder.create(loc, ptrTy, devRef); - builder.create( - loc, 
std::nullopt, cudaq::runtime::registerLinkableKernel, + cudaq::cc::CastOp::create(builder, loc, ptrTy, devRef); + func::CallOp::create( + builder, loc, TypeRange{}, cudaq::runtime::registerLinkableKernel, ValueRange{castEntryRef, castKernNameRef, castDeviceRef}); } } - builder.create(loc, ValueRange{}); + LLVM::ReturnOp::create(builder, loc, ValueRange{}); builder.restoreInsertionPoint(insPt); cudaq::opt::factory::createGlobalCtorCall( module, mlir::FlatSymbolRefAttr::get(ctx, initFun.getName())); diff --git a/lib/Optimizer/Transforms/GenKernelExecution.cpp b/lib/Optimizer/Transforms/GenKernelExecution.cpp index 377f16a24b5..14075be4b17 100644 --- a/lib/Optimizer/Transforms/GenKernelExecution.cpp +++ b/lib/Optimizer/Transforms/GenKernelExecution.cpp @@ -11,8 +11,6 @@ #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/Builder/Marshal.h" #include "cudaq/Optimizer/Builder/Runtime.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "clang/Basic/Version.h" @@ -20,7 +18,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ToolOutputFile.h" -#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/IR/Diagnostics.h" #include "mlir/Transforms/Passes.h" #include @@ -58,7 +55,7 @@ zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ModuleOp module, if (!(cudaq::cc::isDynamicType(ty) || cudaq::opt::marshal::isStateType(ty) || isa(ty))) - v = builder.create(loc, v); + v = cudaq::cc::LoadOp::create(builder, loc, v); // Python will pass a std::vector to us here. Unpack it. auto pear = cudaq::opt::marshal::unpackAnyStdVectorBool( loc, builder, module, v, ty, heapTracker); @@ -98,18 +95,18 @@ zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ModuleOp module, // will match the memory layout of the small struct. 
auto pairTy = cudaq::cc::StructType::get( ctx, ArrayRef{first.getType(), second.getType()}); - auto tmp = builder.create(loc, pairTy); - auto tmp1 = builder.create( - loc, cudaq::cc::PointerType::get(first.getType()), tmp); - builder.create(loc, first, tmp1); - auto tmp2 = builder.create( - loc, cudaq::cc::PointerType::get(second.getType()), tmp, + auto tmp = cudaq::cc::AllocaOp::create(builder, loc, pairTy); + auto tmp1 = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(first.getType()), tmp); + cudaq::cc::StoreOp::create(builder, loc, first, tmp1); + auto tmp2 = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(second.getType()), tmp, ArrayRef{1}); - builder.create(loc, second, tmp2); + cudaq::cc::StoreOp::create(builder, loc, second, tmp2); auto devPtrTy = cudaq::cc::PointerType::get(devTy); - Value devVal = builder.create(loc, devPtrTy, tmp); + Value devVal = cudaq::cc::CastOp::create(builder, loc, devPtrTy, tmp); if (!cudaq::cc::isDynamicType(devTy)) - devVal = builder.create(loc, devVal); + devVal = cudaq::cc::LoadOp::create(builder, loc, devVal); result.emplace_back(argPos, devVal, devTy); continue; } @@ -118,7 +115,7 @@ zipArgumentsWithDeviceTypes(Location loc, OpBuilder &builder, ModuleOp module, if (isa(devTy) && isa((*argIter).getType()) && !cudaq::cc::isDynamicType(devTy)) { - Value devVal = builder.create(loc, *argIter); + Value devVal = cudaq::cc::LoadOp::create(builder, loc, *argIter); result.emplace_back(argPos, devVal, devTy); continue; } @@ -179,8 +176,8 @@ class GenerateKernelExecution // Create the function that we'll fill. 
auto funcType = FunctionType::get(ctx, {ptrPtrType, ptrPtrType}, {i64Ty}); - auto argsCreatorFunc = builder.create( - loc, classNameStr + ".argsCreator", funcType); + auto argsCreatorFunc = func::FuncOp::create( + builder, loc, classNameStr + ".argsCreator", funcType); OpBuilder::InsertionGuard guard(builder); auto *entry = argsCreatorFunc.addEntryBlock(); builder.setInsertionPointToStart(entry); @@ -193,23 +190,25 @@ class GenerateKernelExecution // bug in the code that is calling this argsCreator. // Get the array of void* args. - auto argsArray = builder.create( - loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(ptrI8Ty)), + auto argsArray = cudaq::cc::CastOp::create( + builder, loc, + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(ptrI8Ty)), entry->getArgument(0)); // Loop over the array and cast the void* to the host-side type. SmallVector pseudoArgs; for (auto iter : llvm::enumerate(passedHostArgTys)) { std::int32_t i = iter.index(); - auto parg = builder.create( - loc, ptrPtrType, argsArray, ArrayRef{i}); + auto parg = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrPtrType, argsArray, + ArrayRef{i}); Type ty = iter.value(); // parg is a pointer to a pointer as it is an element of an array of // pointers. Always dereference the first layer here. - Value deref = builder.create(loc, parg); + Value deref = cudaq::cc::LoadOp::create(builder, loc, parg); if (!isa(ty)) ty = cudaq::cc::PointerType::get(ty); - pseudoArgs.push_back(builder.create(loc, ty, deref)); + pseudoArgs.push_back(cudaq::cc::CastOp::create(builder, loc, ty, deref)); } // Zip the arguments with the device side argument types. 
Recall that some @@ -220,32 +219,33 @@ class GenerateKernelExecution cudaq::opt::marshal::createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( loc, builder, module, pseudoArgs, passedDevArgTys, heapTracker); - auto sizeScratch = builder.create(loc, i64Ty); + auto sizeScratch = cudaq::cc::AllocaOp::create(builder, loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) return cudaq::opt::marshal::genSizeOfDynamicMessageBuffer( loc, builder, module, msgStructTy, zippy, sizeScratch); - return builder.create(loc, i64Ty, msgStructTy); + return cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, msgStructTy); }(); // Allocate the message buffer on the heap. It must outlive this call. - auto buff = builder.create(loc, ptrI8Ty, "malloc", - ValueRange(messageBufferSize)); + auto buff = func::CallOp::create(builder, loc, ptrI8Ty, "malloc", + ValueRange(messageBufferSize)); Value rawMessageBuffer = buff.getResult(0); Value msgBufferPrefix = - builder.create(loc, structPtrTy, rawMessageBuffer); + cudaq::cc::CastOp::create(builder, loc, structPtrTy, rawMessageBuffer); // Populate the message buffer with the pointer-free argument values. if (hasDynamicSignature) { - auto addendumScratch = builder.create(loc, ptrI8Ty); + auto addendumScratch = cudaq::cc::AllocaOp::create(builder, loc, ptrI8Ty); Value prefixSize = - builder.create(loc, i64Ty, msgStructTy); - auto arrMessageBuffer = builder.create( - loc, cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)), + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, msgStructTy); + auto arrMessageBuffer = cudaq::cc::CastOp::create( + builder, loc, + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i8Ty)), rawMessageBuffer); // Compute the position of the addendum. 
- Value addendumPtr = builder.create( - loc, ptrI8Ty, arrMessageBuffer, + Value addendumPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrI8Ty, arrMessageBuffer, ArrayRef{prefixSize}); cudaq::opt::marshal::populateMessageBuffer(loc, builder, module, msgBufferPrefix, zippy, @@ -258,9 +258,9 @@ class GenerateKernelExecution cudaq::opt::marshal::maybeFreeHeapAllocations(loc, builder, heapTracker); // Return the message buffer and its size in bytes. - builder.create(loc, rawMessageBuffer, - entry->getArgument(1)); - builder.create(loc, ValueRange{messageBufferSize}); + cudaq::cc::StoreOp::create(builder, loc, rawMessageBuffer, + entry->getArgument(1)); + func::ReturnOp::create(builder, loc, ValueRange{messageBufferSize}); // Note: the .argsCreator will have allocated space for a static result in // the message buffer. If the kernel returns a dynamic result, the launch @@ -282,27 +282,27 @@ class GenerateKernelExecution auto *ctx = builder.getContext(); auto thunkTy = cudaq::opt::marshal::getThunkType(ctx); auto thunk = - builder.create(loc, classNameStr + ".thunk", thunkTy); + func::FuncOp::create(builder, loc, classNameStr + ".thunk", thunkTy); OpBuilder::InsertionGuard guard(builder); auto *thunkEntry = thunk.addEntryBlock(); builder.setInsertionPointToStart(thunkEntry); - auto castOp = builder.create(loc, structPtrTy, - thunkEntry->getArgument(0)); + auto castOp = cudaq::cc::CastOp::create(builder, loc, structPtrTy, + thunkEntry->getArgument(0)); auto isClientServer = thunkEntry->getArgument(1); auto i64Ty = builder.getI64Type(); // Compute the struct size without the trailing bytes, structSize. Value structSize = - builder.create(loc, i64Ty, structTy); + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, structTy); // Compute location of trailing bytes. 
auto bufferPtrTy = cudaq::opt::factory::getIndexedObjectType(builder.getI8Type()); - Value extendedBuffer = builder.create( - loc, bufferPtrTy, thunkEntry->getArgument(0)); + Value extendedBuffer = cudaq::cc::CastOp::create( + builder, loc, bufferPtrTy, thunkEntry->getArgument(0)); auto ptrI8Ty = cudaq::cc::PointerType::get(builder.getI8Type()); - Value trailingData = builder.create( - loc, ptrI8Ty, extendedBuffer, structSize); + Value trailingData = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrI8Ty, extendedBuffer, structSize); // Unpack the arguments in the struct and build the argument list for // the call to the kernel code. @@ -310,7 +310,7 @@ class GenerateKernelExecution const std::int32_t offset = funcTy.getNumInputs(); if (positNullary) { for (auto inp : funcOp.getFunctionType().getInputs()) - args.push_back(builder.create(loc, inp)); + args.push_back(cudaq::cc::UndefOp::create(builder, loc, inp)); } else { for (auto inp : llvm::enumerate(funcTy.getInputs())) { auto [a, t] = cudaq::opt::marshal::processInputValue( @@ -320,12 +320,13 @@ class GenerateKernelExecution args.push_back(a); } } - auto call = builder.create( - loc, funcTy.getResults(), funcOp.getName(), args); + auto call = cudaq::cc::NoInlineCallOp::create( + builder, loc, funcTy.getResults(), funcOp.getName(), args, ArrayAttr(), + ArrayAttr()); // After the kernel call, clean up any `Array` allocations during kernel // executions. 
- builder.create(loc, std::nullopt, - cudaq::runtime::cleanupArrays, ValueRange{}); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::cleanupArrays, ValueRange{}); const bool hasVectorResult = funcTy.getNumResults() == 1 && isa(funcTy.getResult(0)); @@ -341,16 +342,18 @@ class GenerateKernelExecution builder.setInsertionPointToEnd(currentBlock); auto eleTy = structTy.getMember(offset); auto memTy = cudaq::cc::PointerType::get(eleTy); - auto mem = builder.create( - loc, memTy, castOp, SmallVector{offset}); + auto mem = cudaq::cc::ComputePtrOp::create( + builder, loc, memTy, castOp, + SmallVector{offset}); auto resPtrTy = cudaq::cc::PointerType::get(call.getResult(0).getType()); - auto castMem = builder.create(loc, resPtrTy, mem); - builder.create(loc, call.getResult(0), castMem); - builder.create(loc, isClientServer, thenBlock, - elseBlock); + auto castMem = cudaq::cc::CastOp::create(builder, loc, resPtrTy, mem); + cudaq::cc::StoreOp::create(builder, loc, call.getResult(0), castMem); + cf::CondBranchOp::create(builder, loc, isClientServer, thenBlock, + elseBlock); builder.setInsertionPointToEnd(thenBlock); - auto resAsArg = builder.create( - loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), mem); + auto resAsArg = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(thunkTy.getResults()[0]), + mem); auto retOffset = cudaq::opt::marshal::genComputeReturnOffset( loc, builder, funcTy, structTy); // createDynamicResult allocates a new buffer and packs the input values @@ -359,11 +362,11 @@ class GenerateKernelExecution // NB: This code only handles one dimensional vectors of static types. It // will have to be changed if there is a need to return recursively // dynamic structures, i.e., vectors of vectors. 
- auto res = builder.create( - loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", + auto res = func::CallOp::create( + builder, loc, thunkTy.getResults()[0], "__nvqpp_createDynamicResult", ValueRange{thunkEntry->getArgument(0), structSize, resAsArg, retOffset}); - builder.create(loc, res.getResult(0)); + func::ReturnOp::create(builder, loc, res.getResult(0)); builder.setInsertionPointToEnd(elseBlock); // For the else case, the span was already copied to the block. } else { @@ -376,15 +379,15 @@ class GenerateKernelExecution o < static_cast(funcTy.getNumResults()); ++o) { auto eleTy = structTy.getMember(offset + o); auto memTy = cudaq::cc::PointerType::get(eleTy); - auto mem = builder.create( - loc, memTy, castOp, + auto mem = cudaq::cc::ComputePtrOp::create( + builder, loc, memTy, castOp, SmallVector{offset + o}); auto resTy = call.getResult(o).getType(); auto resPtrTy = cudaq::cc::PointerType::get(resTy); Value castMem = mem; if (resPtrTy != mem.getType()) - castMem = builder.create(loc, resPtrTy, mem); - builder.create(loc, call.getResult(o), castMem); + castMem = cudaq::cc::CastOp::create(builder, loc, resPtrTy, mem); + cudaq::cc::StoreOp::create(builder, loc, call.getResult(o), castMem); } } } @@ -392,9 +395,9 @@ class GenerateKernelExecution // that no messages need to be sent and that the CPU and QPU code share a // memory space. Therefore, making any copies can be skipped. 
auto zeroRes = - builder.create(loc, thunkTy.getResults()[0], - "__nvqpp_zeroDynamicResult", ValueRange{}); - builder.create(loc, zeroRes.getResult(0)); + func::CallOp::create(builder, loc, thunkTy.getResults()[0], + "__nvqpp_zeroDynamicResult", ValueRange{}); + func::ReturnOp::create(builder, loc, zeroRes.getResult(0)); return thunk; } @@ -430,12 +433,12 @@ class GenerateKernelExecution cudaq::opt::marshal::createEmptyHeapTracker(loc, builder); auto zippy = zipArgumentsWithDeviceTypes( loc, builder, module, blockValues, devFuncTy.getInputs(), heapTracker); - auto sizeScratch = builder.create(loc, i64Ty); + auto sizeScratch = cudaq::cc::AllocaOp::create(builder, loc, i64Ty); auto messageBufferSize = [&]() -> Value { if (hasDynamicSignature) return cudaq::opt::marshal::genSizeOfDynamicMessageBuffer( loc, builder, module, structTy, zippy, sizeScratch); - return builder.create(loc, i64Ty, structTy); + return cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, structTy); }(); Value msgBufferPrefix; @@ -445,17 +448,17 @@ class GenerateKernelExecution Value extendedStructSize; if (cudaq::opt::marshal::isCodegenPackedData(codegenKind)) { auto rawMessageBuffer = - builder.create(loc, i8Ty, messageBufferSize); - msgBufferPrefix = - builder.create(loc, structPtrTy, rawMessageBuffer); + cudaq::cc::AllocaOp::create(builder, loc, i8Ty, messageBufferSize); + msgBufferPrefix = cudaq::cc::CastOp::create(builder, loc, structPtrTy, + rawMessageBuffer); if (hasDynamicSignature) { auto addendumScratch = - builder.create(loc, ptrI8Ty); + cudaq::cc::AllocaOp::create(builder, loc, ptrI8Ty); Value prefixSize = - builder.create(loc, i64Ty, structTy); - Value addendumPtr = builder.create( - loc, ptrI8Ty, rawMessageBuffer, + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, structTy); + Value addendumPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrI8Ty, rawMessageBuffer, ArrayRef{prefixSize}); cudaq::opt::marshal::populateMessageBuffer( loc, builder, module, msgBufferPrefix, zippy, 
addendumPtr, @@ -468,11 +471,11 @@ class GenerateKernelExecution cudaq::opt::marshal::maybeFreeHeapAllocations(loc, builder, heapTracker); extendedStructSize = messageBufferSize; Value loadThunk = - builder.create(loc, thunkTy, thunkFunc.getName()); + func::ConstantOp::create(builder, loc, thunkTy, thunkFunc.getName()); castLoadThunk = - builder.create(loc, ptrI8Ty, loadThunk); + cudaq::cc::FuncToPtrOp::create(builder, loc, ptrI8Ty, loadThunk); castTemp = - builder.create(loc, ptrI8Ty, msgBufferPrefix); + cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, msgBufferPrefix); resultOffset = cudaq::opt::marshal::genComputeReturnOffset( loc, builder, devFuncTy, structTy); } @@ -481,25 +484,26 @@ class GenerateKernelExecution if (cudaq::opt::marshal::isCodegenArgumentGather(codegenKind)) { // 1) Allocate and initialize a std::vector object. const unsigned count = devFuncTy.getInputs().size(); - auto stdVec = builder.create( - loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); + auto stdVec = cudaq::cc::AllocaOp::create( + builder, loc, cudaq::opt::factory::stlVectorType(ptrI8Ty)); auto arrPtrTy = cudaq::cc::ArrayType::get(ctx, ptrI8Ty, count); - Value buffer = builder.create(loc, arrPtrTy); - auto buffSize = builder.create(loc, i64Ty, arrPtrTy); + Value buffer = cudaq::cc::AllocaOp::create(builder, loc, arrPtrTy); + auto buffSize = + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, arrPtrTy); auto ptrPtrTy = cudaq::cc::PointerType::get(ptrI8Ty); - auto cast1 = builder.create(loc, ptrPtrTy, buffer); + auto cast1 = cudaq::cc::CastOp::create(builder, loc, ptrPtrTy, buffer); auto ptr3Ty = cudaq::cc::PointerType::get(ptrPtrTy); - auto stdVec0 = builder.create(loc, ptr3Ty, stdVec); - builder.create(loc, cast1, stdVec0); - auto cast2 = builder.create(loc, i64Ty, buffer); - auto endBuff = builder.create(loc, cast2, buffSize); - auto cast3 = builder.create(loc, ptrPtrTy, endBuff); - auto stdVec1 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{1}); - builder.create(loc, cast3, 
stdVec1); - auto stdVec2 = builder.create( - loc, ptr3Ty, stdVec, ArrayRef{2}); - builder.create(loc, cast3, stdVec2); + auto stdVec0 = cudaq::cc::CastOp::create(builder, loc, ptr3Ty, stdVec); + cudaq::cc::StoreOp::create(builder, loc, cast1, stdVec0); + auto cast2 = cudaq::cc::CastOp::create(builder, loc, i64Ty, buffer); + auto endBuff = arith::AddIOp::create(builder, loc, cast2, buffSize); + auto cast3 = cudaq::cc::CastOp::create(builder, loc, ptrPtrTy, endBuff); + auto stdVec1 = cudaq::cc::ComputePtrOp::create( + builder, loc, ptr3Ty, stdVec, ArrayRef{1}); + cudaq::cc::StoreOp::create(builder, loc, cast3, stdVec1); + auto stdVec2 = cudaq::cc::ComputePtrOp::create( + builder, loc, ptr3Ty, stdVec, ArrayRef{2}); + cudaq::cc::StoreOp::create(builder, loc, cast3, stdVec2); // 2) Iterate over the arguments passed in and populate the vector. SmallVector blockArgs{ @@ -508,12 +512,13 @@ class GenerateKernelExecution unsigned j = 0; for (std::int32_t i = 0, N = blockArgs.size(); i < N; ++i, ++j) { auto blkArg = blockArgs[i]; - auto pos = builder.create( - loc, ptrPtrTy, buffer, ArrayRef{i}); + auto pos = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrPtrTy, buffer, + ArrayRef{i}); if (isa(blkArg.getType())) { auto castArg = - builder.create(loc, ptrI8Ty, blkArg); - builder.create(loc, castArg, pos); + cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, blkArg); + cudaq::cc::StoreOp::create(builder, loc, castArg, pos); continue; } Value temp; @@ -522,39 +527,41 @@ class GenerateKernelExecution cudaq::opt::factory::structUsesTwoArguments( devFuncTy.getInput(j))) { temp = - builder.create(loc, devFuncTy.getInput(j)); - auto part1 = builder.create( - loc, cudaq::cc::PointerType::get(blkArg.getType()), temp); - builder.create(loc, blkArg, part1); + cudaq::cc::AllocaOp::create(builder, loc, devFuncTy.getInput(j)); + auto part1 = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(blkArg.getType()), + temp); + cudaq::cc::StoreOp::create(builder, loc, 
blkArg, part1); auto blkArg2 = blockArgs[++i]; - auto cast2 = builder.create( - loc, + auto cast2 = cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get( cudaq::cc::ArrayType::get(blkArg2.getType())), temp); - auto part2 = builder.create( - loc, cudaq::cc::PointerType::get(blkArg2.getType()), cast2, - ArrayRef{1}); - builder.create(loc, blkArg2, part2); + auto part2 = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(blkArg2.getType()), + cast2, ArrayRef{1}); + cudaq::cc::StoreOp::create(builder, loc, blkArg2, part2); } else if (isa(blkArg.getType())) { // In C++, callables are already resolved. There is nothing to pass. - temp = builder.create(loc, 0, 64); + temp = arith::ConstantIntOp::create(builder, loc, 0, 64); } else { - temp = builder.create(loc, blkArg.getType()); - builder.create(loc, blkArg, temp); + temp = cudaq::cc::AllocaOp::create(builder, loc, blkArg.getType()); + cudaq::cc::StoreOp::create(builder, loc, blkArg, temp); } - auto castTemp = builder.create(loc, ptrI8Ty, temp); - builder.create(loc, castTemp, pos); + auto castTemp = cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, temp); + cudaq::cc::StoreOp::create(builder, loc, castTemp, pos); } - vecArgPtrs = builder.create(loc, ptrI8Ty, stdVec); + vecArgPtrs = cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, stdVec); } // Prepare to call the `launchKernel` runtime library entry point. 
- Value loadKernName = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), + Value loadKernName = LLVM::AddressOfOp::create( + builder, loc, + cudaq::opt::factory::getPointerType(kernelNameObj.getType()), kernelNameObj.getSymName()); auto castLoadKernName = - builder.create(loc, ptrI8Ty, loadKernName); + cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, loadKernName); auto hostFuncTy = hostFunc.getFunctionType(); assert((hostFuncTy.getResults().empty() || @@ -570,13 +577,13 @@ class GenerateKernelExecution return; Type res0Ty = structTy.getMember(offset); auto ptrResTy = cudaq::cc::PointerType::get(res0Ty); - auto rptr = builder.create(loc, ptrI8Ty, - spanReturned, 0); + auto rptr = cudaq::cc::ExtractValueOp::create(builder, loc, ptrI8Ty, + spanReturned, 0); launchResultToFree = rptr; - auto rIntPtr = builder.create(loc, i64Ty, rptr); - auto zero = builder.create(loc, 0, 64); - auto cmp = builder.create(loc, arith::CmpIPredicate::ne, - rIntPtr, zero); + auto rIntPtr = cudaq::cc::CastOp::create(builder, loc, i64Ty, rptr); + auto zero = arith::ConstantIntOp::create(builder, loc, 0, 64); + auto cmp = arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::ne, + rIntPtr, zero); auto *currentBlock = builder.getBlock(); auto *reg = currentBlock->getParent(); auto *thenBlock = builder.createBlock(reg); @@ -584,22 +591,22 @@ class GenerateKernelExecution auto *endifBlock = builder.createBlock( reg, reg->end(), TypeRange{ptrResTy}, SmallVector(1, loc)); builder.setInsertionPointToEnd(currentBlock); - builder.create(loc, cmp, thenBlock, elseBlock); + cf::CondBranchOp::create(builder, loc, cmp, thenBlock, elseBlock); builder.setInsertionPointToEnd(thenBlock); // dynamic result was returned. // We need to free() this buffer before the end of this function. 
auto rStructPtr = - builder.create(loc, structPtrTy, rptr); - Value lRes = builder.create( - loc, ptrResTy, rStructPtr, + cudaq::cc::CastOp::create(builder, loc, structPtrTy, rptr); + Value lRes = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrResTy, rStructPtr, ArrayRef{offset}); - builder.create(loc, endifBlock, ArrayRef{lRes}); + cf::BranchOp::create(builder, loc, endifBlock, ArrayRef{lRes}); builder.setInsertionPointToEnd(elseBlock); // span was returned in the original buffer. - Value mRes = builder.create( - loc, ptrResTy, msgBufferPrefix, + Value mRes = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrResTy, msgBufferPrefix, ArrayRef{offset}); - builder.create(loc, endifBlock, ArrayRef{mRes}); + cf::BranchOp::create(builder, loc, endifBlock, ArrayRef{mRes}); builder.setInsertionPointToEnd(endifBlock); launchResult = endifBlock->getArgument(0); }; @@ -608,8 +615,8 @@ class GenerateKernelExecution switch (codegenKind) { case 0: { assert(vecArgPtrs && castLoadThunk); - auto launch = builder.create( - loc, cudaq::opt::factory::getDynamicBufferType(ctx), + auto launch = func::CallOp::create( + builder, loc, cudaq::opt::factory::getDynamicBufferType(ctx), cudaq::runtime::launchKernelHybridFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset, vecArgPtrs}); @@ -617,8 +624,8 @@ class GenerateKernelExecution } break; case 1: { assert(!vecArgPtrs && castLoadThunk); - auto launch = builder.create( - loc, cudaq::opt::factory::getDynamicBufferType(ctx), + auto launch = func::CallOp::create( + builder, loc, cudaq::opt::factory::getDynamicBufferType(ctx), cudaq::runtime::launchKernelFuncName, ArrayRef{castLoadKernName, castLoadThunk, castTemp, extendedStructSize, resultOffset}); @@ -626,16 +633,16 @@ class GenerateKernelExecution } break; case 2: { assert(vecArgPtrs && !castLoadThunk); - builder.create( - loc, std::nullopt, cudaq::runtime::launchKernelStreamlinedFuncName, - ArrayRef{castLoadKernName, vecArgPtrs}); + 
func::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::launchKernelStreamlinedFuncName, + ArrayRef{castLoadKernName, vecArgPtrs}); // For this codegen kind, we drop any results on the floor and return // random data in registers and/or off the stack. This maintains parity // with any pre-existing kernel launchers. SmallVector garbage; for (auto ty : hostFunc.getFunctionType().getResults()) - garbage.push_back(builder.create(loc, ty)); - builder.create(loc, garbage); + garbage.push_back(cudaq::cc::UndefOp::create(builder, loc, ty)); + func::ReturnOp::create(builder, loc, garbage); return; } default: @@ -654,16 +661,16 @@ class GenerateKernelExecution // reference. if (resultVal) { // Static values. std::vector are necessarily sret, see below. - auto resPtr = builder.create( - loc, ptrResTy, msgBufferPrefix, + auto resPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrResTy, msgBufferPrefix, ArrayRef{offset}); Type castToTy = cudaq::cc::PointerType::get(hostFuncTy.getResult(0)); auto castResPtr = [&]() -> Value { if (castToTy == ptrResTy) return resPtr; - return builder.create(loc, castToTy, resPtr); + return cudaq::cc::CastOp::create(builder, loc, castToTy, resPtr); }(); - results.push_back(builder.create(loc, castResPtr)); + results.push_back(cudaq::cc::LoadOp::create(builder, loc, castResPtr)); } else { // This is an sret return. Check if device is returning a span. If it // is, then we will need to convert it to a std::vector here. 
The vector @@ -673,51 +680,52 @@ class GenerateKernelExecution dyn_cast(devFuncTy.getResult(0))) { auto eleTy = spanTy.getElementType(); auto ptrTy = cudaq::cc::PointerType::get(eleTy); - auto gep0 = builder.create( - loc, cudaq::cc::PointerType::get(ptrTy), launchResult, + auto gep0 = cudaq::cc::ComputePtrOp::create( + builder, loc, cudaq::cc::PointerType::get(ptrTy), launchResult, SmallVector{0}); - auto dataPtr = builder.create(loc, gep0); + auto dataPtr = cudaq::cc::LoadOp::create(builder, loc, gep0); auto lenPtrTy = cudaq::cc::PointerType::get(i64Ty); - auto gep1 = builder.create( - loc, lenPtrTy, launchResult, + auto gep1 = cudaq::cc::ComputePtrOp::create( + builder, loc, lenPtrTy, launchResult, SmallVector{1}); - auto vecLen = builder.create(loc, gep1); + auto vecLen = cudaq::cc::LoadOp::create(builder, loc, gep1); if (spanTy.getElementType() == builder.getI1Type()) { cudaq::opt::marshal::genStdvecBoolFromInitList(loc, builder, arg0, dataPtr, vecLen); } else { Value tSize = - builder.create(loc, i64Ty, eleTy); + cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, eleTy); cudaq::opt::marshal::genStdvecTFromInitList(loc, builder, arg0, dataPtr, tSize, vecLen); } // free(nullptr) is defined to be a nop in the standard. - builder.create(loc, std::nullopt, "free", - ArrayRef{launchResultToFree}); + func::CallOp::create(builder, loc, TypeRange{}, "free", + ArrayRef{launchResultToFree}); } else { // Otherwise, we can just copy the aggregate into the sret memory // block. Uses the size of the host function's sret pointer element // type for the memcpy, so the device should return an (aggregate) // value of suitable size. 
- auto resPtr = builder.create( - loc, ptrResTy, msgBufferPrefix, + auto resPtr = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrResTy, msgBufferPrefix, ArrayRef{offset}); auto castMsgBuff = - builder.create(loc, ptrI8Ty, resPtr); + cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, resPtr); Type eleTy = cast(arg0.getType()).getElementType(); - Value bytes = builder.create(loc, i64Ty, eleTy); - auto notVolatile = builder.create(loc, 0, 1); - auto castArg0 = builder.create(loc, ptrI8Ty, arg0); - builder.create( - loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + Value bytes = cudaq::cc::SizeOfOp::create(builder, loc, i64Ty, eleTy); + auto notVolatile = arith::ConstantIntOp::create(builder, loc, 0, 1); + auto castArg0 = + cudaq::cc::CastOp::create(builder, loc, ptrI8Ty, arg0); + func::CallOp::create( + builder, loc, TypeRange{}, cudaq::llvmMemCopyIntrinsic, ValueRange{castArg0, castMsgBuff, bytes, notVolatile}); } } } // Return the result (if any). - builder.create(loc, results); + func::ReturnOp::create(builder, loc, results); } /// Generate a function to be executed at load-time which will register the @@ -729,32 +737,34 @@ class GenerateKernelExecution auto module = getOperation(); auto *ctx = builder.getContext(); auto ptrType = cudaq::cc::PointerType::get(builder.getI8Type()); - auto initFun = builder.create( - loc, classNameStr + ".kernelRegFunc", + auto initFun = LLVM::LLVMFuncOp::create( + builder, loc, classNameStr + ".kernelRegFunc", LLVM::LLVMFunctionType::get(cudaq::opt::factory::getVoidType(ctx), {})); OpBuilder::InsertionGuard guard(builder); - auto *initFunEntry = initFun.addEntryBlock(); + auto *initFunEntry = initFun.addEntryBlock(builder); builder.setInsertionPointToStart(initFunEntry); - auto kernRef = builder.create( - loc, cudaq::opt::factory::getPointerType(kernelNameObj.getType()), + auto kernRef = LLVM::AddressOfOp::create( + builder, loc, + cudaq::opt::factory::getPointerType(kernelNameObj.getType()), kernelNameObj.getSymName()); - auto 
castKernRef = builder.create(loc, ptrType, kernRef); - builder.create(loc, std::nullopt, - cudaq::runtime::CudaqRegisterKernelName, - ValueRange{castKernRef}); + auto castKernRef = + cudaq::cc::CastOp::create(builder, loc, ptrType, kernRef); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::CudaqRegisterKernelName, + ValueRange{castKernRef}); if (cudaq::opt::marshal::isCodegenPackedData(codegenKind)) { // Register the argsCreator too auto ptrPtrType = cudaq::cc::PointerType::get(ptrType); auto argsCreatorFuncType = FunctionType::get( ctx, {ptrPtrType, ptrPtrType}, {builder.getI64Type()}); - Value loadArgsCreator = builder.create( - loc, argsCreatorFuncType, argsCreatorFunc.getName()); - auto castLoadArgsCreator = - builder.create(loc, ptrType, loadArgsCreator); - builder.create( - loc, std::nullopt, cudaq::runtime::CudaqRegisterArgsCreator, - ValueRange{castKernRef, castLoadArgsCreator}); + Value loadArgsCreator = func::ConstantOp::create( + builder, loc, argsCreatorFuncType, argsCreatorFunc.getName()); + auto castLoadArgsCreator = cudaq::cc::FuncToPtrOp::create( + builder, loc, ptrType, loadArgsCreator); + func::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::CudaqRegisterArgsCreator, + ValueRange{castKernRef, castLoadArgsCreator}); } // Check if this is a lambda mangled name @@ -771,29 +781,31 @@ class GenerateKernelExecution // Create this global name, it is unique for any lambda // bc classNameStr contains the parentFunc + varName - auto lambdaName = builder.create( - loc, + auto lambdaName = LLVM::GlobalOp::create( + builder, loc, cudaq::opt::factory::getStringType(ctx, demangledName.size() + 1), /*isConstant=*/true, LLVM::Linkage::External, classNameStr + ".lambdaName", builder.getStringAttr(demangledName + '\0'), /*alignment=*/0); builder.restoreInsertionPoint(insertPoint); - auto lambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(lambdaName.getType()), + auto lambdaRef = LLVM::AddressOfOp::create( + builder, 
loc, + cudaq::opt::factory::getPointerType(lambdaName.getType()), lambdaName.getSymName()); - auto castLambdaRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); - auto castKernelRef = builder.create( - loc, cudaq::opt::factory::getPointerType(ctx), castKernRef); - builder.create(loc, std::nullopt, - cudaq::runtime::CudaqRegisterLambdaName, - ValueRange{castLambdaRef, castKernelRef}); + auto castLambdaRef = cudaq::cc::CastOp::create( + builder, loc, cudaq::opt::factory::getPointerType(ctx), lambdaRef); + auto castKernelRef = cudaq::cc::CastOp::create( + builder, loc, cudaq::opt::factory::getPointerType(ctx), + castKernRef); + LLVM::CallOp::create(builder, loc, TypeRange{}, + cudaq::runtime::CudaqRegisterLambdaName, + ValueRange{castLambdaRef, castKernelRef}); } } - builder.create(loc, ValueRange{}); + LLVM::ReturnOp::create(builder, loc, ValueRange{}); return initFun; } @@ -915,7 +927,7 @@ class GenerateKernelExecution SmallVector workList; for (auto &op : *module.getBody()) if (auto funcOp = dyn_cast(op)) - if (funcOp.getName().startswith(cudaq::runtime::cudaqGenPrefixName) && + if (funcOp.getName().starts_with(cudaq::runtime::cudaqGenPrefixName) && cudaq::opt::marshal::hasLegalType(funcOp.getFunctionType()) && !funcOp.empty() && !funcOp->hasAttr(cudaq::generatorAnnotation)) workList.push_back(funcOp); @@ -936,7 +948,7 @@ class GenerateKernelExecution { // Create the run kernel and drop the return result on the floor. 
auto runKern = - builder.create(loc, runKernName, runKernTy); + func::FuncOp::create(builder, loc, runKernName, runKernTy); auto unitAttr = builder.getUnitAttr(); runKern->setAttr(cudaq::entryPointAttrName, unitAttr); runKern->setAttr(cudaq::kernelAttrName, unitAttr); @@ -949,11 +961,11 @@ class GenerateKernelExecution OpBuilder::InsertionGuard guard(builder); Block *entry = runKern.addEntryBlock(); builder.setInsertionPointToStart(entry); - auto kern = builder.create( - loc, epKern.getFunctionType().getResults(), epKern.getName(), - entry->getArguments()); - builder.create(loc, kern.getResults()); - builder.create(loc); + auto kern = func::CallOp::create( + builder, loc, epKern.getFunctionType().getResults(), + epKern.getName(), entry->getArguments()); + cudaq::cc::LogOutputOp::create(builder, loc, kern.getResults()); + func::ReturnOp::create(builder, loc); runKernels.push_back(runKern); } { @@ -973,8 +985,8 @@ class GenerateKernelExecution runKernTy, /*hasThisPointer=*/false, module); runEntryKernTy = FunctionType::get(ctx, runEntryKernTy.getInputs(), {}); - auto runEntryKern = builder.create( - loc, runKernEntryName, runEntryKernTy); + auto runEntryKern = func::FuncOp::create( + builder, loc, runKernEntryName, runEntryKernTy); auto origEntryFunc = [&]() -> func::FuncOp { auto mangledNameMap = module->getAttrOfType( cudaq::runtime::mangledNameMap); @@ -989,7 +1001,7 @@ class GenerateKernelExecution OpBuilder::InsertionGuard guard(builder); Block *entry = runEntryKern.addEntryBlock(); builder.setInsertionPointToStart(entry); - builder.create(loc); + func::ReturnOp::create(builder, loc); // Append this to the kernel name map. auto dict = module->getAttrOfType( cudaq::runtime::mangledNameMap); @@ -1022,8 +1034,9 @@ class GenerateKernelExecution auto classNameStr = className.str(); // Create a constant with the name of the kernel as a C string. 
- auto kernelNameObj = builder.create( - loc, cudaq::opt::factory::getStringType(ctx, className.size() + 1), + auto kernelNameObj = LLVM::GlobalOp::create( + builder, loc, + cudaq::opt::factory::getStringType(ctx, className.size() + 1), /*isConstant=*/true, LLVM::Linkage::External, classNameStr + ".kernelName", builder.getStringAttr(classNameStr + '\0'), /*alignment=*/0); diff --git a/lib/Optimizer/Transforms/GetConcreteMatrix.cpp b/lib/Optimizer/Transforms/GetConcreteMatrix.cpp index d36b26fef14..7b64cfda9fe 100644 --- a/lib/Optimizer/Transforms/GetConcreteMatrix.cpp +++ b/lib/Optimizer/Transforms/GetConcreteMatrix.cpp @@ -7,8 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -84,8 +82,7 @@ class GetConcreteMatrixPass auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) signalPassFailure(); } }; diff --git a/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp b/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp index 383da3b5eb0..d805931bc22 100644 --- a/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp +++ b/lib/Optimizer/Transforms/GlobalizeArrayValues.cpp @@ -8,10 +8,7 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" @@ -169,8 +166,9 @@ struct ConstantArrayPattern return 
failure(); auto loc = conarr.getLoc(); if (!extracts.empty()) { - auto base = rewriter.create( - loc, cudaq::cc::PointerType::get(conarr.getType()), globalName); + auto base = cudaq::cc::AddressOfOp::create( + rewriter, loc, cudaq::cc::PointerType::get(conarr.getType()), + globalName); auto elePtrTy = cudaq::cc::PointerType::get(eleTy); for (auto extract : extracts) { SmallVector args; @@ -183,8 +181,8 @@ struct ConstantArrayPattern } OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(extract); - auto addrVal = - rewriter.create(loc, elePtrTy, base, args); + auto addrVal = cudaq::cc::ComputePtrOp::create(rewriter, loc, elePtrTy, + base, args); rewriter.replaceOpWithNewOp(extract, addrVal); } } @@ -196,8 +194,9 @@ struct ConstantArrayPattern rewriter.eraseOp(store); } if (loadAsValue) { - auto base = rewriter.create( - loc, cudaq::cc::PointerType::get(conarr.getType()), globalName); + auto base = cudaq::cc::AddressOfOp::create( + rewriter, loc, cudaq::cc::PointerType::get(conarr.getType()), + globalName); rewriter.replaceOpWithNewOp(conarr, base); } return success(); @@ -229,10 +228,10 @@ struct ReifySpanPattern : public OpRewritePattern { auto loc = reify.getLoc(); auto eleTy = cast(reify.getType()).getElementType(); - auto numEle = rewriter.create( - loc, conArr.getConstantValues().size(), 64); - Value buff = rewriter.create(loc, eleTy, numEle); - rewriter.create(loc, conArr, buff); + auto numEle = arith::ConstantIntOp::create( + rewriter, loc, conArr.getConstantValues().size(), 64); + Value buff = cudaq::cc::AllocaOp::create(rewriter, loc, eleTy, numEle); + cudaq::cc::StoreOp::create(rewriter, loc, conArr, buff); rewriter.replaceOpWithNewOp( reify, reify.getType(), buff, numEle); return success(); @@ -261,26 +260,26 @@ struct ReifySpanPattern : public OpRewritePattern { std::int64_t len = stringAttr.getValue().size() + 1; Type litTy = cudaq::cc::PointerType::get( cudaq::cc::ArrayType::get(ctx, rewriter.getI8Type(), len)); - auto strLit = 
rewriter.create( - loc, litTy, stringAttr); - auto size = rewriter.create(loc, len, 64); - members.push_back(rewriter.create( - loc, cudaq::cc::CharspanType::get(ctx), strLit, size)); + auto strLit = cudaq::cc::CreateStringLiteralOp::create( + rewriter, loc, litTy, stringAttr); + auto size = arith::ConstantIntOp::create(rewriter, loc, len, 64); + members.push_back(cudaq::cc::StdvecInitOp::create( + rewriter, loc, cudaq::cc::CharspanType::get(ctx), strLit, size)); } else if (auto a = dyn_cast(attr)) { if (auto floatTy = dyn_cast(eleTy)) { APFloat floatVal(floatTy.getFloatSemantics(), a.getValue()); auto floatAttr = FloatAttr::get(floatTy, floatVal); members.push_back( - rewriter.create(loc, floatAttr, floatTy)); + arith::ConstantOp::create(rewriter, loc, floatTy, floatAttr)); } else { - members.push_back(rewriter.create(loc, a, eleTy)); + members.push_back(arith::ConstantOp::create(rewriter, loc, eleTy, a)); } } else if (auto a = dyn_cast(attr)) { - members.push_back(rewriter.create(loc, a, eleTy)); + members.push_back(arith::ConstantOp::create(rewriter, loc, eleTy, a)); } else { // Unexpected attribute. 
LLVM_DEBUG(llvm::dbgs() << "unexpected attribute: " << attr << '\n'); - members.push_back(rewriter.create(loc, eleTy)); + members.push_back(cudaq::cc::PoisonOp::create(rewriter, loc, eleTy)); } } @@ -294,22 +293,24 @@ struct ReifySpanPattern : public OpRewritePattern { } } - auto size = rewriter.create(loc, members.size(), 64); - auto buff = rewriter.create(loc, eleTy, size); + auto size = arith::ConstantIntOp::create(rewriter, loc, members.size(), 64); + auto buff = cudaq::cc::AllocaOp::create(rewriter, loc, eleTy, size); for (auto iter : llvm::enumerate(members)) { std::int32_t idx = iter.index(); auto m = iter.value(); if (hasBoolElems) { auto unit = UnitAttr::get(rewriter.getContext()); - m = rewriter.create(loc, eleTy, m, UnitAttr(), unit); + m = cudaq::cc::CastOp::create(rewriter, loc, eleTy, m, UnitAttr(), + unit); } auto ptrEleTy = cudaq::cc::PointerType::get(eleTy); - auto ptr = rewriter.create( - loc, ptrEleTy, buff, ArrayRef{idx}); - rewriter.create(loc, m, ptr); + auto ptr = cudaq::cc::ComputePtrOp::create( + rewriter, loc, ptrEleTy, buff, + ArrayRef{idx}); + cudaq::cc::StoreOp::create(rewriter, loc, m, ptr); } Value result = - rewriter.create(loc, ty, buff, size); + cudaq::cc::StdvecInitOp::create(rewriter, loc, ty, buff, size); return result; } @@ -338,7 +339,7 @@ class GlobalizeArrayValuesPass counter); LLVM_DEBUG(llvm::dbgs() << "Before globalizing array values:\n" << module << '\n'); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) { + if (failed(applyPatternsGreedily(module, std::move(patterns)))) { signalPassFailure(); return; } diff --git a/lib/Optimizer/Transforms/LambdaLifting.cpp b/lib/Optimizer/Transforms/LambdaLifting.cpp index d62e34c079c..930e1a8ff07 100644 --- a/lib/Optimizer/Transforms/LambdaLifting.cpp +++ b/lib/Optimizer/Transforms/LambdaLifting.cpp @@ -8,14 +8,10 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/CC/CCDialect.h" -#include 
"cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/IRMapping.h" -#include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -191,24 +187,25 @@ struct CreateLambdaOpPattern argTys.push_back(lambdaTy); argTys.append(sig.getInputs().begin(), sig.getInputs().end()); auto funTy = FunctionType::get(ctx, argTys, sig.getResults()); - auto thunk = rewriter.create( - loc, getThunkLambdaName(counter), funTy, emptyDict); + auto thunk = func::FuncOp::create( + rewriter, loc, getThunkLambdaName(counter), funTy, emptyDict); thunk.setPrivate(); thunk->setAttr(cudaq::kernelAttrName, rewriter.getUnitAttr()); auto *entry = thunk.addEntryBlock(); rewriter.setInsertionPointToEnd(entry); SmallVector callableArgs; if (!freeValues.empty()) { - auto closureData = rewriter.create( - loc, freeValues.getTypes(), thunk.getArgument(0)); + auto closureData = cudaq::cc::CallableClosureOp::create( + rewriter, loc, freeValues.getTypes(), thunk.getArgument(0)); callableArgs.append(closureData.getResults().begin(), closureData.getResults().end()); } callableArgs.append(thunk.getArguments().begin() + 1, thunk.getArguments().end()); - auto result = rewriter.create( - loc, sig.getResults(), getLiftedLambdaName(counter), callableArgs); - rewriter.create(loc, result.getResults()); + auto result = + func::CallOp::create(rewriter, loc, sig.getResults(), + getLiftedLambdaName(counter), callableArgs); + func::ReturnOp::create(rewriter, loc, result.getResults()); } // Create a new lambda function to lift the expression into. 
This function @@ -220,8 +217,8 @@ struct CreateLambdaOpPattern freeValues.getTypes().end()); argTys.append(sig.getInputs().begin(), sig.getInputs().end()); auto funTy = FunctionType::get(ctx, argTys, sig.getResults()); - auto func = rewriter.create( - loc, getLiftedLambdaName(counter), funTy, emptyDict); + auto func = func::FuncOp::create( + rewriter, loc, getLiftedLambdaName(counter), funTy, emptyDict); func.setPrivate(); func->setAttr(cudaq::kernelAttrName, rewriter.getUnitAttr()); auto *entry = func.addEntryBlock(); @@ -256,7 +253,7 @@ struct CreateLambdaOpPattern rewriter.setInsertionPointToEnd(entry); auto nextBlockIter = ++func.getBlocks().begin(); // Connect entry block to cloned code. - rewriter.create(loc, &*nextBlockIter); + cf::BranchOp::create(rewriter, loc, &*nextBlockIter); } SymbolRefAttr closureSymbol = @@ -311,12 +308,12 @@ struct ComputeActionOpPattern if (!actionCallee) return failure(); auto computeArgs = getArgs(comAct.getCompute()); - rewriter.create(loc, TypeRange{}, computeCallee, - /*isAdjoint=*/comAct.getIsDagger(), - ValueRange{}, computeArgs); - rewriter.create(loc, TypeRange{}, actionCallee, - /*isAdjoint=*/false, ValueRange{}, - getArgs(comAct.getAction())); + quake::ApplyOp::create(rewriter, loc, TypeRange{}, computeCallee, + /*isAdjoint=*/comAct.getIsDagger(), ValueRange{}, + computeArgs); + quake::ApplyOp::create(rewriter, loc, TypeRange{}, actionCallee, + /*isAdjoint=*/false, ValueRange{}, + getArgs(comAct.getAction())); rewriter.replaceOpWithNewOp( comAct, TypeRange{}, computeCallee, /*isAdjoint=*/!comAct.getIsDagger(), ValueRange{}, computeArgs); @@ -363,8 +360,8 @@ struct CallCallableOpPattern // For a callable, call the trampoline with the closure data. 
if (auto lambTy = dyn_cast(closureTy)) { - auto dynFunc = rewriter.create( - loc, call.getFunctionType(), closure); + auto dynFunc = cudaq::cc::CallableFuncOp::create( + rewriter, loc, call.getFunctionType(), closure); rewriter.replaceOpWithNewOp(call, dynFunc, operands); return success(); @@ -373,7 +370,7 @@ struct CallCallableOpPattern // For a normal function, there is no closure to deal with. if (auto sig = dyn_cast(closureTy)) { auto dynFunc = - rewriter.create(loc, sig, closure); + cudaq::cc::CallableFuncOp::create(rewriter, loc, sig, closure); rewriter.replaceOpWithNewOp(call, dynFunc, operands.drop_front()); return success(); @@ -436,7 +433,7 @@ class LambdaLiftingPass patterns.insert(ctx, constantPropagation); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) + if (failed(applyPatternsGreedily(module, std::move(patterns)))) signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index ee38a8dc151..5708d099439 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -8,10 +8,7 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" @@ -46,7 +43,7 @@ class LiftArrayAllocPass LLVM_DEBUG(llvm::dbgs() << "Before lifting constant array: " << func << '\n'); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) + if (failed(applyPatternsGreedily(func, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() diff --git a/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc b/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc index e92c22867fd..b9757990b19 100644 --- 
a/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc +++ b/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -47,7 +47,7 @@ public: auto valuesAttr = rewriter.getArrayAttr(values); auto loc = alloc.getLoc(); Value conArr = - rewriter.create(loc, arrTy, valuesAttr); + cudaq::cc::ConstantArrayOp::create(rewriter, loc, arrTy, valuesAttr); assert(conArr && "must have created the constant array"); LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n'); @@ -84,7 +84,8 @@ public: // load, eleTy, conArr, // ArrayRef{offset}); - auto extractValue = rewriter.create( + auto extractValue = cudaq::cc::ExtractValueOp::create( + rewriter, loc, eleTy, conArr, ArrayRef{offset}); rewriter.replaceAllUsesWith(load, extractValue); @@ -108,7 +109,7 @@ public: if (cannotEraseAlloc) { rewriter.setInsertionPointAfter(alloc); - rewriter.create(loc, conArr, alloc); + cudaq::cc::StoreOp::create(rewriter, loc, conArr, alloc); return success(); } rewriter.eraseOp(alloc); diff --git a/lib/Optimizer/Transforms/LinearCtrlRelations.cpp b/lib/Optimizer/Transforms/LinearCtrlRelations.cpp index 995eec5a365..547e2fbc29e 100644 --- a/lib/Optimizer/Transforms/LinearCtrlRelations.cpp +++ b/lib/Optimizer/Transforms/LinearCtrlRelations.cpp @@ -148,8 +148,8 @@ class LinearCtrlRelationsPass DominanceInfo domInfo(func); RewritePatternSet patterns(ctx); patterns.insert(ctx, domInfo); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) { + if (failed( + applyPatternsGreedily(func.getOperation(), std::move(patterns)))) { signalPassFailure(); } } diff --git a/lib/Optimizer/Transforms/LoopNormalize.cpp 
b/lib/Optimizer/Transforms/LoopNormalize.cpp index 08bfd51bf37..a3e7bb254f2 100644 --- a/lib/Optimizer/Transforms/LoopNormalize.cpp +++ b/lib/Optimizer/Transforms/LoopNormalize.cpp @@ -36,7 +36,7 @@ class LoopNormalizePass auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx, allowClosedInterval, allowBreak); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) { + if (failed(applyPatternsGreedily(op, std::move(patterns)))) { op->emitOpError("could not normalize loop"); signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/LoopNormalizePatterns.inc b/lib/Optimizer/Transforms/LoopNormalizePatterns.inc index eb9b7d33cd5..bed1c04e7d2 100644 --- a/lib/Optimizer/Transforms/LoopNormalizePatterns.inc +++ b/lib/Optimizer/Transforms/LoopNormalizePatterns.inc @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -50,19 +50,19 @@ public: } if (c.hasAlwaysFalseCondition()) { - rewriter.startRootUpdate(loop); + rewriter.startOpModification(loop); rewriter.replaceOpWithNewOp(c.compareOp, 0, 1); loop->setAttr(cudaq::opt::DeadLoopAttr, rewriter.getUnitAttr()); - rewriter.finalizeRootUpdate(loop); + rewriter.finalizeOpModification(loop); return success(); } auto loc = loop.getLoc(); // 1) Set initial value to 0. 
auto ty = c.initialValue.getType(); - rewriter.startRootUpdate(loop); + rewriter.startOpModification(loop); auto createConstantOp = [&](std::int64_t val) -> Value { - return rewriter.create(loc, val, ty); + return arith::ConstantIntOp::create(rewriter, loc, ty, val); }; auto zero = createConstantOp(0); loop->setOperand(c.induction, zero); @@ -74,68 +74,68 @@ public: Value step = c.stepValue; Value lower = c.initialValue; if (!c.stepIsAnAddOp()) - step = rewriter.create(loc, zero, step); + step = arith::SubIOp::create(rewriter, loc, zero, step); if (c.isLinearExpr()) { // Induction is part of a linear expression. Deal with the terms of the // equation. `m` scales the step. `b` is an addend to the lower bound. if (c.addendValue) { if (c.negatedAddend) { // `m * i - b`, u += `b`. - upper = rewriter.create(loc, upper, c.addendValue); + upper = arith::AddIOp::create(rewriter, loc, upper, c.addendValue); } else { // `m * i + b`, u -= `b`. - upper = rewriter.create(loc, upper, c.addendValue); + upper = arith::SubIOp::create(rewriter, loc, upper, c.addendValue); } } if (c.minusOneMult) { // `b - m * i` (b eliminated), multiply lower and step by `-1` (`m` // follows). auto negOne = createConstantOp(-1); - lower = rewriter.create(loc, lower, negOne); - step = rewriter.create(loc, step, negOne); + lower = arith::MulIOp::create(rewriter, loc, lower, negOne); + step = arith::MulIOp::create(rewriter, loc, step, negOne); } if (c.scaleValue) { if (c.reciprocalScale) { // `1/m * i + b` (b eliminated), multiply upper by `m`. - upper = rewriter.create(loc, upper, c.scaleValue); + upper = arith::MulIOp::create(rewriter, loc, upper, c.scaleValue); } else { // `m * i + b` (b eliminated), multiple lower and step by `m`. 
- lower = rewriter.create(loc, lower, c.scaleValue); - step = rewriter.create(loc, step, c.scaleValue); + lower = arith::MulIOp::create(rewriter, loc, lower, c.scaleValue); + step = arith::MulIOp::create(rewriter, loc, step, c.scaleValue); } } } if (!c.isClosedIntervalForm()) { // Note: treating the step as a signed value to process countdown loops as // well as countup loops. - Value negStepCond = rewriter.create( - loc, arith::CmpIPredicate::slt, step, zero); + Value negStepCond = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::slt, step, zero); auto negOne = createConstantOp(-1); Value adj = - rewriter.create(loc, ty, negStepCond, negOne, one); - upper = rewriter.create(loc, upper, adj); + arith::SelectOp::create(rewriter, loc, ty, negStepCond, negOne, one); + upper = arith::SubIOp::create(rewriter, loc, upper, adj); } - Value diff = rewriter.create(loc, upper, lower); - Value disp = rewriter.create(loc, diff, step); + Value diff = arith::SubIOp::create(rewriter, loc, upper, lower); + Value disp = arith::AddIOp::create(rewriter, loc, diff, step); auto cmpOp = cast(c.compareOp); - Value newUpper = rewriter.create(loc, disp, step); + Value newUpper = arith::DivSIOp::create(rewriter, loc, disp, step); if (cudaq::opt::isSignedPredicate(cmpOp.getPredicate())) { - Value noLoopCond = rewriter.create( - loc, arith::CmpIPredicate::sgt, newUpper, zero); - newUpper = - rewriter.create(loc, ty, noLoopCond, newUpper, zero); + Value noLoopCond = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, newUpper, zero); + newUpper = arith::SelectOp::create(rewriter, loc, ty, noLoopCond, + newUpper, zero); } // 3) Rewrite the comparison (!=) and step operations (+1). 
Value v1 = c.getCompareInduction(); rewriter.setInsertionPoint(cmpOp); - Value newCmp = rewriter.create( - cmpOp.getLoc(), arith::CmpIPredicate::ne, v1, newUpper); + Value newCmp = arith::CmpIOp::create( + rewriter, cmpOp.getLoc(), arith::CmpIPredicate::ne, v1, newUpper); cmpOp->replaceAllUsesWith(ValueRange{newCmp}); auto v2 = c.stepOp->getOperand( c.stepIsAnAddOp() && c.shouldCommuteStepOp() ? 1 : 0); rewriter.setInsertionPoint(c.stepOp); - auto newStep = rewriter.create(c.stepOp->getLoc(), v2, one); + auto newStep = arith::AddIOp::create(rewriter, c.stepOp->getLoc(), v2, one); c.stepOp->replaceAllUsesWith(ValueRange{newStep.getResult()}); // 4) Compute original induction value as a loop variant and replace the @@ -144,12 +144,12 @@ public: Block *entry = &loop.getBodyRegion().front(); rewriter.setInsertionPointToStart(entry); Value induct = entry->getArgument(c.induction); - auto mul = rewriter.create(loc, induct, c.stepValue); - Value newInd; - if (c.stepIsAnAddOp()) - newInd = rewriter.create(loc, c.initialValue, mul); - else - newInd = rewriter.create(loc, c.initialValue, mul); + auto mul = arith::MulIOp::create(rewriter, loc, induct, c.stepValue); + auto newInd = [&]() -> Value { + if (c.stepIsAnAddOp()) + return arith::AddIOp::create(rewriter, loc, c.initialValue, mul); + return arith::SubIOp::create(rewriter, loc, c.initialValue, mul); + }(); induct.replaceUsesWithIf(newInd, [&](OpOperand &opnd) { auto *op = opnd.getOwner(); return op != newStep.getOperation() && op != mul && @@ -163,20 +163,20 @@ public: if (!loopResult.use_empty()) { rewriter.setInsertionPointAfter(loop); auto mulRes = - rewriter.create(loc, loopResult, c.stepValue); + arith::MulIOp::create(rewriter, loc, loopResult, c.stepValue); Value recovered; if (c.stepIsAnAddOp()) recovered = - rewriter.create(loc, c.initialValue, mulRes); + arith::AddIOp::create(rewriter, loc, c.initialValue, mulRes); else recovered = - rewriter.create(loc, c.initialValue, mulRes); + 
arith::SubIOp::create(rewriter, loc, c.initialValue, mulRes); loopResult.replaceAllUsesExcept(recovered, mulRes.getOperation()); } } loop->setAttr(cudaq::opt::NormalizedLoopAttr, rewriter.getUnitAttr()); - rewriter.finalizeRootUpdate(loop); + rewriter.finalizeOpModification(loop); LLVM_DEBUG(llvm::dbgs() << "loop after normalization: " << loop << '\n'); return success(); } diff --git a/lib/Optimizer/Transforms/LoopPeeling.cpp b/lib/Optimizer/Transforms/LoopPeeling.cpp index b777e654d7b..0db3383ecff 100644 --- a/lib/Optimizer/Transforms/LoopPeeling.cpp +++ b/lib/Optimizer/Transforms/LoopPeeling.cpp @@ -46,8 +46,8 @@ class LoopPat : public OpRewritePattern { for (auto res : loop.getResults()) afterBlock->addArgument(res.getType(), loop.getLoc()); rewriter.setInsertionPointToEnd(oldLoopBlock); - auto finalBranch = rewriter.create(loop.getLoc(), afterBlock, - loop.getResults()); + auto finalBranch = cf::BranchOp::create(rewriter, loop.getLoc(), afterBlock, + loop.getResults()); // NB: the results of the original loop are now split between the peeled // copy of body and the modified new loop. Introduce explicit block // arguments for the phi node functionality. @@ -75,13 +75,13 @@ class LoopPat : public OpRewritePattern { rewriter.cloneRegionBefore(loop.getBodyRegion(), newLoopBlock); Block *firstBlock = beforeBlock->getNextNode(); rewriter.setInsertionPointToEnd(beforeBlock); - rewriter.create(loop.getLoc(), firstBlock, loopArgs); + cf::BranchOp::create(rewriter, loop.getLoc(), firstBlock, loopArgs); // Replace continue ops with branches to the new-loop-block. Replace break // ops with branches to the after-block. 
auto rewriteBranch = [&](auto op, Block *dest) { rewriter.setInsertionPointToEnd(op->getBlock()); - rewriter.create(op.getLoc(), dest, op.getOperands()); + cf::BranchOp::create(rewriter, op.getLoc(), dest, op.getOperands()); rewriter.eraseOp(op); }; for (Block *b = firstBlock; b != newLoopBlock; b = b->getNextNode()) @@ -116,7 +116,7 @@ class LoopPeelingPass auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) { + if (failed(applyPatternsGreedily(op, std::move(patterns)))) { op->emitOpError("could not peel loop"); signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/LoopUnroll.cpp b/lib/Optimizer/Transforms/LoopUnroll.cpp index c6d0bf83eee..af8c9d75ff2 100644 --- a/lib/Optimizer/Transforms/LoopUnroll.cpp +++ b/lib/Optimizer/Transforms/LoopUnroll.cpp @@ -55,7 +55,7 @@ class LoopUnrollPass : public cudaq::opt::impl::LoopUnrollBase { // iteratively propagated. do { progress = 0; - (void)applyPatternsAndFoldGreedily(op, frozen); + (void)applyPatternsGreedily(op, frozen); } while (progress); } diff --git a/lib/Optimizer/Transforms/LoopUnrollPatterns.inc b/lib/Optimizer/Transforms/LoopUnrollPatterns.inc index 210ff9e3eb1..b8aa500dde8 100644 --- a/lib/Optimizer/Transforms/LoopUnrollPatterns.inc +++ b/lib/Optimizer/Transforms/LoopUnrollPatterns.inc @@ -178,7 +178,7 @@ struct UnrollCountedLoop : public OpRewritePattern { // Propagate the previous iteration number into the new block. This makes // any unneeded computation dead. DCE will clean that up as well. iterationOpers[components->induction] = iterCount; - rewriter.create(loc, cloneRange.first, iterationOpers); + cf::BranchOp::create(rewriter, loc, cloneRange.first, iterationOpers); // Bookkeeping for the next iteration, which uses the new continue block, // `conBlock`, and its arguments. 
setIterationOpers(contBlock->getArguments()); @@ -193,7 +193,7 @@ struct UnrollCountedLoop : public OpRewritePattern { setIterationOpers(contBlock->getArguments()); } [[maybe_unused]] auto lastBranch = - rewriter.create(loc, endBlock, iterationOpers); + cf::BranchOp::create(rewriter, loc, endBlock, iterationOpers); rewriter.replaceOp(loop, endBlock->getArguments()); LLVM_DEBUG(llvm::dbgs() << "after unrolling a loop:\n"; @@ -205,7 +205,7 @@ struct UnrollCountedLoop : public OpRewritePattern { static Value getIntegerConstant(Location loc, Type ty, std::int64_t val, PatternRewriter &rewriter) { auto attr = rewriter.getIntegerAttr(ty, val); - return rewriter.create(loc, ty, attr); + return arith::ConstantOp::create(rewriter, loc, ty, attr); } std::size_t threshold; diff --git a/lib/Optimizer/Transforms/LowerToCFG.cpp b/lib/Optimizer/Transforms/LowerToCFG.cpp index 60908717cdd..cd7466cd2cf 100644 --- a/lib/Optimizer/Transforms/LowerToCFG.cpp +++ b/lib/Optimizer/Transforms/LowerToCFG.cpp @@ -9,8 +9,6 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/Builder/Runtime.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "mlir/IR/PatternMatch.h" @@ -59,8 +57,8 @@ class RewriteScope : public OpRewritePattern { Value stacksave; auto ptrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); if (scopeOp.hasAllocation(/*quantumAllocs=*/false)) { - auto call = rewriter.create( - loc, ptrTy, cudaq::llvmStackSave, ArrayRef{}); + auto call = func::CallOp::create(rewriter, loc, ptrTy, + cudaq::llvmStackSave, ArrayRef{}); stacksave = call.getResult(0); } auto initPos = rewriter.getInsertionPoint(); @@ -71,7 +69,7 @@ class RewriteScope : public OpRewritePattern { endBlock, scopeOp.getResultTypes(), SmallVector(scopeOp.getNumResults(), loc)); scopeResults = continueBlock->getArguments(); - 
rewriter.create(loc, endBlock); + cf::BranchOp::create(rewriter, loc, endBlock); endBlock = continueBlock; } @@ -85,13 +83,12 @@ class RewriteScope : public OpRewritePattern { auto *entryBlock = &scopeOp.getInitRegion().front(); rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, entryBlock, ValueRange{}); + cf::BranchOp::create(rewriter, loc, entryBlock, ValueRange{}); rewriter.inlineRegionBefore(scopeOp.getInitRegion(), endBlock); if (stacksave) { rewriter.setInsertionPointToStart(endBlock); - rewriter.create(loc, ArrayRef{}, - cudaq::llvmStackRestore, - ArrayRef{stacksave}); + func::CallOp::create(rewriter, loc, ArrayRef{}, + cudaq::llvmStackRestore, ArrayRef{stacksave}); } rewriter.replaceOp(scopeOp, scopeResults); return success(); @@ -193,7 +190,7 @@ class RewriteLoop : public OpRewritePattern { Block *continueBlock = rewriter.createBlock( endBlock, loopOp.getResultTypes(), SmallVector(loopOp.getNumResults(), loc)); - rewriter.create(loc, endBlock); + cf::BranchOp::create(rewriter, loc, endBlock); endBlock = continueBlock; } auto comparison = whileCond.getCondition(); @@ -206,14 +203,14 @@ class RewriteLoop : public OpRewritePattern { if (loopOp.isPostConditional()) { // Branch from `initBlock` to getBodyRegion().front(). rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, bodyBlock, loopOperands); + cf::BranchOp::create(rewriter, loc, bodyBlock, loopOperands); // Move the body region blocks between initBlock and end block. rewriter.inlineRegionBefore(loopOp.getBodyRegion(), endBlock); // Replace the condition op with a `cf.cond_br`. rewriter.setInsertionPointToEnd(whileBlock); - rewriter.create(loc, comparison, bodyBlock, - whileCond.getResults(), endBlock, - whileCond.getResults()); + cf::CondBranchOp::create(rewriter, loc, comparison, bodyBlock, + whileCond.getResults(), endBlock, + whileCond.getResults()); rewriter.eraseOp(whileCond); // Move the while region between the body and end block. 
rewriter.inlineRegionBefore(loopOp.getWhileRegion(), endBlock); @@ -222,12 +219,12 @@ class RewriteLoop : public OpRewritePattern { loopOp.hasPythonElse() ? loopOp.getElseEntryBlock() : endBlock; // Branch from `initBlock` to whileRegion().front(). rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, whileBlock, loopOperands); + cf::BranchOp::create(rewriter, loc, whileBlock, loopOperands); // Replace the condition op with a `cf.cond_br` op. rewriter.setInsertionPointToEnd(whileBlock); - rewriter.create(loc, comparison, bodyBlock, - whileCond.getResults(), elseBlock, - whileCond.getResults()); + cf::CondBranchOp::create(rewriter, loc, comparison, bodyBlock, + whileCond.getResults(), elseBlock, + whileCond.getResults()); rewriter.eraseOp(whileCond); // Move the while and body region blocks between initBlock and endBlock. rewriter.inlineRegionBefore(loopOp.getWhileRegion(), endBlock); @@ -238,8 +235,8 @@ class RewriteLoop : public OpRewritePattern { auto *stepBlock = loopOp.getStepBlock(); auto *terminator = stepBlock->getTerminator(); rewriter.setInsertionPointToEnd(stepBlock); - rewriter.create(loc, whileBlock, - terminator->getOperands()); + cf::BranchOp::create(rewriter, loc, whileBlock, + terminator->getOperands()); rewriter.eraseOp(terminator); rewriter.inlineRegionBefore(loopOp.getStepRegion(), endBlock); } diff --git a/lib/Optimizer/Transforms/LowerToCFGPatterns.inc b/lib/Optimizer/Transforms/LowerToCFGPatterns.inc index d9a62e7922f..cfef24dacf0 100644 --- a/lib/Optimizer/Transforms/LowerToCFGPatterns.inc +++ b/lib/Optimizer/Transforms/LowerToCFGPatterns.inc @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. 
* * * * This source code and the accompanying materials are made available under * @@ -61,7 +61,7 @@ public: Block *continueBlock = rewriter.createBlock( endBlock, ifOp.getResultTypes(), SmallVector(ifOp.getNumResults(), loc)); - rewriter.create(loc, endBlock); + cf::BranchOp::create(rewriter, loc, endBlock); endBlock = continueBlock; } auto *thenBlock = &ifOp.getThenRegion().front(); @@ -73,9 +73,9 @@ public: if (hasElse) rewriter.inlineRegionBefore(ifOp.getElseRegion(), endBlock); rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, ifOp.getCondition(), thenBlock, - ifOp.getLinearArgs(), elseBlock, - ifOp.getLinearArgs()); + cf::CondBranchOp::create(rewriter, loc, ifOp.getCondition(), thenBlock, + ifOp.getLinearArgs(), elseBlock, + ifOp.getLinearArgs()); rewriter.replaceOp(ifOp, endBlock->getArguments()); return success(); } diff --git a/lib/Optimizer/Transforms/LowerUnwind.cpp b/lib/Optimizer/Transforms/LowerUnwind.cpp index 8746d617cee..22d4b77380c 100644 --- a/lib/Optimizer/Transforms/LowerUnwind.cpp +++ b/lib/Optimizer/Transforms/LowerUnwind.cpp @@ -8,9 +8,6 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/CC/CCDialect.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "mlir/IR/Dominance.h" @@ -371,17 +368,17 @@ struct ScopeOpPattern : public OpRewritePattern { SmallVector locs(scope.getNumResults(), loc); Block *continueBlock = rewriter.createBlock(nextBlock, scope.getResultTypes(), locs); - rewriter.create(loc, nextBlock); + cf::BranchOp::create(rewriter, loc, nextBlock); nextBlock = continueBlock; } rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, scopeBlock, ValueRange{}); + cf::BranchOp::create(rewriter, loc, scopeBlock, ValueRange{}); // Normal scope exit with inline deallocations. 
for (auto &pr : termAllocMap) { auto *contOp = pr.first; rewriter.setInsertionPoint(contOp); for (auto a : llvm::reverse(pr.second)) - rewriter.create(a.getLoc(), adjustedDeallocArg(a)); + quake::DeallocOp::create(rewriter, a.getLoc(), adjustedDeallocArg(a)); rewriter.replaceOpWithNewOp(contOp, nextBlock, contOp->getOperands()); } @@ -395,12 +392,13 @@ struct ScopeOpPattern : public OpRewritePattern { if (Block *blk = blockInfo.continueBlock) { rewriter.setInsertionPointToEnd(blk); for (auto a : llvm::reverse(qallocas)) - rewriter.create(a->getLoc(), adjustedDeallocArg(a)); + quake::DeallocOp::create(rewriter, a->getLoc(), + adjustedDeallocArg(a)); if (asPrimitive) { Block *landingPad = getLandingPad(infoMap, scope).continueBlock; - rewriter.create(loc, landingPad, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, landingPad, blk->getArguments()); } else { - rewriter.create(loc, blk->getArguments()); + cudaq::cc::ContinueOp::create(rewriter, loc, blk->getArguments()); } scope.getInitRegion().push_back(blk); } @@ -408,12 +406,13 @@ struct ScopeOpPattern : public OpRewritePattern { if (Block *blk = blockInfo.breakBlock) { rewriter.setInsertionPointToEnd(blk); for (auto a : llvm::reverse(qallocas)) - rewriter.create(a->getLoc(), adjustedDeallocArg(a)); + quake::DeallocOp::create(rewriter, a->getLoc(), + adjustedDeallocArg(a)); if (asPrimitive) { Block *landingPad = getLandingPad(infoMap, scope).breakBlock; - rewriter.create(loc, landingPad, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, landingPad, blk->getArguments()); } else { - rewriter.create(loc, blk->getArguments()); + cudaq::cc::BreakOp::create(rewriter, loc, blk->getArguments()); } scope.getInitRegion().push_back(blk); } @@ -421,10 +420,11 @@ struct ScopeOpPattern : public OpRewritePattern { if (Block *blk = blockInfo.returnBlock) { rewriter.setInsertionPointToEnd(blk); for (auto a : llvm::reverse(qallocas)) - rewriter.create(a->getLoc(), adjustedDeallocArg(a)); + 
quake::DeallocOp::create(rewriter, a->getLoc(), + adjustedDeallocArg(a)); assert(asPrimitive); Block *landingPad = getLandingPad(infoMap, scope).returnBlock; - rewriter.create(loc, landingPad, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, landingPad, blk->getArguments()); scope.getInitRegion().push_back(blk); } } @@ -454,8 +454,7 @@ struct FuncLikeOpPattern : public OpRewritePattern { assert(iter != infoMap.opParentMap.end()); if (!func->hasAttr("add_dealloc")) return success(); - rewriter.updateRootInPlace(func, - [&]() { func->removeAttr("add_dealloc"); }); + rewriter.modifyOpInPlace(func, [&]() { func->removeAttr("add_dealloc"); }); if (!iter->second.asPrimitive) { LLVM_DEBUG(llvm::dbgs() << "func was not marked as primitive in map\n"); return success(); @@ -473,7 +472,7 @@ struct FuncLikeOpPattern : public OpRewritePattern { auto *exitOp = pr.first; rewriter.setInsertionPoint(exitOp); for (auto a : llvm::reverse(pr.second)) - rewriter.create(a.getLoc(), adjustedDeallocArg(a)); + quake::DeallocOp::create(rewriter, a.getLoc(), adjustedDeallocArg(a)); } // Here, we handle the unwind return jumps. 
@@ -492,8 +491,9 @@ struct FuncLikeOpPattern : public OpRewritePattern { if (Block *exitBlock = blockInfo.returnBlock) { rewriter.setInsertionPointToEnd(exitBlock); for (auto a : llvm::reverse(qallocas)) - rewriter.create(a->getLoc(), adjustedDeallocArg(a)); - rewriter.create(func.getLoc(), exitBlock->getArguments()); + quake::DeallocOp::create(rewriter, a->getLoc(), + adjustedDeallocArg(a)); + TERM::create(rewriter, func.getLoc(), exitBlock->getArguments()); func.getBody().push_back(exitBlock); } } @@ -531,7 +531,7 @@ struct IfOpPattern : public OpRewritePattern { Block *continueBlock = rewriter.createBlock( endBlock, ifOp.getResultTypes(), SmallVector(ifOp.getNumResults(), loc)); - rewriter.create(loc, endBlock); + cf::BranchOp::create(rewriter, loc, endBlock); endBlock = continueBlock; } auto *thenBlock = &ifOp.getThenRegion().front(); @@ -555,19 +555,19 @@ struct IfOpPattern : public OpRewritePattern { if (auto *blk = blockInfo.continueBlock) { rewriter.setInsertionPointToEnd(blk); auto *dest = getLandingPad(infoMap, ifOp).continueBlock; - rewriter.create(loc, dest, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, dest, blk->getArguments()); tailRegion.push_back(blk); } if (auto *blk = blockInfo.breakBlock) { rewriter.setInsertionPointToEnd(blk); auto *dest = getLandingPad(infoMap, ifOp).breakBlock; - rewriter.create(loc, dest, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, dest, blk->getArguments()); tailRegion.push_back(blk); } if (auto *blk = blockInfo.returnBlock) { rewriter.setInsertionPointToEnd(blk); auto *dest = getLandingPad(infoMap, ifOp).returnBlock; - rewriter.create(loc, dest, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, dest, blk->getArguments()); tailRegion.push_back(blk); } } @@ -639,7 +639,7 @@ struct LoopOpPattern : public OpRewritePattern { Block *continueBlock = rewriter.createBlock( endBlock, loopOp.getResultTypes(), SmallVector(loopOp.getNumResults(), loc)); - rewriter.create(loc, endBlock); + 
cf::BranchOp::create(rewriter, loc, endBlock); endBlock = continueBlock; } auto comparison = whileCond.getCondition(); @@ -662,19 +662,19 @@ struct LoopOpPattern : public OpRewritePattern { assert(details.allocaDomMap.find(pr.first)->second.empty()); if (auto *blk = blockInfo.continueBlock) { rewriter.setInsertionPointToEnd(blk); - rewriter.create(loc, condBlock, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, condBlock, blk->getArguments()); tailRegion.push_back(blk); } if (auto *blk = blockInfo.breakBlock) { rewriter.setInsertionPointToEnd(blk); - rewriter.create(loc, endBlock, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, endBlock, blk->getArguments()); tailRegion.push_back(blk); } if (auto *blk = blockInfo.returnBlock) { rewriter.setInsertionPointToEnd(blk); auto *retBlk = getLandingPad(infoMap, loopOp).returnBlock; assert(retBlk); - rewriter.create(loc, retBlk, blk->getArguments()); + cf::BranchOp::create(rewriter, loc, retBlk, blk->getArguments()); tailRegion.push_back(blk); } } @@ -684,27 +684,27 @@ struct LoopOpPattern : public OpRewritePattern { if (loopOp.isPostConditional()) { // Branch from `initBlock` to getBodyRegion().front(). rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, bodyBlock, loopOperands); + cf::BranchOp::create(rewriter, loc, bodyBlock, loopOperands); // Move the body region blocks between initBlock and end block. rewriter.inlineRegionBefore(loopOp.getBodyRegion(), endBlock); // Replace the condition op with a `cf.cond_br`. rewriter.setInsertionPointToEnd(whileBlock); - rewriter.create(loc, comparison, bodyBlock, - whileCond.getResults(), endBlock, - whileCond.getResults()); + cf::CondBranchOp::create(rewriter, loc, comparison, bodyBlock, + whileCond.getResults(), endBlock, + whileCond.getResults()); rewriter.eraseOp(whileCond); // Move the while region between the body and end block. 
rewriter.inlineRegionBefore(loopOp.getWhileRegion(), endBlock); } else { // Branch from `initBlock` to whileRegion().front(). rewriter.setInsertionPointToEnd(initBlock); - rewriter.create(loc, whileBlock, loopOperands); + cf::BranchOp::create(rewriter, loc, whileBlock, loopOperands); // Replace the condition op with a `cf.cond_br` op. rewriter.setInsertionPointToEnd(whileBlock); - rewriter.create( - loc, comparison, bodyBlock, whileCond.getResults(), - loopOp.hasPythonElse() ? elseBlock : endBlock, - whileCond.getResults()); + cf::CondBranchOp::create(rewriter, loc, comparison, bodyBlock, + whileCond.getResults(), + loopOp.hasPythonElse() ? elseBlock : endBlock, + whileCond.getResults()); rewriter.eraseOp(whileCond); // Move the while and body region blocks between initBlock and endBlock. rewriter.inlineRegionBefore(loopOp.getWhileRegion(), endBlock); @@ -715,8 +715,8 @@ struct LoopOpPattern : public OpRewritePattern { auto *stepBlock = &loopOp.getStepRegion().front(); auto *terminator = stepBlock->getTerminator(); rewriter.setInsertionPointToEnd(stepBlock); - rewriter.create(loc, whileBlock, - terminator->getOperands()); + cf::BranchOp::create(rewriter, loc, whileBlock, + terminator->getOperands()); rewriter.eraseOp(terminator); rewriter.inlineRegionBefore(loopOp.getStepRegion(), endBlock); } @@ -726,7 +726,8 @@ struct LoopOpPattern : public OpRewritePattern { auto *elseBlock = &loopOp.getElseRegion().front(); auto *terminator = elseBlock->getTerminator(); rewriter.setInsertionPointToEnd(elseBlock); - rewriter.create(loc, endBlock, terminator->getOperands()); + cf::BranchOp::create(rewriter, loc, endBlock, + terminator->getOperands()); rewriter.eraseOp(terminator); rewriter.inlineRegionBefore(loopOp.getElseRegion(), endBlock); } diff --git a/lib/Optimizer/Transforms/Mapping.cpp b/lib/Optimizer/Transforms/Mapping.cpp index c7b1f33d910..d89a75c757a 100644 --- a/lib/Optimizer/Transforms/Mapping.cpp +++ b/lib/Optimizer/Transforms/Mapping.cpp @@ -6,7 +6,7 @@ * the 
terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Support/Device.h" #include "cudaq/Support/Placement.h" @@ -14,21 +14,8 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ScopedPrinter.h" +#include "mlir/Analysis/TopologicalSortUtils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Transforms/TopologicalSortUtils.h" - -#define DEBUG_TYPE "quantum-mapper" - -using namespace mlir; - -// Use specific cudaq elements without bringing in the full namespace -using cudaq::Device; -using cudaq::Placement; -using cudaq::QuantumMeasure; - -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// namespace cudaq::opt { #define GEN_PASS_DEF_MAPPINGFUNC @@ -36,6 +23,10 @@ namespace cudaq::opt { #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt +#define DEBUG_TYPE "quantum-mapper" + +using namespace mlir; + namespace { constexpr StringRef mappedWireSetName("mapped_wireset"); @@ -44,9 +35,9 @@ constexpr StringRef mappedWireSetName("mapped_wireset"); // Placement //===----------------------------------------------------------------------===// -void identityPlacement(Placement &placement) { +void identityPlacement(cudaq::Placement &placement) { for (unsigned i = 0, end = placement.getNumVirtualQubits(); i < end; ++i) - placement.map(Placement::VirtualQ(i), Placement::DeviceQ(i)); + placement.map(cudaq::Placement::VirtualQ(i), cudaq::Placement::DeviceQ(i)); } //===----------------------------------------------------------------------===// @@ -57,9 +48,9 @@ void identityPlacement(Placement &placement) { /// about the virtual qubits these 
wires correspond. struct VirtualOp { mlir::Operation *op; - SmallVector qubits; + SmallVector qubits; - VirtualOp(mlir::Operation *op, ArrayRef qubits) + VirtualOp(mlir::Operation *op, ArrayRef qubits) : op(op), qubits(qubits) {} }; @@ -94,13 +85,14 @@ struct VirtualOp { /// measurement mapping until the end, which is required for QIR Base Profile /// programs (see the `allowMeasurementMapping` member variable). class SabreRouter { - using WireMap = DenseMap; - using Swap = std::pair; + using WireMap = DenseMap; + using Swap = std::pair; public: - SabreRouter(const Device &device, WireMap &wireMap, Placement &placement, - unsigned extendedLayerSize, float extendedLayerWeight, - float decayDelta, unsigned roundsDecayReset) + SabreRouter(const cudaq::Device &device, WireMap &wireMap, + cudaq::Placement &placement, unsigned extendedLayerSize, + float extendedLayerWeight, float decayDelta, + unsigned roundsDecayReset) : device(device), wireToVirtualQ(wireMap), placement(placement), extendedLayerSize(extendedLayerSize), extendedLayerWeight(extendedLayerWeight), decayDelta(decayDelta), @@ -130,9 +122,9 @@ class SabreRouter { Swap chooseSwap(); private: - const Device &device; + const cudaq::Device &device; WireMap &wireToVirtualQ; - Placement &placement; + cudaq::Placement &placement; // Parameters const unsigned extendedLayerSize; @@ -145,7 +137,7 @@ class SabreRouter { SmallVector extendedLayer; SmallVector measureLayer; llvm::SmallPtrSet measureLayerSet; - llvm::SmallSet involvedPhy; + llvm::SmallSet involvedPhy; SmallVector phyDecay; SmallVector phyToWire; @@ -181,11 +173,12 @@ void SabreRouter::visitUsers(ResultRange::user_range users, } else { auto wires = quake::getQuantumOperands(user); if (entry->second == wires.size()) { - SmallVector qubits; + SmallVector qubits; for (auto wire : wires) qubits.push_back(wireToVirtualQ[wire]); // Don't process measurements until we're ready - if (allowMeasurementMapping || !user->hasTrait()) { + if (allowMeasurementMapping || + 
!user->hasTrait()) { layer.emplace_back(user, qubits); } else { // Add to measureLayer. Don't add duplicates. @@ -201,13 +194,14 @@ void SabreRouter::visitUsers(ResultRange::user_range users, LogicalResult SabreRouter::mapOperation(VirtualOp &virtOp) { // Take the device qubits from this operation. - SmallVector deviceQubits; + SmallVector deviceQubits; for (auto vr : virtOp.qubits) deviceQubits.push_back(placement.getPhy(vr)); // An operation cannot be mapped if it is not a measurement and uses two // qubits virtual qubit that are no adjacently placed. - if (!virtOp.op->hasTrait() && deviceQubits.size() == 2 && + if (!virtOp.op->hasTrait() && + deviceQubits.size() == 2 && !device.areConnected(deviceQubits[0], deviceQubits[1])) return failure(); @@ -280,7 +274,7 @@ void SabreRouter::selectExtendedLayer() { for (VirtualOp &virtOp : newTmpLayer) // We only add operations that can influence placement to the extended // frontlayer, i.e., quantum operators that use two qubits. - if (!virtOp.op->hasTrait() && + if (!virtOp.op->hasTrait() && quake::getQuantumOperands(virtOp.op).size() == 2) extendedLayer.emplace_back(virtOp); tmpLayer = std::move(newTmpLayer); @@ -382,10 +376,11 @@ void SabreRouter::route(Block &block, ArrayRef sources) { OpBuilder builder(&block, block.begin()); auto wireType = builder.getType(); - auto addSwap = [&](Placement::DeviceQ q0, Placement::DeviceQ q1) { + auto addSwap = [&](cudaq::Placement::DeviceQ q0, + cudaq::Placement::DeviceQ q1) { placement.swap(q0, q1); - auto swap = builder.create( - builder.getUnknownLoc(), TypeRange{wireType, wireType}, false, + auto swap = quake::SwapOp::create( + builder, builder.getUnknownLoc(), TypeRange{wireType, wireType}, false, ValueRange{}, ValueRange{}, ValueRange{phyToWire[q0.index], phyToWire[q1.index]}, DenseBoolArrayAttr{}); @@ -434,7 +429,7 @@ void SabreRouter::route(Block &block, ArrayRef sources) { LLVM_DEBUG(logger.startLine() << '\n' << logLineComment << '\n';); } -std::pair> +std::pair> 
deviceFromString(llvm::StringRef deviceString) { std::size_t deviceDim[2]; deviceDim[0] = deviceDim[1] = 0; @@ -476,7 +471,7 @@ deviceFromString(llvm::StringRef deviceString) { return std::make_pair(false, std::nullopt); } - return std::make_pair(false, Device::file(deviceFilename)); + return std::make_pair(false, cudaq::Device::file(deviceFilename)); } else { if (deviceString.consume_front("(")) { deviceString = deviceString.ltrim(); @@ -505,13 +500,15 @@ deviceFromString(llvm::StringRef deviceString) { } if (deviceTopoStr == "path") { - return std::make_pair(false, Device::path(deviceDim[0])); + return std::make_pair(false, cudaq::Device::path(deviceDim[0])); } else if (deviceTopoStr == "ring") { - return std::make_pair(false, Device::ring(deviceDim[0])); + return std::make_pair(false, cudaq::Device::ring(deviceDim[0])); } else if (deviceTopoStr == "star") { - return std::make_pair(false, Device::star(deviceDim[0], deviceDim[1])); + return std::make_pair(false, + cudaq::Device::star(deviceDim[0], deviceDim[1])); } else if (deviceTopoStr == "grid") { - return std::make_pair(false, Device::grid(deviceDim[0], deviceDim[1])); + return std::make_pair(false, + cudaq::Device::grid(deviceDim[0], deviceDim[1])); } else if (deviceTopoStr == "bypass") { return std::make_pair(true, std::nullopt); } else { @@ -528,7 +525,7 @@ deviceFromString(llvm::StringRef deviceString) { struct MappingPrep : public cudaq::opt::impl::MappingPrepBase { using MappingPrepBase::MappingPrepBase; - std::optional deviceInstance; + std::optional deviceInstance; bool deviceBypass = false; virtual LogicalResult initialize(MLIRContext *context) override { @@ -542,13 +539,14 @@ struct MappingPrep : public cudaq::opt::impl::MappingPrepBase { } /// Create an adjacency matrix attribute for a WireSetOp. 
- SparseElementsAttr getAdjacencyFromDevice(Device &d, MLIRContext *ctx) { + SparseElementsAttr getAdjacencyFromDevice(cudaq::Device &d, + MLIRContext *ctx) { int numEdges = 0; unsigned int qubitCardinality = static_cast(d.getNumQubits()); SmallVector edgeVector; for (unsigned int i = 0; i < qubitCardinality; i++) { - auto neighbors = d.getNeighbours(Device::Qubit(i)); + auto neighbors = d.getNeighbours(cudaq::Device::Qubit(i)); numEdges += neighbors.size(); for (auto neighbor : neighbors) { edgeVector.emplace_back(64, i); @@ -570,15 +568,15 @@ struct MappingPrep : public cudaq::opt::impl::MappingPrepBase { return sparseInt; } - quake::WireSetOp insertWireSetOpForDevice(Device &d, ModuleOp mod) { + quake::WireSetOp insertWireSetOpForDevice(cudaq::Device &d, ModuleOp mod) { if (auto wires = mod.lookupSymbol(mappedWireSetName)) return wires; auto adjacency = getAdjacencyFromDevice(d, mod.getContext()); OpBuilder builder(mod.getBodyRegion()); - auto wireSetOp = builder.create( - builder.getUnknownLoc(), mappedWireSetName, d.getNumQubits(), - adjacency); + auto wireSetOp = quake::WireSetOp::create(builder, builder.getUnknownLoc(), + mappedWireSetName, + d.getNumQubits(), adjacency); wireSetOp.setPrivate(); return wireSetOp; } @@ -597,7 +595,7 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { using MappingFuncBase::MappingFuncBase; bool deviceBypass = false; - std::optional deviceInstance; + std::optional deviceInstance; virtual LogicalResult initialize(MLIRContext *context) override { std::tie(deviceBypass, deviceInstance) = deviceFromString(device); @@ -705,7 +703,7 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { SmallVector sources(deviceNumQubits); SmallVector returnsToRemove; - DenseMap wireToVirtualQ; + DenseMap wireToVirtualQ; SmallVector userQubitsMeasured; DenseMap finalQubitWire; Operation *lastSource = nullptr; @@ -713,7 +711,7 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { if (auto qop = dyn_cast(op)) { 
// Assign a new virtual qubit to the resulting wire. auto id = qop.getIdentity(); - wireToVirtualQ[qop.getResult()] = Placement::VirtualQ(id); + wireToVirtualQ[qop.getResult()] = cudaq::Placement::VirtualQ(id); finalQubitWire[id] = qop.getResult(); sources[id] = qop; lastSource = &op; @@ -760,7 +758,7 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { // Get the wire operands and check if the operators uses at most two // qubits. N.B: Measurements do not have this restriction. auto wireOperands = quake::getQuantumOperands(&op); - if (!op.hasTrait() && wireOperands.size() > 2) { + if (!op.hasTrait() && wireOperands.size() > 2) { if (nonComposable) { func.emitError("Cannot map a kernel with operators that use more " "than two qubits."); @@ -820,11 +818,11 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { Type resTy = builder.getI1Type(); for (unsigned i = 0; i < sources.size(); i++) { if (sources[i] != nullptr) { - auto measureOp = builder.create( - finalQubitWire[i].getLoc(), TypeRange{measTy, wireTy}, - finalQubitWire[i]); - builder.create(finalQubitWire[i].getLoc(), - resTy, measureOp.getMeasOut()); + auto measureOp = + quake::MzOp::create(builder, finalQubitWire[i].getLoc(), + TypeRange{measTy, wireTy}, finalQubitWire[i]); + quake::DiscriminateOp::create(builder, finalQubitWire[i].getLoc(), + resTy, measureOp.getMeasOut()); wireToVirtualQ.insert( {measureOp.getWires()[0], wireToVirtualQ[finalQubitWire[i]]}); @@ -848,15 +846,15 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { builder.setInsertionPointAfter(lastSource); for (unsigned i = 0; i < deviceInstance->getNumQubits(); i++) { if (!sources[i]) { - auto borrowOp = builder.create( - unknownLoc, wireTy, mappedWireSetName, i); - wireToVirtualQ[borrowOp.getResult()] = Placement::VirtualQ(i); + auto borrowOp = quake::BorrowWireOp::create(builder, unknownLoc, wireTy, + mappedWireSetName, i); + wireToVirtualQ[borrowOp.getResult()] = cudaq::Placement::VirtualQ(i); 
sources[i] = borrowOp; } } // Place - Placement placement(sources.size(), deviceInstance->getNumQubits()); + cudaq::Placement placement(sources.size(), deviceInstance->getNumQubits()); identityPlacement(placement); // Route @@ -881,13 +879,13 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { // unsigned highestMappedQubit = 0; builder.setInsertionPoint(block.getTerminator()); auto phyToWire = router.getPhyToWire(); - for (auto &[i, s] : llvm::enumerate(sources)) { + for (const auto &[i, s] : llvm::enumerate(sources)) { if (s->getUsers().empty()) { s->erase(); } else { // highestMappedQubit = i; - builder.create(phyToWire[i].getLoc(), - phyToWire[i]); + quake::ReturnWireOp::create(builder, phyToWire[i].getLoc(), + phyToWire[i]); } } @@ -900,9 +898,9 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { // dataForOriginalQubit[v] = dataFromBackendQubit[mapping_v2p[v]]; llvm::SmallVector attrs(*highestIdentity + 1); for (unsigned int v = 0; v < *highestIdentity + 1; v++) - attrs[v] = - IntegerAttr::get(builder.getIntegerType(64), - placement.getPhy(Placement::VirtualQ(v)).index); + attrs[v] = IntegerAttr::get( + builder.getIntegerType(64), + placement.getPhy(cudaq::Placement::VirtualQ(v)).index); func->setAttr("mapping_v2p", builder.getArrayAttr(attrs)); @@ -919,7 +917,7 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { measuredQubits.reserve(userQubitsMeasured.size()); for (auto mq : userQubitsMeasured) { measuredQubits.emplace_back( - mq, placement.getPhy(Placement::VirtualQ(mq)).index); + mq, placement.getPhy(cudaq::Placement::VirtualQ(mq)).index); } // First sort the pairs according to the physical qubits. llvm::sort(measuredQubits, diff --git a/lib/Optimizer/Transforms/MemToReg.cpp b/lib/Optimizer/Transforms/MemToReg.cpp index 0cece166036..0bf0d7593ab 100644 --- a/lib/Optimizer/Transforms/MemToReg.cpp +++ b/lib/Optimizer/Transforms/MemToReg.cpp @@ -17,7 +17,6 @@ /// load/store form (QLS), is required and performed. 
#include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "llvm/ADT/MapVector.h" @@ -211,16 +210,21 @@ class RegionDataFlow { // Stitch together the control-flow across op's regions. if (auto regionOp = dyn_cast(op)) { SmallVector successors; - regionOp.getSuccessorRegions(std::nullopt, {}, successors); + regionOp.getSuccessorRegions(RegionBranchPoint::parent(), successors); for (auto iter : successors) - if (iter.getSuccessor()) + if (iter.getSuccessor() && !iter.getSuccessor()->empty()) entryCFG.insert(&iter.getSuccessor()->front()); for (auto ®ion : op->getRegions()) { + if (region.empty()) + continue; SmallVector regionExitBlocks; for (auto &b : region) if (b.hasNoSuccessors()) regionExitBlocks.push_back(&b); - regionOp.getSuccessorRegions(region.getRegionNumber(), {}, successors); + auto *terminator = region.back().getTerminator(); + if (auto terminatorOp = + dyn_cast(terminator)) + regionOp.getSuccessorRegions(terminatorOp, successors); // Every region has exactly one entry and one or more exits. 
for (auto *b : regionExitBlocks) for (auto iter : successors) { @@ -315,9 +319,9 @@ class RegionDataFlow { SSAReg reloadMemoryReference(OpBuilder &builder, MemRef mr) { if (isa(mr.getType())) { auto wireTy = quake::WireType::get(builder.getContext()); - return builder.create(mr.getLoc(), wireTy, mr); + return quake::UnwrapOp::create(builder, mr.getLoc(), wireTy, mr); } - return builder.create(mr.getLoc(), mr); + return cudaq::cc::LoadOp::create(builder, mr.getLoc(), mr); } SSAReg unsafeAddLiveInToBlock(Block *block, MemRef mr) { @@ -550,9 +554,9 @@ class ResetOpPattern : public OpRewritePattern { auto wireTy = quake::WireType::get(rewriter.getContext()); auto opnd = op.getTargets(); assert(opnd.getType() == quake::RefType::get(rewriter.getContext())); - Value target = rewriter.create(loc, wireTy, opnd); + Value target = quake::UnwrapOp::create(rewriter, loc, wireTy, opnd); auto newOp = - rewriter.create(loc, TypeRange{wireTy}, target); + quake::ResetOp::create(rewriter, loc, TypeRange{wireTy}, target); rewriter.replaceOpWithNewOp(op, newOp.getResult(0), opnd); return success(); } @@ -568,7 +572,7 @@ class DeallocOpPattern : public OpRewritePattern { auto wireTy = quake::WireType::get(rewriter.getContext()); auto opnd = op.getReference(); assert(isa(opnd.getType())); - Value target = rewriter.create(loc, wireTy, opnd); + Value target = quake::UnwrapOp::create(rewriter, loc, wireTy, opnd); rewriter.replaceOpWithNewOp(op, target); return success(); } @@ -594,7 +598,7 @@ class Wrapper : public OpRewritePattern { for (auto opnd : op.getControls()) { auto opndTy = opnd.getType(); if (opndTy == qrefTy) { - auto unwrap = rewriter.create(loc, wireTy, opnd); + auto unwrap = quake::UnwrapOp::create(rewriter, loc, wireTy, opnd); unwrapCtrls.push_back(unwrap); } else { unwrapCtrls.push_back(opnd); @@ -605,7 +609,7 @@ class Wrapper : public OpRewritePattern { for (auto opnd : op.getTargets()) { auto opndTy = opnd.getType(); if (opndTy == qrefTy) { - auto unwrap = 
rewriter.create(loc, wireTy, opnd); + auto unwrap = quake::UnwrapOp::create(rewriter, loc, wireTy, opnd); unwrapTargs.push_back(unwrap); } else { unwrapTargs.push_back(opnd); @@ -619,8 +623,8 @@ class Wrapper : public OpRewritePattern { auto opndTy = i.value().getType(); auto offset = i.index() + addend; if (opndTy == qrefTy) { - rewriter.create(loc, newOp.getResult(offset), - i.value()); + quake::WrapOp::create(rewriter, loc, newOp.getResult(offset), + i.value()); } else if (opndTy == wireTy) { op.getResult(count++).replaceAllUsesWith(newOp.getResult(offset)); } @@ -633,8 +637,8 @@ class Wrapper : public OpRewritePattern { SmallVector newTy = {op.getMeasOut().getType()}; SmallVector wireTys(unwrapTargs.size(), wireTy); newTy.append(wireTys.begin(), wireTys.end()); - auto newOp = rewriter.create(loc, newTy, unwrapTargs, - op.getRegisterNameAttr()); + auto newOp = OP::create(rewriter, loc, newTy, unwrapTargs, + op.getRegisterNameAttr()); SmallVector wireOperands = op.getTargets(); op.getResult(0).replaceAllUsesWith(newOp.getResult(0)); threadWires(wireOperands, newOp, 1); @@ -644,9 +648,9 @@ class Wrapper : public OpRewritePattern { // propagated to wrap operations. 
auto numberOfWires = wireCount(unwrapCtrls, unwrapTargs); SmallVector wireTys{numberOfWires, wireTy}; - auto newOp = rewriter.create( - loc, wireTys, op.getIsAdjAttr(), op.getParameters(), unwrapCtrls, - unwrapTargs, op.getNegatedQubitControlsAttr()); + auto newOp = OP::create(rewriter, loc, wireTys, op.getIsAdjAttr(), + op.getParameters(), unwrapCtrls, unwrapTargs, + op.getNegatedQubitControlsAttr()); auto wireOperands = filteredByType(qrefTy, op.getControls(), op.getTargets()); threadWires(wireOperands, newOp, 0); @@ -726,8 +730,12 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { op->erase(); } for (auto wrap : wrapOps) { - auto ref = wrap.getRefValue(); - auto wire = wrap.getWireValue(); + // In LLVM 22, the typed accessors (getRefValue/getWireValue) perform + // llvm::cast> which crashes on null operands. After + // erasing other ops above (with dropAllUses), WrapOp operands may be + // null. Use raw getOperand() to safely check for null. + Value ref = wrap->getOperand(1); // ref_value is operand 1 + Value wire = wrap->getOperand(0); // wire_value is operand 0 if (!ref || !wire.hasOneUse()) { LLVM_DEBUG(llvm::dbgs() << "erasing: "; wrap->dump(); llvm::dbgs() << '\n'); @@ -771,7 +779,7 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { elseRegion.push_back(block); OpBuilder builder(ctx); builder.setInsertionPointToEnd(block); - builder.create(ifOp.getLoc()); + cudaq::cc::ContinueOp::create(builder, ifOp.getLoc()); } } @@ -799,7 +807,7 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { OpBuilder builder(ctx); builder.setInsertionPointToStart(block); Value v = - builder.create(arg.getLoc(), wireTy, arg); + quake::UnwrapOp::create(builder, arg.getLoc(), wireTy, arg); dataFlow.addBinding(block, arg, v); } } @@ -823,7 +831,7 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { if (!dataFlow.hasBinding(block, alloc)) { OpBuilder builder(alloc); Value v = - builder.create(alloc.getLoc(), wireTy); + 
quake::NullWireOp::create(builder, alloc.getLoc(), wireTy); cleanUps.insert(alloc); dataFlow.addBinding(block, alloc, v); } @@ -838,14 +846,14 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { for (auto v : op->getOperands()) if (v.getType() == qrefTy && dataFlow.hasBinding(block, v)) if (auto vBinding = dataFlow.getBinding(block, v)) { - builder.create(op->getLoc(), vBinding, v); + quake::WrapOp::create(builder, op->getLoc(), vBinding, v); dataFlow.cancelBinding(block, v); } builder.setInsertionPointAfter(op); for (auto r : op->getResults()) if (r.getType() == qrefTy) { Value v = - builder.create(op->getLoc(), wireTy, r); + quake::UnwrapOp::create(builder, op->getLoc(), wireTy, r); dataFlow.addBinding(block, r, v); } } @@ -858,8 +866,8 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { if (memAnalysis.isMember(alloc)) { if (classicalValues && !dataFlow.hasBinding(block, alloc)) { OpBuilder builder(alloc); - Value v = builder.create( - alloc.getLoc(), alloc.getElementType()); + Value v = cudaq::cc::UndefOp::create(builder, alloc.getLoc(), + alloc.getElementType()); cleanUps.insert(alloc); dataFlow.addBinding(block, alloc, v); } @@ -981,7 +989,7 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { if ((v.getType() == qrefTy) && dataFlow.hasBinding(block, v)) if (auto vBinding = dataFlow.getBinding(block, v)) { OpBuilder builder(op); - builder.create(op->getLoc(), vBinding, v); + quake::WrapOp::create(builder, op->getLoc(), vBinding, v); dataFlow.cancelBinding(block, v); } @@ -1051,9 +1059,9 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { auto oldVal = dataFlow.getBinding(block, liveOut); if (!oldVal) { OpBuilder builder(term); - oldVal = builder.create( - term->getLoc(), quake::WireType::get(builder.getContext()), - liveOut); + oldVal = quake::UnwrapOp::create( + builder, term->getLoc(), + quake::WireType::get(builder.getContext()), liveOut); } addTerminatorArgument(term, target, oldVal); } else if ((usePromo || 
@@ -1106,14 +1114,15 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { SmallVector resultTypes(parent->getResultTypes()); for (auto d : allDefs) resultTypes.push_back(dereferencedType(d.getType())); - ConversionPatternRewriter builder(ctx); + IRRewriter builder(ctx); builder.setInsertionPoint(parent); SmallVector operands(parent->getOperands()); operands.insert(operands.end(), dataFlow.getLiveInArgs().begin(), dataFlow.getLiveInArgs().end()); Operation *np = Operation::create( parent->getLoc(), parent->getName(), resultTypes, operands, - parent->getAttrs(), parent->getSuccessors(), parent->getNumRegions()); + parent->getAttrs(), OpaqueProperties{nullptr}, + parent->getSuccessors(), parent->getNumRegions()); builder.insert(np); for (unsigned i = 0; i < parent->getNumRegions(); ++i) builder.inlineRegionBefore(parent->getRegion(i), np->getRegion(i), @@ -1124,11 +1133,11 @@ class MemToRegPass : public cudaq::opt::impl::MemToRegBase { for (auto iter : llvm::enumerate(allDefs)) { auto i = iter.index() + parent->getNumResults(); if (np->getResult(i).getType() == wireTy) - builder.create(np->getLoc(), np->getResult(i), - iter.value()); + quake::WrapOp::create(builder, np->getLoc(), np->getResult(i), + iter.value()); else - builder.create(np->getLoc(), np->getResult(i), - iter.value()); + cudaq::cc::StoreOp::create(builder, np->getLoc(), np->getResult(i), + iter.value()); } cleanUps.insert(parent); parent = np; diff --git a/lib/Optimizer/Transforms/MultiControlDecomposition.cpp b/lib/Optimizer/Transforms/MultiControlDecomposition.cpp index c05753a7800..d1b586b4c8e 100644 --- a/lib/Optimizer/Transforms/MultiControlDecomposition.cpp +++ b/lib/Optimizer/Transforms/MultiControlDecomposition.cpp @@ -7,23 +7,17 @@ ******************************************************************************/ #include "DecompositionPatterns.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeInterfaces.h" 
-#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -using namespace mlir; -using namespace cudaq; - -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// - namespace cudaq::opt { #define GEN_PASS_DEF_MULTICONTROLDECOMPOSITION #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt +using namespace mlir; + //===----------------------------------------------------------------------===// // Helpers //===----------------------------------------------------------------------===// @@ -35,10 +29,10 @@ static Operation *createOperator(Location loc, StringRef name, SmallVector operands(parameters); operands.append(controls.begin(), controls.end()); operands.append(targets.begin(), targets.end()); - auto segmentSizes = - builder.getDenseI32ArrayAttr({static_cast(parameters.size()), - static_cast(controls.size()), - static_cast(targets.size())}); + auto segmentSizes = builder.getDenseI32ArrayAttr( + {static_cast(parameters.size()), + static_cast(controls.size()), + static_cast(targets.size())}); auto op = builder.create(loc, nameAttr, operands); op->setAttr("operand_segment_sizes", segmentSizes); return op; @@ -87,7 +81,7 @@ Decomposer::extractControls(quake::OperatorInterface op, size = veq.getSize(); for (size_t i = 0; i < size; ++i) newControls.push_back( - builder.create(op.getLoc(), control, i)); + quake::ExtractRefOp::create(builder, op.getLoc(), control, i)); } if (negControls) negatedControls.append(size, (*negControls)[index]); @@ -100,7 +94,7 @@ ArrayRef Decomposer::getAncillas(Location loc, std::size_t numAncillas) { builder.setInsertionPointToStart(entryBlock); // If we don't have enough ancillas, allocate some more. 
for (size_t i = allocatedAncillas.size(); i < numAncillas; ++i) - allocatedAncillas.push_back(builder.create(loc)); + allocatedAncillas.push_back(quake::AllocaOp::create(builder, loc)); return {allocatedAncillas.begin(), allocatedAncillas.begin() + numAncillas}; } @@ -137,14 +131,14 @@ LogicalResult Decomposer::v_decomposition(quake::OperatorInterface op) { // Compute intermediate results SmallVector toCleanup; std::array cs = {controls[0], controls[1]}; - toCleanup.push_back(builder.create(loc, cs, ancillas[0])); + toCleanup.push_back(quake::XOp::create(builder, loc, cs, ancillas[0])); if (!negatedControls.empty() && (negatedControls[0] || negatedControls[1])) toCleanup.back()->setAttr("negated_qubit_controls", builder.getDenseBoolArrayAttr( {negatedControls[0], negatedControls[1]})); for (std::size_t c = 2, a = 0, n = requiredAncillas + 1; c < n; ++c, ++a) { cs = {controls[c], ancillas[a]}; - toCleanup.push_back(builder.create(loc, cs, ancillas[a + 1])); + toCleanup.push_back(quake::XOp::create(builder, loc, cs, ancillas[a + 1])); if (!negatedControls.empty() && negatedControls[c]) toCleanup.back()->setAttr("negated_qubit_controls", builder.getDenseBoolArrayAttr({true, false})); @@ -174,7 +168,7 @@ LogicalResult Decomposer::v_decomposition(quake::OperatorInterface op) { //===----------------------------------------------------------------------===// namespace { struct Decomposition - : public opt::impl::MultiControlDecompositionBase { + : public cudaq::opt::impl::MultiControlDecompositionBase { using MultiControlDecompositionBase::MultiControlDecompositionBase; void runOnOperation() override { @@ -194,5 +188,4 @@ struct Decomposition }); } }; - } // namespace diff --git a/lib/Optimizer/Transforms/ObserveAnsatz.cpp b/lib/Optimizer/Transforms/ObserveAnsatz.cpp index 184f9e91984..623ba6b6eae 100644 --- a/lib/Optimizer/Transforms/ObserveAnsatz.cpp +++ b/lib/Optimizer/Transforms/ObserveAnsatz.cpp @@ -6,7 +6,7 @@ * the terms of the Apache License 2.0 which accompanies 
this distribution. * ******************************************************************************/ -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Transforms/Passes.h" namespace cudaq::opt { @@ -29,31 +29,32 @@ void appendMeasurement(MeasureBasis &basis, OpBuilder &builder, Location &loc, // Value semantics auto wireTy = quake::WireType::get(builder.getContext()); if (basis == MeasureBasis::X) { - auto newOp = builder.create( - loc, TypeRange{wireTy}, /*is_adj=*/false, ValueRange{}, ValueRange{}, - targets, DenseBoolArrayAttr{}); + auto newOp = quake::HOp::create( + builder, loc, TypeRange{wireTy}, /*is_adj=*/false, ValueRange{}, + ValueRange{}, targets, DenseBoolArrayAttr{}); qubit.replaceAllUsesExcept(newOp.getResult(0), newOp); qubit = newOp.getResult(0); } else if (basis == MeasureBasis::Y) { llvm::APFloat d(M_PI_2); Value rotation = - builder.create(loc, d, builder.getF64Type()); - auto newOp = builder.create( - loc, TypeRange{wireTy}, /*is_adj=*/false, ValueRange{rotation}, - ValueRange{}, ValueRange{qubit}, DenseBoolArrayAttr{}); + arith::ConstantFloatOp::create(builder, loc, builder.getF64Type(), d); + auto newOp = + quake::RxOp::create(builder, loc, TypeRange{wireTy}, /*is_adj=*/false, + ValueRange{rotation}, ValueRange{}, + ValueRange{qubit}, DenseBoolArrayAttr{}); qubit.replaceAllUsesExcept(newOp.getResult(0), newOp); qubit = newOp.getResult(0); } } else { // Reference semantics if (basis == MeasureBasis::X) { - builder.create(loc, ValueRange{}, targets); + quake::HOp::create(builder, loc, ValueRange{}, targets); } else if (basis == MeasureBasis::Y) { llvm::APFloat d(M_PI_2); Value rotation = - builder.create(loc, d, builder.getF64Type()); + arith::ConstantFloatOp::create(builder, loc, builder.getF64Type(), d); SmallVector params{rotation}; - builder.create(loc, params, ValueRange{}, targets); + quake::RxOp::create(builder, loc, params, ValueRange{}, targets); } } } @@ -304,7 +305,7 @@ class 
ObserveAnsatzPass auto veqOp = seekIndexed->second.first; auto index = seekIndexed->second.second; auto extractRef = - builder.create(loc, veqOp, index); + quake::ExtractRefOp::create(builder, loc, veqOp, index); qubitVal = extractRef.getResult(); } else { qubitVal = seek->second; @@ -321,19 +322,19 @@ class ObserveAnsatzPass auto measTy = quake::MeasureType::get(builder.getContext()); auto wireTy = quake::WireType::get(builder.getContext()); - for (auto &[measureNum, qubitToMeasure] : + for (const auto &[measureNum, qubitToMeasure] : llvm::enumerate(qubitsToMeasure)) { // add the measure char regName[16]; std::snprintf(regName, sizeof(regName), "r%05lu", measureNum); if (quake::isLinearType(qubitToMeasure.getType())) { - auto newOp = builder.create( - loc, TypeRange{measTy, wireTy}, ValueRange{qubitToMeasure}, + auto newOp = quake::MzOp::create( + builder, loc, TypeRange{measTy, wireTy}, ValueRange{qubitToMeasure}, builder.getStringAttr(regName)); qubitToMeasure.replaceAllUsesExcept(newOp.getResult(1), newOp); } else { - builder.create(loc, measTy, qubitToMeasure, - builder.getStringAttr(regName)); + quake::MzOp::create(builder, loc, measTy, qubitToMeasure, + builder.getStringAttr(regName)); } } diff --git a/lib/Optimizer/Transforms/PassDetails.h b/lib/Optimizer/Transforms/PassDetails.h index 5927f6b04e3..1246351fa19 100644 --- a/lib/Optimizer/Transforms/PassDetails.h +++ b/lib/Optimizer/Transforms/PassDetails.h @@ -9,22 +9,19 @@ #pragma once #include "cudaq/Optimizer/Dialect/CC/CCDialect.h" +#include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Math/IR/Math.h" 
#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassRegistry.h" -namespace cudaq::opt { - -#define GEN_PASS_CLASSES -#include "cudaq/Optimizer/Transforms/Passes.h.inc" - -} // namespace cudaq::opt - #define GATE_OPS(MACRO) \ MACRO(XOp), MACRO(YOp), MACRO(ZOp), MACRO(HOp), MACRO(SOp), MACRO(TOp), \ MACRO(SwapOp), MACRO(R1Op), MACRO(RxOp), MACRO(PhasedRxOp), MACRO(RyOp), \ diff --git a/lib/Optimizer/Transforms/PhaseFolding.cpp b/lib/Optimizer/Transforms/PhaseFolding.cpp index 959b17d910b..f2a07aba2cc 100644 --- a/lib/Optimizer/Transforms/PhaseFolding.cpp +++ b/lib/Optimizer/Transforms/PhaseFolding.cpp @@ -7,8 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -528,7 +526,7 @@ class PhaseStorage { auto rot_arg2 = rzop.getOperand(0); auto builder = OpBuilder(rzop); auto new_rot_arg = - builder.create(rzop.getLoc(), rot_arg1, rot_arg2); + arith::AddFOp::create(builder, rzop.getLoc(), rot_arg1, rot_arg2); rzop->setOperand(0, new_rot_arg.getResult()); old_rzop.erase(); rotations[prev_idx] = rzop; diff --git a/lib/Optimizer/Transforms/Pipelines.cpp b/lib/Optimizer/Transforms/Pipelines.cpp index b52da3e3474..c7f019d9d3f 100644 --- a/lib/Optimizer/Transforms/Pipelines.cpp +++ b/lib/Optimizer/Transforms/Pipelines.cpp @@ -124,8 +124,8 @@ void cudaq::opt::addDecomposition(OpPassManager &pm, // NB: Both of these ListOption *must* be set here or they may contain garbage // and the compiler may crash. 
cudaq::opt::DecompositionOptions opts; - opts.disabledPatterns = disabledPats; - opts.enabledPatterns = enabledPats; + opts.disabledPatterns.assign(disabledPats.begin(), disabledPats.end()); + opts.enabledPatterns.assign(enabledPats.begin(), enabledPats.end()); pm.addPass(cudaq::opt::createDecomposition(opts)); } diff --git a/lib/Optimizer/Transforms/PruneCtrlRelations.cpp b/lib/Optimizer/Transforms/PruneCtrlRelations.cpp index 57324593a9c..b305456429b 100644 --- a/lib/Optimizer/Transforms/PruneCtrlRelations.cpp +++ b/lib/Optimizer/Transforms/PruneCtrlRelations.cpp @@ -60,7 +60,7 @@ class MakeControl : public OpRewritePattern { if (auto fromCtrl = cv.template getDefiningOp()) { input = fromCtrl.getCtrlbit(); } else { - input = rewriter.template create(loc, ctrlTy, cv); + input = quake::ToControlOp::create(rewriter, loc, ctrlTy, cv); } newCtrls.push_back(input); coarity--; @@ -72,9 +72,9 @@ class MakeControl : public OpRewritePattern { // Create a copy of `op` with the correct coarity and with the control wires // each now passing through a ToControlOp. SmallVector wireTys{coarity, wireTy}; - auto newOp = rewriter.create( - loc, wireTys, op.getIsAdjAttr(), op.getParameters(), newCtrls, - op.getTargets(), op.getNegatedQubitControlsAttr()); + auto newOp = OP::create(rewriter, loc, wireTys, op.getIsAdjAttr(), + op.getParameters(), newCtrls, op.getTargets(), + op.getNegatedQubitControlsAttr()); // Loop over the original controls again, this time adding a FromControlOp // so that the IR will type check when we replace the old op. 
@@ -82,8 +82,8 @@ class MakeControl : public OpRewritePattern { for (auto i : llvm::enumerate(op.getControls())) { auto cv = i.value(); if (cv.getType() == wireTy) { - Value fromCtrl = rewriter.template create( - loc, wireTy, newCtrls[i.index()]); + Value fromCtrl = quake::FromControlOp::create(rewriter, loc, wireTy, + newCtrls[i.index()]); op.getResult(i.index()).replaceAllUsesWith(fromCtrl); } else { op.getResult(i.index()).replaceAllUsesWith(newOp.getResult(newIdx++)); @@ -134,8 +134,8 @@ class PruneCtrlRelationsPass auto func = getOperation(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) { + if (failed( + applyPatternsGreedily(func.getOperation(), std::move(patterns)))) { signalPassFailure(); } } diff --git a/lib/Optimizer/Transforms/PySynthCallableBlockArgs.cpp b/lib/Optimizer/Transforms/PySynthCallableBlockArgs.cpp index 3b956e96eff..a81e787af61 100644 --- a/lib/Optimizer/Transforms/PySynthCallableBlockArgs.cpp +++ b/lib/Optimizer/Transforms/PySynthCallableBlockArgs.cpp @@ -8,15 +8,17 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Runtime.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +namespace cudaq::opt { +#define GEN_PASS_DEF_PYSYNTHCALLABLEBLOCKARGS +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt + using namespace mlir; namespace { @@ -126,13 +128,14 @@ class UpdateQuakeApplyOp : public OpConversionPattern { }; class PySynthCallableBlockArgs - : public cudaq::opt::PySynthCallableBlockArgsBase< + : public cudaq::opt::impl::PySynthCallableBlockArgsBase< PySynthCallableBlockArgs> { private: bool removeBlockArg = false; public: 
SmallVector names; + PySynthCallableBlockArgs() = default; PySynthCallableBlockArgs(const SmallVector &_names, bool remove) : removeBlockArg(remove), names(_names) {} @@ -191,7 +194,7 @@ class PySynthCallableBlockArgs if (isa(op.getArgument(argIndex).getType())) argsToErase.set(argIndex); - op.eraseArguments(argsToErase); + (void)op.eraseArguments(argsToErase); } } }; diff --git a/lib/Optimizer/Transforms/QuakePropagateMetadata.cpp b/lib/Optimizer/Transforms/QuakePropagateMetadata.cpp index a6db45dd7a8..2fca2ec772f 100644 --- a/lib/Optimizer/Transforms/QuakePropagateMetadata.cpp +++ b/lib/Optimizer/Transforms/QuakePropagateMetadata.cpp @@ -7,7 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Support/Debug.h" #include "mlir/Analysis/CallGraph.h" @@ -92,8 +91,8 @@ class QuakePropagateMetadataPass for (auto caller : callers) { LLVM_DEBUG(llvm::dbgs() << " Caller: " << caller.getName() << "\n\n"); - if (auto boolAttr = callee->getAttr("qubitMeasurementFeedback") - .dyn_cast_or_null()) { + if (auto boolAttr = dyn_cast_if_present( + callee->getAttr("qubitMeasurementFeedback"))) { if (boolAttr.getValue()) { LLVM_DEBUG(llvm::dbgs() << " Propagating qubitMeasurementFeedback attr: " diff --git a/lib/Optimizer/Transforms/QuakeSimplify.cpp b/lib/Optimizer/Transforms/QuakeSimplify.cpp index fcb46b1ab4f..57292f147db 100644 --- a/lib/Optimizer/Transforms/QuakeSimplify.cpp +++ b/lib/Optimizer/Transforms/QuakeSimplify.cpp @@ -7,7 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -278,10 +277,10 @@ class RotationCombine : public OpRewritePattern { return 
failure(); } if (qop.isAdj()) - p = rewriter.create(loc, ty, p); + p = arith::NegFOp::create(rewriter, loc, ty, p); if (prev.isAdj()) - pp = rewriter.create(loc, ty, pp); - newParams.push_back(rewriter.create(loc, ty, p, pp)); + pp = arith::NegFOp::create(rewriter, loc, ty, pp); + newParams.push_back(arith::AddFOp::create(rewriter, loc, ty, p, pp)); } // Combine the two rotations. @@ -551,7 +550,7 @@ class QuakeSimplifyPass RotationCombine, RotationCombine, RotationCombine, RotationCombine, RotationCombine>(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + if (failed(applyPatternsGreedily(op, std::move(patterns)))) signalPassFailure(); } }; diff --git a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp index 605e31e7511..8bf71b2db81 100644 --- a/lib/Optimizer/Transforms/QuakeSynthesizer.cpp +++ b/lib/Optimizer/Transforms/QuakeSynthesizer.cpp @@ -11,21 +11,21 @@ #include "cudaq/Optimizer/Builder/Runtime.h" #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "cudaq/Optimizer/CodeGen/QIROpaqueStructTypes.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/CC/CCTypes.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "cudaq/Optimizer/Transforms/Passes.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Complex/IR/Complex.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Target/LLVMIR/TypeToLLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/RegionUtils.h" +namespace cudaq::opt { +#define GEN_PASS_DEF_QUAKESYNTHESIZE +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt + #define DEBUG_TYPE "quake-synthesizer" using namespace mlir; @@ -85,14 +85,14 @@ void 
synthesizeRuntimeArgument( template Value makeIntegerElement(OpBuilder &builder, Location argLoc, T val, IntegerType eleTy) { - return builder.create(argLoc, val, eleTy); + return arith::ConstantIntOp::create(builder, argLoc, eleTy, val); } template Value makeFloatElement(OpBuilder &builder, Location argLoc, T val, FloatType eleTy) { - return builder.create(argLoc, llvm::APFloat{val}, - eleTy); + return arith::ConstantFloatOp::create(builder, argLoc, eleTy, + llvm::APFloat{val}); } template @@ -102,7 +102,7 @@ Value makeComplexElement(OpBuilder &builder, Location argLoc, auto realPart = builder.getFloatAttr(eleTy, llvm::APFloat{val.real()}); auto imagPart = builder.getFloatAttr(eleTy, llvm::APFloat{val.imag()}); auto complexVal = builder.getArrayAttr({realPart, imagPart}); - return builder.create(argLoc, eleTy, complexVal); + return complex::ConstantOp::create(builder, argLoc, eleTy, complexVal); } /// returns true if and only if \p argument is used by a `quake.init_state` @@ -128,8 +128,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, auto eleTy = cast(strTy.getElementType()); builder.setInsertionPointToStart(argument.getOwner()); auto argLoc = argument.getLoc(); - auto conArray = builder.create( - argLoc, cudaq::cc::ArrayType::get(ctx, eleTy, vec.size()), arrayAttr); + auto conArray = cudaq::cc::ConstantArrayOp::create( + builder, argLoc, cudaq::cc::ArrayType::get(ctx, eleTy, vec.size()), + arrayAttr); auto arrTy = cudaq::cc::ArrayType::get(ctx, eleTy, vec.size()); std::optional arrayInMemory; auto ptrEleTy = cudaq::cc::PointerType::get(eleTy); @@ -150,17 +151,17 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, irBuilder.genVectorOfConstants(argLoc, module, symbol, vec); builder.setInsertionPointToStart(argument.getOwner()); - buffer = builder.create( - argLoc, cudaq::cc::PointerType::get(arrTy), symbol); + buffer = cudaq::cc::AddressOfOp::create( + builder, argLoc, 
cudaq::cc::PointerType::get(arrTy), symbol); } else { builder.setInsertionPointAfter(conArray); - buffer = builder.create(argLoc, arrTy); - builder.create(argLoc, conArray, buffer); + buffer = cudaq::cc::AllocaOp::create(builder, argLoc, arrTy); + cudaq::cc::StoreOp::create(builder, argLoc, conArray, buffer); } auto ptrArrEleTy = cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy)); - Value res = builder.create(argLoc, ptrArrEleTy, buffer); + Value res = cudaq::cc::CastOp::create(builder, argLoc, ptrArrEleTy, buffer); arrayInMemory = res; return res; }; @@ -182,8 +183,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, // Handle the StdvecSize use case. // Replace a `vec.size()` with the length, which is a synthesized constant. if (auto stdvecSizeOp = dyn_cast(argUser)) { - Value length = builder.create( - argLoc, vec.size(), stdvecSizeOp.getType()); + Value length = arith::ConstantIntOp::create( + builder, argLoc, stdvecSizeOp.getType(), vec.size()); stdvecSizeOp.replaceAllUsesWith(length); continue; } @@ -214,14 +215,15 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, if (index == cudaq::cc::ComputePtrOp::kDynamicIndex) { OpBuilder::InsertionGuard guard(builder); builder.setInsertionPoint(elePtrOp); - Value getEle = builder.create( - elePtrOp.getLoc(), eleTy, conArray, + Value getEle = cudaq::cc::ExtractValueOp::create( + builder, elePtrOp.getLoc(), eleTy, conArray, elePtrOp.getDynamicIndices()[0]); if (failed(replaceLoads(elePtrOp, getEle))) { Value memArr = getArrayInMemory(); builder.setInsertionPoint(elePtrOp); - Value newComputedPtr = builder.create( - argLoc, ptrEleTy, memArr, elePtrOp.getDynamicIndices()[0]); + Value newComputedPtr = cudaq::cc::ComputePtrOp::create( + builder, argLoc, ptrEleTy, memArr, + elePtrOp.getDynamicIndices()[0]); elePtrOp.replaceAllUsesWith(newComputedPtr); } continue; @@ -232,8 +234,8 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, 
unsigned &counter, Value memArr = getArrayInMemory(); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPoint(elePtrOp); - Value newComputedPtr = builder.create( - argLoc, ptrEleTy, memArr, + Value newComputedPtr = cudaq::cc::ComputePtrOp::create( + builder, argLoc, ptrEleTy, memArr, SmallVector{0, index}); elePtrOp.replaceAllUsesWith(newComputedPtr); } @@ -259,9 +261,9 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, Value memArr = getArrayInMemory(); OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointAfter(memArr.getDefiningOp()); - Value size = builder.create(argLoc, vec.size(), 64); + Value size = arith::ConstantIntOp::create(builder, argLoc, vec.size(), 64); Value newVec = - builder.create(argLoc, strTy, memArr, size); + cudaq::cc::StdvecInitOp::create(builder, argLoc, strTy, memArr, size); argument.replaceAllUsesWith(newVec); } return success(); @@ -376,7 +378,7 @@ synthesizeVectorArgument(OpBuilder &builder, ModuleOp module, unsigned &counter, namespace { class QuakeSynthesizer - : public cudaq::opt::QuakeSynthesizeBase { + : public cudaq::opt::impl::QuakeSynthesizeBase { protected: // The name of the kernel to be synthesized std::string kernelName; @@ -472,35 +474,38 @@ class QuakeSynthesizer synthesizeRuntimeArgument( builder, argument, args, offset, sizeof(bool), [=](OpBuilder &builder, bool *concrete) { - return builder.create(loc, *concrete, 1); + return arith::ConstantIntOp::create(builder, loc, *concrete, 1); }); break; case 8: synthesizeRuntimeArgument( builder, argument, args, offset, sizeof(std::uint8_t), [=](OpBuilder &builder, std::uint8_t *concrete) { - return builder.create(loc, *concrete, 8); + return arith::ConstantIntOp::create(builder, loc, *concrete, 8); }); break; case 16: synthesizeRuntimeArgument( builder, argument, args, offset, sizeof(std::int16_t), [=](OpBuilder &builder, std::int16_t *concrete) { - return builder.create(loc, *concrete, 16); + return 
arith::ConstantIntOp::create(builder, loc, *concrete, + 16); }); break; case 32: synthesizeRuntimeArgument( builder, argument, args, offset, sizeof(std::int32_t), [=](OpBuilder &builder, std::int32_t *concrete) { - return builder.create(loc, *concrete, 32); + return arith::ConstantIntOp::create(builder, loc, *concrete, + 32); }); break; case 64: synthesizeRuntimeArgument( builder, argument, args, offset, sizeof(std::int64_t), [=](OpBuilder &builder, std::int64_t *concrete) { - return builder.create(loc, *concrete, 64); + return arith::ConstantIntOp::create(builder, loc, *concrete, + 64); }); break; default: @@ -516,22 +521,24 @@ class QuakeSynthesizer synthesizeRuntimeArgument( builder, argument, args, offset, cudaq::opt::convertBitsToBytes(type.getIntOrFloatBitWidth()), - [=](OpBuilder &builder, float *concrete) { - llvm::APFloat f(*concrete); - return builder.create( - loc, f, builder.getF32Type()); - }); + std::function( + [=](OpBuilder &builder, float *concrete) -> Value { + llvm::APFloat f(*concrete); + return arith::ConstantFloatOp::create( + builder, loc, builder.getF32Type(), f); + })); continue; } if (type == builder.getF64Type()) { synthesizeRuntimeArgument( builder, argument, args, offset, cudaq::opt::convertBitsToBytes(type.getIntOrFloatBitWidth()), - [=](OpBuilder &builder, double *concrete) { - llvm::APFloat f(*concrete); - return builder.create( - loc, f, builder.getF64Type()); - }); + std::function( + [=](OpBuilder &builder, double *concrete) -> Value { + llvm::APFloat f(*concrete); + return arith::ConstantFloatOp::create( + builder, loc, builder.getF64Type(), f); + })); continue; } @@ -544,12 +551,13 @@ class QuakeSynthesizer synthesizeRuntimeArgument( builder, argument, args, offset, sizeof(void *), [=](OpBuilder &builder, cudaq::state **concrete) { - Value rawPtr = builder.create( - loc, reinterpret_cast(*concrete), + Value rawPtr = arith::ConstantIntOp::create( + builder, loc, reinterpret_cast(*concrete), sizeof(void *) * 8); auto stateTy = 
quake::StateType::get(builder.getContext()); - return builder.create( - loc, cudaq::cc::PointerType::get(stateTy), rawPtr); + return cudaq::cc::CastOp::create( + builder, loc, cudaq::cc::PointerType::get(stateTy), + rawPtr); }); continue; } else { @@ -699,30 +707,30 @@ class QuakeSynthesizer // that can be used in, say, a Pauli op. auto ptrTy = cudaq::cc::PointerType::get(charSpanTy); auto loc = arguments[idx].getLoc(); - auto ns = builder.create(loc, numberSpans, 64); - auto aos = builder.create(loc, charSpanTy, ns); + auto ns = arith::ConstantIntOp::create(builder, loc, numberSpans, 64); + auto aos = cudaq::cc::AllocaOp::create(builder, loc, charSpanTy, ns); auto pi8Ty = cudaq::cc::PointerType::get(charSpanTy.getElementType()); cudaq::IRBuilder irBuilder(module); for (decltype(numberSpans) i = 0; i < numberSpans; ++i) { std::size_t length = spanSizes[i]; - auto strLen = builder.create(loc, length, 64); + auto strLen = arith::ConstantIntOp::create(builder, loc, length, 64); StringRef strData{bufferAppendix, length}; auto global = irBuilder.genCStringLiteralAppendNul(loc, module, strData); - auto addr = builder.create( - loc, cudaq::cc::PointerType::get(global.getType()), + auto addr = cudaq::cc::AddressOfOp::create( + builder, loc, cudaq::cc::PointerType::get(global.getType()), global.getName()); - auto str = builder.create(loc, pi8Ty, addr); - auto spanp = builder.create( - loc, ptrTy, aos, + auto str = cudaq::cc::CastOp::create(builder, loc, pi8Ty, addr); + auto spanp = cudaq::cc::ComputePtrOp::create( + builder, loc, ptrTy, aos, ArrayRef{static_cast(i)}); - auto spanData = builder.create( - loc, charSpanTy, str, strLen); - builder.create(loc, spanData, spanp); + auto spanData = cudaq::cc::StdvecInitOp::create( + builder, loc, charSpanTy, str, strLen); + cudaq::cc::StoreOp::create(builder, loc, spanData, spanp); bufferAppendix += length; } auto svTy = cudaq::cc::StdvecType::get(charSpanTy); - auto ics = builder.create(loc, svTy, aos, ns); + auto ics = 
cudaq::cc::StdvecInitOp::create(builder, loc, svTy, aos, ns); arguments[idx].replaceAllUsesWith(ics); continue; } @@ -747,7 +755,11 @@ class QuakeSynthesizer return; } } - funcOp.eraseArguments(argsToErase); + + // FIXME: erasing the arguments like this breaks the semantics of the code + // and is a bad idea in general. This practice is HIGHLY DISCOURAGED. + if (failed(funcOp.eraseArguments(argsToErase))) + funcOp->emitWarning("could not erase arguments"); } }; diff --git a/lib/Optimizer/Transforms/RefToVeqAlloc.cpp b/lib/Optimizer/Transforms/RefToVeqAlloc.cpp index 4c5f3aa153d..fb71b65cb89 100644 --- a/lib/Optimizer/Transforms/RefToVeqAlloc.cpp +++ b/lib/Optimizer/Transforms/RefToVeqAlloc.cpp @@ -7,7 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -32,7 +31,7 @@ struct AllocaPat : public OpRewritePattern { PatternRewriter &rewriter) const override { if (isa(alloc.getType())) return failure(); - Value newAlloc = rewriter.create(alloc.getLoc(), 1u); + Value newAlloc = quake::AllocaOp::create(rewriter, alloc.getLoc(), 1u); rewriter.replaceOpWithNewOp(alloc, newAlloc, 0u); return success(); } @@ -49,7 +48,7 @@ class PromoteRefToVeqAllocPass auto *ctx = &getContext(); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) { + if (failed(applyPatternsGreedily(op, std::move(patterns)))) { op->emitOpError("could not promote allocations"); signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/RegToMem.cpp b/lib/Optimizer/Transforms/RegToMem.cpp index 4d31b57b504..50a0b2eefdf 100644 --- a/lib/Optimizer/Transforms/RegToMem.cpp +++ b/lib/Optimizer/Transforms/RegToMem.cpp @@ -66,8 +66,7 @@ struct RegToMemAnalysis { unsigned getCardinality() const 
{ return cardinality; } std::optional idFromValue(Value v) const { - auto iter = eqClasses.findValue(toOpaque(v)); - if (iter == eqClasses.end()) + if (!eqClasses.contains(toOpaque(v))) return std::nullopt; return setIds.find(eqClasses.getLeaderValue(toOpaque(v)))->second; } @@ -87,10 +86,10 @@ struct RegToMemAnalysis { auto *term = pred->getTerminator(); auto i = successorIndex(term, block); Value u = cast(term).getSuccessorOperands(i)[argNum]; - if (eqClasses.findValue(toOpaque(u)) == eqClasses.end()) - insertToEqClass(u, v); - else + if (eqClasses.contains(toOpaque(u))) eqClasses.unionSets(toOpaque(v), toOpaque(u)); + else + insertToEqClass(u, v); } } } @@ -249,8 +248,8 @@ struct RegToMemAnalysis { } unsigned id = 0; for (auto i = eqClasses.begin(), end = eqClasses.end(); i != end; ++i) - if (i->isLeader()) { - void *leader = const_cast(*eqClasses.findLeader(i)); + if ((*i)->isLeader()) { + void *leader = const_cast(*eqClasses.findLeader(**i)); setIds.insert(std::make_pair(leader, id++)); } } @@ -258,10 +257,11 @@ struct RegToMemAnalysis { // For debugging purposes. 
void dump() const { for (auto i = eqClasses.begin(); i != eqClasses.end(); ++i) { - if (!i->isLeader()) + if (!(*i)->isLeader()) continue; llvm::errs() << "Set {\n"; - for (auto e = eqClasses.member_begin(i); e != eqClasses.member_end(); ++e) + for (auto e = eqClasses.member_begin(**i); e != eqClasses.member_end(); + ++e) llvm::errs() << " " << Value::getFromOpaquePointer(*e) << '\n'; llvm::errs() << "}\n"; } @@ -309,15 +309,16 @@ class CollapseWrappers : public OpRewritePattern { auto args = collect(op.getOperands()); auto nameAttr = op.getRegisterNameAttr(); eraseWrapUsers(op); - auto newOp = rewriter.create( - loc, ArrayRef{op.getMeasOut().getType()}, args, nameAttr); + auto newOp = + OP::create(rewriter, loc, ArrayRef{op.getMeasOut().getType()}, + args, nameAttr); op.getResult(0).replaceAllUsesWith(newOp.getResult(0)); rewriter.eraseOp(op); } else if constexpr (std::is_same_v) { // Reset is a special case. auto targ = findLookupValue(op.getTargets()); eraseWrapUsers(op); - rewriter.create(loc, TypeRange{}, targ); + quake::ResetOp::create(rewriter, loc, TypeRange{}, targ); rewriter.eraseOp(op); } else if constexpr (std::is_same_v) { auto targ = findLookupValue(op.getTarget()); @@ -328,8 +329,8 @@ class CollapseWrappers : public OpRewritePattern { auto ctrls = collect(op.getControls()); auto targs = collect(op.getTargets()); eraseWrapUsers(op); - rewriter.create(loc, op.getIsAdj(), op.getParameters(), ctrls, targs, - op.getNegatedQubitControlsAttr()); + OP::create(rewriter, loc, op.getIsAdj(), op.getParameters(), ctrls, targs, + op.getNegatedQubitControlsAttr()); rewriter.eraseOp(op); } return success(); @@ -381,8 +382,8 @@ struct EraseWiresCondBranch : public OpRewritePattern { newFalseOperands.push_back(v); } rewriter.replaceOpWithNewOp( - branch, branch.getCondition(), newTrueOperands, newFalseOperands, - branch.getTrueDest(), branch.getFalseDest()); + branch, branch.getCondition(), branch.getTrueDest(), newTrueOperands, + branch.getFalseDest(), 
newFalseOperands); return success(); } BlockSet &blocks; @@ -411,8 +412,8 @@ struct EraseWiresIf : public OpRewritePattern { newIfTy.push_back(ty); auto origThenArgs = ifOp.getThenRegion().front().getArguments(); auto origElseArgs = ifOp.getElseRegion().front().getArguments(); - auto newIf = rewriter.create( - ifOp.getLoc(), newIfTy, ifOp.getCondition(), + auto newIf = cudaq::cc::IfOp::create( + rewriter, ifOp.getLoc(), newIfTy, ifOp.getCondition(), [&](OpBuilder &, Location, Region ®ion) { rewriter.inlineRegionBefore(ifOp.getThenRegion(), region, region.end()); @@ -433,8 +434,8 @@ struct EraseWiresIf : public OpRewritePattern { for (auto [arg, from] : llvm::zip(entry.getArguments(), origArgs)) { auto id = analysis.idFromValue(from); assert(id); - auto unwrap = builder.create(ifOp.getLoc(), wireTy, - allocas[*id]); + auto unwrap = quake::UnwrapOp::create(builder, ifOp.getLoc(), wireTy, + allocas[*id]); arg.replaceAllUsesWith(unwrap); } } @@ -447,7 +448,7 @@ struct EraseWiresIf : public OpRewritePattern { for (auto v : cont.getOperands()) if (!quake::isLinearType(v.getType())) newOpnds.push_back(v); - builder.create(cont.getLoc(), newOpnds); + cudaq::cc::ContinueOp::create(builder, cont.getLoc(), newOpnds); rewriter.eraseOp(cont); } }; @@ -462,8 +463,8 @@ struct EraseWiresIf : public OpRewritePattern { if (quake::isLinearType(v.getType())) { auto id = analysis.idFromValue(v); assert(id); - auto unwrap = rewriter.create(ifOp.getLoc(), wireTy, - allocas[*id]); + auto unwrap = quake::UnwrapOp::create(rewriter, ifOp.getLoc(), wireTy, + allocas[*id]); unwraps.push_back(unwrap); } else { unwraps.push_back(newIf.getResult(i++)); @@ -511,7 +512,7 @@ class RegToMemPass : public cudaq::opt::impl::RegToMemBase { builder.setInsertionPoint(nwire); auto qrefTy = quake::RefType::get(ctx); Value a = - builder.create(nwire->getLoc(), qrefTy, Value{}); + quake::AllocaOp::create(builder, nwire->getLoc(), qrefTy, Value{}); if (fromWire) borrowAllocas.push_back(a); if (auto opt = 
analysis.idFromValue(nwire->getResult(0))) { @@ -575,7 +576,7 @@ class RegToMemPass : public cudaq::opt::impl::RegToMemBase { if (isa(op) && !borrowAllocas.empty()) { OpBuilder builder(op); for (auto v : borrowAllocas) - builder.create(func.getLoc(), v); + quake::DeallocOp::create(builder, func.getLoc(), v); } return WalkResult::advance(); }); diff --git a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp index 3b7c4f30d08..f8680220eb6 100644 --- a/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp +++ b/lib/Optimizer/Transforms/ReplaceStateWithKernel.cpp @@ -8,11 +8,7 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include "mlir/Dialect/Complex/IR/Complex.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -124,8 +120,7 @@ class ReplaceStateWithKernelPass LLVM_DEBUG(llvm::dbgs() << "Before replace state with kernel: " << func << '\n'); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) + if (failed(applyPatternsGreedily(func.getOperation(), std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() diff --git a/lib/Optimizer/Transforms/ResetBeforeReuse.cpp b/lib/Optimizer/Transforms/ResetBeforeReuse.cpp index 65f3e5e7d4a..ebb70a80f32 100644 --- a/lib/Optimizer/Transforms/ResetBeforeReuse.cpp +++ b/lib/Optimizer/Transforms/ResetBeforeReuse.cpp @@ -8,16 +8,10 @@ #include "PassDetails.h" #include "cudaq/Optimizer/CodeGen/Emitter.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "cudaq/Todo.h" #include "llvm/Support/Debug.h" -#include 
"mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Dominance.h" -#include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -125,7 +119,7 @@ class ResetAfterMeasurePattern : public OpRewritePattern { // Insert reset Location loc = mz->getLoc(); rewriter.setInsertionPointAfter(mz); - rewriter.create(loc, TypeRange{}, measuredQubit); + quake::ResetOp::create(rewriter, loc, TypeRange{}, measuredQubit); // Insert a conditional X to initialize qubit after reset. auto measOut = mz.getMeasOut(); mlir::Value measBit = [&]() { @@ -137,19 +131,19 @@ class ResetAfterMeasurePattern : public OpRewritePattern { } } // No discriminate exists - create the discriminate Op - auto discOp = rewriter.create( - loc, rewriter.getI1Type(), measOut); + auto discOp = quake::DiscriminateOp::create( + rewriter, loc, rewriter.getI1Type(), measOut); return discOp.getResult(); }(); - rewriter.create( - loc, TypeRange{}, measBit, + cudaq::cc::IfOp::create( + rewriter, loc, TypeRange{}, measBit, [&](OpBuilder &opBuilder, Location location, Region ®ion) { region.push_back(new Block{}); auto &bodyBlock = region.front(); OpBuilder::InsertionGuard guad(opBuilder); opBuilder.setInsertionPointToStart(&bodyBlock); - opBuilder.create(location, measuredQubit); - opBuilder.create(location); + quake::XOp::create(opBuilder, location, measuredQubit); + cudaq::cc::ContinueOp::create(opBuilder, location); }); modified = true; } else { @@ -190,7 +184,7 @@ class ResetAfterMeasurePattern : public OpRewritePattern { if (v.value() != extractOp) { // This is another extract. 
auto nextExtractOp = - dyn_cast_or_null(v.value()); + dyn_cast_if_present(v.value()); if (nextExtractOp) { std::optional nextIndex = nextExtractOp.hasConstantIndex() @@ -239,8 +233,8 @@ class QubitResetBeforeReusePass RegUseTracker tracker(funcOp); RewritePatternSet patterns(ctx); patterns.insert(ctx, tracker); - if (failed(applyPatternsAndFoldGreedily(funcOp.getOperation(), - std::move(patterns)))) { + if (failed(applyPatternsGreedily(funcOp.getOperation(), + std::move(patterns)))) { funcOp.emitOpError("Adding qubit reset before reuse pass failed"); signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/ResourceCount.cpp b/lib/Optimizer/Transforms/ResourceCount.cpp index ed6ce573c67..8d553399c23 100644 --- a/lib/Optimizer/Transforms/ResourceCount.cpp +++ b/lib/Optimizer/Transforms/ResourceCount.cpp @@ -7,9 +7,8 @@ ******************************************************************************/ #include "cudaq/Optimizer/Transforms/ResourceCount.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" +#include "PassDetails.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Transforms/Passes.h" using namespace mlir; diff --git a/lib/Optimizer/Transforms/ResourceCountPreprocess.cpp b/lib/Optimizer/Transforms/ResourceCountPreprocess.cpp index cafec122895..e9b12a8e4c0 100644 --- a/lib/Optimizer/Transforms/ResourceCountPreprocess.cpp +++ b/lib/Optimizer/Transforms/ResourceCountPreprocess.cpp @@ -10,27 +10,21 @@ #include "PassDetails.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/IRMapping.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" -#define 
DEBUG_TYPE "resource-count-preprocess" - -using namespace mlir; - -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// namespace cudaq::opt { #define GEN_PASS_DEF_RESOURCECOUNTPREPROCESS #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt +#define DEBUG_TYPE "resource-count-preprocess" + +using namespace mlir; + struct ResourceCountPreprocessPass : public cudaq::opt::impl::ResourceCountPreprocessBase< ResourceCountPreprocessPass> { diff --git a/lib/Optimizer/Transforms/SROA.cpp b/lib/Optimizer/Transforms/SROA.cpp index a2b48db86d5..e8017e9c412 100644 --- a/lib/Optimizer/Transforms/SROA.cpp +++ b/lib/Optimizer/Transforms/SROA.cpp @@ -7,8 +7,6 @@ ******************************************************************************/ #include "PassDetails.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -74,12 +72,12 @@ class AllocaAggregate : public OpRewritePattern { if (auto strTy = dyn_cast(allocOp.getElementType())) { for (auto mTy : strTy.getMembers()) - scalars.push_back(rewriter.create(loc, mTy)); + scalars.push_back(cudaq::cc::AllocaOp::create(rewriter, loc, mTy)); } else if (auto arrTy = dyn_cast(allocOp.getElementType())) { Type vTy = arrTy.getElementType(); for (cudaq::cc::ArrayType::SizeType i = 0; i < arrTy.getSize(); ++i) - scalars.push_back(rewriter.create(loc, vTy)); + scalars.push_back(cudaq::cc::AllocaOp::create(rewriter, loc, vTy)); } // Replace the cc.compute_ptr ops with forwarding. 
@@ -100,19 +98,21 @@ class AllocaAggregate : public OpRewritePattern { rewriter.setInsertionPoint(loadOp); auto loadTy = loadOp.getType(); auto loc = loadOp.getLoc(); - Value result = rewriter.create(loc, loadTy); + Value result = cudaq::cc::UndefOp::create(rewriter, loc, loadTy); if (auto strTy = dyn_cast(loadTy)) { for (auto [i, mTy] : llvm::enumerate(strTy.getMembers())) { - Value loadEle = rewriter.create(loc, scalars[i]); - result = rewriter.create( - loc, loadTy, result, loadEle, i); + Value loadEle = + cudaq::cc::LoadOp::create(rewriter, loc, scalars[i]); + result = cudaq::cc::InsertValueOp::create(rewriter, loc, loadTy, + result, loadEle, i); } } else { auto arrTy = cast(loadTy); for (cudaq::cc::ArrayType::SizeType i = 0; i < arrTy.getSize(); ++i) { - Value loadEle = rewriter.create(loc, scalars[i]); - result = rewriter.create( - loc, loadTy, result, loadEle, i); + Value loadEle = + cudaq::cc::LoadOp::create(rewriter, loc, scalars[i]); + result = cudaq::cc::InsertValueOp::create(rewriter, loc, loadTy, + result, loadEle, i); } } updates.emplace_back(loadOp, result); @@ -211,8 +211,8 @@ class StoreAggregate : public OpRewritePattern { auto loc = insVal.getLoc(); auto vTy = cudaq::cc::PointerType::get(v.getType()); auto toAddr = - rewriter.create(loc, vTy, dest, args); - rewriter.create(loc, v, toAddr); + cudaq::cc::ComputePtrOp::create(rewriter, loc, vTy, dest, args); + cudaq::cc::StoreOp::create(rewriter, loc, v, toAddr); } LLVM_DEBUG(llvm::dbgs() << "updated: " << storeOp << '\n'); rewriter.eraseOp(storeOp); @@ -230,7 +230,7 @@ class SROAPass : public cudaq::opt::impl::SROABase { LLVM_DEBUG(llvm::dbgs() << "Before SROA:\n" << *op << '\n'); RewritePatternSet patterns(ctx); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) { + if (failed(applyPatternsGreedily(op, std::move(patterns)))) { signalPassFailure(); return; } diff --git a/lib/Optimizer/Transforms/StatePreparation.cpp 
b/lib/Optimizer/Transforms/StatePreparation.cpp index 5cdca277dc6..07dc1a53025 100644 --- a/lib/Optimizer/Transforms/StatePreparation.cpp +++ b/lib/Optimizer/Transforms/StatePreparation.cpp @@ -8,13 +8,9 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Factory.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -163,13 +159,13 @@ class StateGateBuilder { void applyRotationOp(double theta, std::size_t target) { auto qubit = createQubitRef(target); auto thetaValue = createAngleValue(theta); - rewriter.create(loc, thetaValue, mlir::ValueRange{}, qubit); + Op::create(rewriter, loc, thetaValue, mlir::ValueRange{}, qubit); }; void applyX(std::size_t control, std::size_t target) { auto qubitC = createQubitRef(control); auto qubitT = createQubitRef(target); - rewriter.create(loc, qubitC, qubitT); + quake::XOp::create(rewriter, loc, qubitC, qubitT); }; private: @@ -177,14 +173,14 @@ class StateGateBuilder { if (qubitRefs.contains(index)) return qubitRefs[index]; - auto ref = rewriter.create(loc, qubits, index); + auto ref = quake::ExtractRefOp::create(rewriter, loc, qubits, index); qubitRefs[index] = ref; return ref; } mlir::Value createAngleValue(double angle) { - return rewriter.create( - loc, llvm::APFloat{angle}, rewriter.getF64Type()); + return arith::ConstantFloatOp::create(rewriter, loc, rewriter.getF64Type(), + llvm::APFloat{angle}); } PatternRewriter &rewriter; @@ -451,7 +447,7 @@ class StatePreparationPass patterns.insert(ctx, phaseThreshold); patterns.insert(ctx); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) { + if 
(failed(applyPatternsGreedily(func, std::move(patterns)))) { func.emitOpError("State preparation failed"); signalPassFailure(); } diff --git a/lib/Optimizer/Transforms/UnitarySynthesis.cpp b/lib/Optimizer/Transforms/UnitarySynthesis.cpp index 590e6a61c0e..baab8de9b9e 100644 --- a/lib/Optimizer/Transforms/UnitarySynthesis.cpp +++ b/lib/Optimizer/Transforms/UnitarySynthesis.cpp @@ -10,12 +10,9 @@ #include "common/EigenDense.h" #include "cudaq/Optimizer/Builder/Factory.h" #include "cudaq/Optimizer/CodeGen/Passes.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "llvm/Support/Debug.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -31,7 +28,6 @@ namespace cudaq::opt { #define DEBUG_TYPE "unitary-synthesis" using namespace mlir; -using namespace std::complex_literals; namespace { @@ -80,6 +76,7 @@ struct OneQubitOpZYZ : public Decomposer { /// corresponding explanation in https://threeplusone.com/pubs/on_gates.pdf, /// Section 4. void decompose() override { + using namespace std::complex_literals; /// Rescale the input unitary matrix, `u`, to be special unitary. 
/// Extract a phase factor, `phase`, so that /// `determinant(inverse_phase * unitary) = 1` @@ -110,8 +107,8 @@ struct OneQubitOpZYZ : public Decomposer { FunctionType::get(parentModule.getContext(), targets[0].getType(), {}); auto insPt = rewriter.saveInsertionPoint(); rewriter.setInsertionPointToStart(parentModule.getBody()); - auto func = - rewriter.create(parentModule->getLoc(), funcName, funcTy); + auto func = func::FuncOp::create(rewriter, parentModule->getLoc(), funcName, + funcTy); func.setPrivate(); auto *block = func.addEntryBlock(); rewriter.setInsertionPointToStart(block); @@ -123,17 +120,17 @@ struct OneQubitOpZYZ : public Decomposer { if (isAboveThreshold(angles.gamma)) { auto gamma = cudaq::opt::factory::createFloatConstant( loc, rewriter, angles.gamma, floatTy); - rewriter.create(loc, gamma, ValueRange{}, arguments); + quake::RzOp::create(rewriter, loc, gamma, ValueRange{}, arguments); } if (isAboveThreshold(angles.beta)) { auto beta = cudaq::opt::factory::createFloatConstant( loc, rewriter, angles.beta, floatTy); - rewriter.create(loc, beta, ValueRange{}, arguments); + quake::RyOp::create(rewriter, loc, beta, ValueRange{}, arguments); } if (isAboveThreshold(angles.alpha)) { auto alpha = cudaq::opt::factory::createFloatConstant( loc, rewriter, angles.alpha, floatTy); - rewriter.create(loc, alpha, ValueRange{}, arguments); + quake::RzOp::create(rewriter, loc, alpha, ValueRange{}, arguments); } /// NOTE: Typically global phase can be ignored but, if this decomposition /// is applied in a kernel that is called with `cudaq::control`, the global @@ -145,11 +142,11 @@ struct OneQubitOpZYZ : public Decomposer { if (isAboveThreshold(globalPhase)) { auto phase = cudaq::opt::factory::createFloatConstant( loc, rewriter, globalPhase, floatTy); - Value negPhase = rewriter.create(loc, phase); - rewriter.create(loc, phase, ValueRange{}, arguments[0]); - rewriter.create(loc, negPhase, ValueRange{}, arguments[0]); + Value negPhase = arith::NegFOp::create(rewriter, 
loc, phase); + quake::R1Op::create(rewriter, loc, phase, ValueRange{}, arguments[0]); + quake::RzOp::create(rewriter, loc, negPhase, ValueRange{}, arguments[0]); } - rewriter.create(loc); + func::ReturnOp::create(rewriter, loc); rewriter.restoreInsertionPoint(insPt); } @@ -180,6 +177,7 @@ struct KAKComponents { /// 0 i −1 0 /// 1 0 0 −i const Eigen::Matrix4cd &MagicBasisMatrix() { + using namespace std::complex_literals; static Eigen::Matrix4cd MagicBasisMatrix; MagicBasisMatrix << 1.0, 0.0, 0.0, 1i, 0.0, 1i, 1.0, 0, 0, 1i, -1.0, 0, 1.0, 0, 0, -1i; @@ -278,6 +276,7 @@ extractSU2FromSO4(const Eigen::Matrix4cd &matrix) { /// Compute exp(i(x XX + y YY + z ZZ)) matrix for verification Eigen::Matrix4cd canonicalVecToMatrix(double x, double y, double z) { + using namespace std::complex_literals; Eigen::Matrix2cd X{Eigen::Matrix2cd::Zero()}; Eigen::Matrix2cd Y{Eigen::Matrix2cd::Zero()}; Eigen::Matrix2cd Z{Eigen::Matrix2cd::Zero()}; @@ -300,6 +299,7 @@ struct TwoQubitOpKAK : public Decomposer { /// Ref: https://arxiv.org/pdf/quant-ph/0507171 /// Ref: https://arxiv.org/pdf/0806.4015 void decompose() override { + using namespace std::complex_literals; /// Step0: Convert to special unitary phase = std::pow(targetMatrix.determinant(), 0.25); auto specialUnitary = targetMatrix / phase; @@ -355,8 +355,8 @@ struct TwoQubitOpKAK : public Decomposer { FunctionType::get(parentModule.getContext(), targets.getTypes(), {}); auto insPt = rewriter.saveInsertionPoint(); rewriter.setInsertionPointToStart(parentModule.getBody()); - auto func = - rewriter.create(parentModule->getLoc(), funcName, funcTy); + auto func = func::FuncOp::create(rewriter, parentModule->getLoc(), funcName, + funcTy); func.setPrivate(); auto *block = func.addEntryBlock(); rewriter.setInsertionPointToStart(block); @@ -364,67 +364,67 @@ struct TwoQubitOpKAK : public Decomposer { FloatType floatTy = rewriter.getF64Type(); /// NOTE: Operator notation is right-to-left, whereas circuit notation is /// left-to-right. 
Hence, operations are applied in reverse order. - rewriter.create( - loc, TypeRange{}, + quake::ApplyOp::create( + rewriter, loc, TypeRange{}, SymbolRefAttr::get(rewriter.getContext(), funcName + "b0"), false, ValueRange{}, ValueRange{arguments[1]}); - rewriter.create( - loc, TypeRange{}, + quake::ApplyOp::create( + rewriter, loc, TypeRange{}, SymbolRefAttr::get(rewriter.getContext(), funcName + "b1"), false, ValueRange{}, ValueRange{arguments[0]}); /// TODO: Refactor to use a transformation pass for `quake.exp_pauli` /// XX if (isAboveThreshold(components.x)) { - rewriter.create(loc, arguments[0]); - rewriter.create(loc, arguments[1]); - rewriter.create(loc, arguments[1], arguments[0]); + quake::HOp::create(rewriter, loc, arguments[0]); + quake::HOp::create(rewriter, loc, arguments[1]); + quake::XOp::create(rewriter, loc, arguments[1], arguments[0]); auto xAngle = cudaq::opt::factory::createFloatConstant( loc, rewriter, -2.0 * components.x, floatTy); - rewriter.create(loc, xAngle, ValueRange{}, arguments[0]); - rewriter.create(loc, arguments[1], arguments[0]); - rewriter.create(loc, arguments[1]); - rewriter.create(loc, arguments[0]); + quake::RzOp::create(rewriter, loc, xAngle, ValueRange{}, arguments[0]); + quake::XOp::create(rewriter, loc, arguments[1], arguments[0]); + quake::HOp::create(rewriter, loc, arguments[1]); + quake::HOp::create(rewriter, loc, arguments[0]); } /// YY if (isAboveThreshold(components.y)) { auto piBy2 = cudaq::opt::factory::createFloatConstant(loc, rewriter, M_PI_2, floatTy); - rewriter.create(loc, piBy2, ValueRange{}, arguments[0]); - rewriter.create(loc, piBy2, ValueRange{}, arguments[1]); - rewriter.create(loc, arguments[1], arguments[0]); + quake::RxOp::create(rewriter, loc, piBy2, ValueRange{}, arguments[0]); + quake::RxOp::create(rewriter, loc, piBy2, ValueRange{}, arguments[1]); + quake::XOp::create(rewriter, loc, arguments[1], arguments[0]); auto yAngle = cudaq::opt::factory::createFloatConstant( loc, rewriter, -2.0 * 
components.y, floatTy); - rewriter.create(loc, yAngle, ValueRange{}, arguments[0]); - rewriter.create(loc, arguments[1], arguments[0]); - Value negPiBy2 = rewriter.create(loc, piBy2); - rewriter.create(loc, negPiBy2, ValueRange{}, arguments[1]); - rewriter.create(loc, negPiBy2, ValueRange{}, arguments[0]); + quake::RzOp::create(rewriter, loc, yAngle, ValueRange{}, arguments[0]); + quake::XOp::create(rewriter, loc, arguments[1], arguments[0]); + Value negPiBy2 = arith::NegFOp::create(rewriter, loc, piBy2); + quake::RxOp::create(rewriter, loc, negPiBy2, ValueRange{}, arguments[1]); + quake::RxOp::create(rewriter, loc, negPiBy2, ValueRange{}, arguments[0]); } /// ZZ if (isAboveThreshold(components.z)) { - rewriter.create(loc, arguments[1], arguments[0]); + quake::XOp::create(rewriter, loc, arguments[1], arguments[0]); auto zAngle = cudaq::opt::factory::createFloatConstant( loc, rewriter, -2.0 * components.z, floatTy); - rewriter.create(loc, zAngle, ValueRange{}, arguments[0]); - rewriter.create(loc, arguments[1], arguments[0]); + quake::RzOp::create(rewriter, loc, zAngle, ValueRange{}, arguments[0]); + quake::XOp::create(rewriter, loc, arguments[1], arguments[0]); } - rewriter.create( - loc, TypeRange{}, + quake::ApplyOp::create( + rewriter, loc, TypeRange{}, SymbolRefAttr::get(rewriter.getContext(), funcName + "a0"), false, ValueRange{}, ValueRange{arguments[1]}); - rewriter.create( - loc, TypeRange{}, + quake::ApplyOp::create( + rewriter, loc, TypeRange{}, SymbolRefAttr::get(rewriter.getContext(), funcName + "a1"), false, ValueRange{}, ValueRange{arguments[0]}); auto globalPhase = 2.0 * std::arg(phase); if (isAboveThreshold(globalPhase)) { auto phase = cudaq::opt::factory::createFloatConstant( loc, rewriter, globalPhase, floatTy); - Value negPhase = rewriter.create(loc, phase); - rewriter.create(loc, phase, ValueRange{}, arguments[0]); - rewriter.create(loc, negPhase, ValueRange{}, arguments[0]); + Value negPhase = arith::NegFOp::create(rewriter, loc, phase); + 
quake::R1Op::create(rewriter, loc, phase, ValueRange{}, arguments[0]); + quake::RzOp::create(rewriter, loc, negPhase, ValueRange{}, arguments[0]); } - rewriter.create(loc); + func::ReturnOp::create(rewriter, loc); rewriter.restoreInsertionPoint(insPt); } @@ -499,8 +499,8 @@ class UnitarySynthesisPass RewritePatternSet patterns(ctx); patterns.insert(ctx); LLVM_DEBUG(llvm::dbgs() << "Before unitary synthesis: " << func << '\n'); - if (failed(applyPatternsAndFoldGreedily(func.getOperation(), - std::move(patterns)))) + if (failed( + applyPatternsGreedily(func.getOperation(), std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After unitary synthesis: " << func << '\n'); } diff --git a/lib/Optimizer/Transforms/VariableCoalesce.cpp b/lib/Optimizer/Transforms/VariableCoalesce.cpp index b20c5047e35..74d12ba01dd 100644 --- a/lib/Optimizer/Transforms/VariableCoalesce.cpp +++ b/lib/Optimizer/Transforms/VariableCoalesce.cpp @@ -242,7 +242,7 @@ class VariableCoalescePass } auto loc = o->getLoc(); auto ty = cast(o).getElementType(); - auto newVar = rewriter.create(loc, ty); + auto newVar = cudaq::cc::AllocaOp::create(rewriter, loc, ty); analysis.addBinding(o, newVar); } } @@ -250,7 +250,7 @@ class VariableCoalescePass // Step 2: Replace old variables with new ones. RewritePatternSet patterns(ctx); patterns.insert(ctx, analysis); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) + if (failed(applyPatternsGreedily(func, std::move(patterns)))) signalPassFailure(); LLVM_DEBUG(llvm::dbgs() << "After variable coalescing:\n" << func << "\n\n"); diff --git a/lib/Optimizer/Transforms/WiresToWiresets.cpp b/lib/Optimizer/Transforms/WiresToWiresets.cpp index cc674b9cbfb..392c4005559 100644 --- a/lib/Optimizer/Transforms/WiresToWiresets.cpp +++ b/lib/Optimizer/Transforms/WiresToWiresets.cpp @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ +#include "PassDetails.h" #include "cudaq/Frontend/nvqpp/AttributeNames.h" -#include "cudaq/Optimizer/Dialect/CC/CCDialect.h" #include "cudaq/Optimizer/Dialect/Characteristics.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h" -#include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Threading.h" @@ -18,17 +16,14 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/DialectConversion.h" -using namespace mlir; - -//===----------------------------------------------------------------------===// -// Generated logic -//===----------------------------------------------------------------------===// namespace cudaq::opt { #define GEN_PASS_DEF_ASSIGNWIREINDICES #define GEN_PASS_DEF_ADDWIRESET #include "cudaq/Optimizer/Transforms/Passes.h.inc" } // namespace cudaq::opt +using namespace mlir; + namespace { class NullWirePat : public OpRewritePattern { public: @@ -111,9 +106,9 @@ struct AddWiresetPass void runOnOperation() override { ModuleOp mod = getOperation(); OpBuilder builder(mod.getBodyRegion()); - auto wireSetOp = builder.create( - builder.getUnknownLoc(), cudaq::opt::topologyAgnosticWiresetName, - INT_MAX, ElementsAttr{}); + auto wireSetOp = quake::WireSetOp::create( + builder, builder.getUnknownLoc(), + cudaq::opt::topologyAgnosticWiresetName, INT_MAX, ElementsAttr{}); wireSetOp.setPrivate(); } }; diff --git a/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp b/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp index e377d771427..67484bc9f2c 100644 --- a/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp +++ b/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp @@ -8,10 +8,7 @@ #include "PassDetails.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" -#include "cudaq/Optimizer/Dialect/CC/CCOps.h" -#include 
"cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" diff --git a/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc b/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc index 867e93c743c..c971ee3f4f6 100644 --- a/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc +++ b/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -94,8 +94,8 @@ private: if (auto store = dyn_cast(&op)) { auto ptr = store.getPtrvalue().getDefiningOp(); if (isStoreToStack(store)) { - auto &[b, ptrToStores] = blockInfo.FindAndConstruct(block); - auto &[p, stores] = ptrToStores.FindAndConstruct(ptr); + auto ptrToStores = blockInfo[block]; + auto stores = ptrToStores[ptr]; stores.push_back(&op); } } @@ -113,7 +113,7 @@ private: /// cc.store %c0_i64, %3 : !cc.ptr /// ``` static bool isStoreToStack(cudaq::cc::StoreOp store) { - auto ptrOp = store.getPtrvalue(); + Value ptrOp = store.getPtrvalue(); if (auto cast = ptrOp.getDefiningOp()) ptrOp = cast.getOperand(); diff --git a/lib/Verifier/NVQIRCalls.cpp b/lib/Verifier/NVQIRCalls.cpp index 4d8b6fd5ecc..7c473885616 100644 --- a/lib/Verifier/NVQIRCalls.cpp +++ b/lib/Verifier/NVQIRCalls.cpp @@ -39,7 +39,7 @@ constexpr const char *libcFuncs[] = {"malloc", "free", "memcpy", "memset"}; static bool isVerifiedFunction(StringRef name, const SmallVector &goldenFuncs) { auto prefixCheck = [&](const char *prefix) { - return name.startswith(prefix); + return name.starts_with(prefix); }; // Check if name has 
an accepted QIR or LLVM intrinsic prefix. diff --git a/lib/Verifier/QIRLLVMIRDialect.cpp b/lib/Verifier/QIRLLVMIRDialect.cpp index ba6df68db4c..92812236d1d 100644 --- a/lib/Verifier/QIRLLVMIRDialect.cpp +++ b/lib/Verifier/QIRLLVMIRDialect.cpp @@ -74,7 +74,7 @@ LogicalResult cudaq::verifier::checkQIRLLVMIRDialect(ModuleOp module, func && func->hasAttr(cudaq::kernelAttrName)) funcs.push_back(func); - const bool isBaseProfile = profile.startswith("qir-base"); + const bool isBaseProfile = profile.starts_with("qir-base"); auto *ctx = module.getContext(); for (auto func : funcs) { auto walkResult = func.walk([&](Operation *op) { @@ -87,19 +87,28 @@ LogicalResult cudaq::verifier::checkQIRLLVMIRDialect(ModuleOp module, if (!funcNameAttr) return WalkResult::advance(); auto funcName = funcNameAttr.getValue(); - if (isBaseProfile && (!funcName.startswith("__quantum_") || - funcName.equals(cudaq::opt::QIRCustomOp))) { + if (isBaseProfile && (!funcName.starts_with("__quantum_") || + funcName == cudaq::opt::QIRCustomOp)) { call.emitOpError("unexpected call in QIR base profile"); return WalkResult::interrupt(); } // Check that qubits are unique values. const std::size_t numOpnds = call.getNumOperands(); - auto qubitTy = cudaq::opt::getQubitType(ctx); - if (numOpnds > 0) - for (std::size_t i = 0; i < numOpnds - 1; ++i) + auto qubitTy = cudaq::cg::getQubitType(ctx); + // Determine how many leading operands are qubit pointers. With + // opaque pointers, Qubit* and Result* are both !llvm.ptr so we + // cannot distinguish them by type. For measurement functions + // like mz__body(Qubit*, Result*), only the first operand is a + // qubit; the second is a Result. Limit the uniqueness check to + // qubit operand indices only. 
+ std::size_t numQubitOpnds = numOpnds; + if (funcName == cudaq::opt::QIRMeasureBody) + numQubitOpnds = 1; + if (numQubitOpnds > 1) + for (std::size_t i = 0; i < numQubitOpnds - 1; ++i) if (call.getOperand(i).getType() == qubitTy) - for (std::size_t j = i + 1; j < numOpnds; ++j) + for (std::size_t j = i + 1; j < numQubitOpnds; ++j) if (call.getOperand(j).getType() == qubitTy) { auto i1 = call.getOperand(i).getDefiningOp(); diff --git a/lib/Verifier/QIRSpec.cpp b/lib/Verifier/QIRSpec.cpp index 8d028b25516..4c016bdaaa2 100644 --- a/lib/Verifier/QIRSpec.cpp +++ b/lib/Verifier/QIRSpec.cpp @@ -11,6 +11,7 @@ #include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Target/LLVMIR/Export.h" diff --git a/pyproject.toml.cu12 b/pyproject.toml.cu12 index 96cd0a73ace..b65219d4ebd 100644 --- a/pyproject.toml.cu12 +++ b/pyproject.toml.cu12 @@ -24,7 +24,7 @@ dependencies = [ 'cudensitymat-cu12 ~= 0.5.1', 'numpy >= 1.24', 'scipy >= 1.10.1', - 'requests >= 2.32.4', + 'requests >= 2.32.3', 'nvidia-cublas-cu12 ~= 12.0', 'nvidia-curand-cu12 ~= 10.3', 'nvidia-cusparse-cu12 ~= 12.5', @@ -62,7 +62,7 @@ visualization = [ "qutip>5" , "matplotlib>=3.5" ] integrators = [ "torchdiffeq" ] [build-system] -requires = ["scikit-build-core==0.11.6", "cmake>=3.27,<3.29", "numpy>=1.24", "pytest==9.0.3"] +requires = ["scikit-build-core==0.11.6", "cmake>=3.27,<3.29", "numpy>=1.24", "pytest==9.0.3", "nanobind>=2.9.0"] build-backend = "scikit_build_core.build" [tool.scikit-build] @@ -87,5 +87,16 @@ if.platform-machine = "x86_64" inherit.cmake.args = "append" cmake.args = ["-DCUDAQ_ENABLE_PASQAL_QRMI_CONNECTOR=ON"] +# Linux: use LLD as the linker +[[tool.scikit-build.overrides]] +if.platform-system = "linux" +inherit.cmake.args = "append" +cmake.args = [ + "-DLLVM_USE_LINKER=lld", + "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld -B/usr/local/llvm/bin", + 
"-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld -B/usr/local/llvm/bin", + "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld -B/usr/local/llvm/bin", +] + [tool.setuptools_scm] write_to = "_version.py" diff --git a/pyproject.toml.cu13 b/pyproject.toml.cu13 index fc226812534..c811fbfe5cc 100644 --- a/pyproject.toml.cu13 +++ b/pyproject.toml.cu13 @@ -21,7 +21,7 @@ dependencies = [ 'astpretty ~= 3.0', 'numpy >= 1.24', 'scipy >= 1.10.1', - 'requests >= 2.32.4', + 'requests >= 2.32.3', # CUDA dependencies - excluded on macOS (CPU-only support) 'custatevec-cu13 ~= 1.13.1; sys_platform != "darwin"', 'cutensornet-cu13 ~= 2.12.1; sys_platform != "darwin"', @@ -64,7 +64,7 @@ visualization = [ "qutip>5" , "matplotlib>=3.5" ] integrators = [ "torchdiffeq" ] [build-system] -requires = ["scikit-build-core==0.11.6", "cmake>=3.27,<3.29", "numpy>=1.24", "pytest==9.0.3"] +requires = ["scikit-build-core==0.11.6", "cmake>=3.27,<3.29", "numpy>=1.24", "pytest==9.0.3", "nanobind>=2.9.0"] build-backend = "scikit_build_core.build" [tool.scikit-build] @@ -89,6 +89,17 @@ if.platform-machine = "x86_64" inherit.cmake.args = "append" cmake.args = ["-DCUDAQ_ENABLE_PASQAL_QRMI_CONNECTOR=ON"] +# Linux: use LLD as the linker +[[tool.scikit-build.overrides]] +if.platform-system = "linux" +inherit.cmake.args = "append" +cmake.args = [ + "-DLLVM_USE_LINKER=lld", + "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld -B/usr/local/llvm/bin", + "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld -B/usr/local/llvm/bin", + "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld -B/usr/local/llvm/bin", +] + # macOS: Disable symbol stripping. LLVM's JIT relies on local symbols for # internal data structures (eg., PassRegistry). 
On macOS # with it's two-level namespace they are removed by stripping which diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2eaf4ee0d75..3dd993f587d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -8,10 +8,6 @@ add_subdirectory(utils) -if (CMAKE_BUILD_TYPE STREQUAL "DEBUG") - set(CMAKE_BUILD_TYPE "Debug") -endif() - # [RFC]: # Check how to solve this better than just disable the warning for the whole directory. # If this is better addressed after updating to a newer LLVM version, track as an issue on GitHub. @@ -35,15 +31,22 @@ if (CUDA_FOUND) find_package(CUDAToolkit REQUIRED) endif() +if(LLVM_ENABLE_ASSERTIONS) + set(CUDAQ_ASSERTIONS_ENABLED "1") +else() + set(CUDAQ_ASSERTIONS_ENABLED "") +endif() + set(METADATA_FILE "${CMAKE_BINARY_DIR}/python/cudaq/_metadata.py" ) add_custom_target( CopyPythonFiles ALL COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_BINARY_DIR}/python - COMMAND ${CMAKE_COMMAND} - -DMETADATA_FILE="${METADATA_FILE}" - -DCUDA_VERSION_MAJOR=${CUDAToolkit_VERSION_MAJOR} + COMMAND ${CMAKE_COMMAND} + -DMETADATA_FILE="${METADATA_FILE}" + -DCUDA_VERSION_MAJOR=${CUDAToolkit_VERSION_MAJOR} + -DASSERTIONS_ENABLED=${CUDAQ_ASSERTIONS_ENABLED} -P ${CMAKE_CURRENT_SOURCE_DIR}/metadata.cmake DEPENDS ${PYTHON_SOURCES} BYPRODUCTS "${METADATA_FILE}" diff --git a/python/cudaq/__init__.py b/python/cudaq/__init__.py index afce1fb5832..8c675f9ea24 100644 --- a/python/cudaq/__init__.py +++ b/python/cudaq/__init__.py @@ -127,6 +127,46 @@ def _configure_cuda_library_paths() -> None: print("Could not find a suitable cuQuantum Python package.") pass + +def _patch_mlir_isinstance() -> None: + import builtins + + from .mlir._mlir_libs import _mlir as _mlir_ext + ir = _mlir_ext.ir + value_base = getattr(ir, "Value", None) + py_isinstance = builtins.isinstance + for name in dir(ir): + cls = getattr(ir, name) + if not py_isinstance(cls, type) or "isinstance" in cls.__dict__: + continue + static_typeid = None + 
try: + static_typeid = cls.static_typeid + except Exception: + pass + if static_typeid is not None: + + def _isinstance(other, _tid=static_typeid): + try: + return other.typeid == _tid + except Exception: + return False + elif value_base is not None and cls is not value_base and \ + issubclass(cls, value_base): + + def _isinstance(other, _cls=cls, _isinst=py_isinstance): + try: + return _isinst(other.maybe_downcast(), _cls) + except Exception: + return False + else: + continue + setattr(cls, "isinstance", staticmethod(_isinstance)) + + +_patch_mlir_isinstance() +del _patch_mlir_isinstance + # ============================================================================ # # Module Imports # ============================================================================ # diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index 2f0aac710df..e7f58617f0f 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -2794,12 +2794,12 @@ def isExactCudaqDbgAstCall(func_node: ast.AST) -> bool: totalSize = arith.SubIOp(endVal, startVal).result if isDecrementing: - roundingOffset = arith.AddIOp(stepVal, one) + roundingOffset = arith.AddIOp(stepVal, one).result else: - roundingOffset = arith.SubIOp(stepVal, one) - totalSize = arith.AddIOp(totalSize, roundingOffset) + roundingOffset = arith.SubIOp(stepVal, one).result + totalSize = arith.AddIOp(totalSize, roundingOffset).result totalSize = arith.MaxSIOp( - zero, + zero.result, arith.DivSIOp(totalSize, stepVal).result).result # Create an array of i64 of the total size @@ -2815,7 +2815,7 @@ def isExactCudaqDbgAstCall(func_node: ast.AST) -> bool: # but we also need to keep track of a counter counter = cc.AllocaOp(cc.PointerType.get(iTy), TypeAttr.get(iTy)).result - cc.StoreOp(zero, counter) + cc.StoreOp(zero.result, counter) def bodyBuilder(iterVar): loadedCounter = cc.LoadOp(counter).result @@ -2824,7 +2824,8 @@ def bodyBuilder(iterVar): 
DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) cc.StoreOp(iterVar, eleAddr) - incrementedCounter = arith.AddIOp(loadedCounter, one).result + incrementedCounter = arith.AddIOp(loadedCounter, + one.result).result cc.StoreOp(incrementedCounter, counter) self.createMonotonicForLoop(bodyBuilder, @@ -3610,19 +3611,25 @@ def check_vector_init(): cudaq_module = importlib.import_module('cudaq') channel_class = getattr(cudaq_module, node.args[0].attr) - numParams = channel_class.num_parameters + numParams = (channel_class.num_parameters + if hasattr(channel_class, + 'num_parameters') else + channel_class.get_num_parameters()) key = self.getConstantInt(hash(channel_class)) elif isinstance(node.args[0], ast.Name): arg = recover_value_of_or_none( node.args[0].id, self.defFrame) if (arg and isinstance(arg, type) and issubclass( arg, cudaq_runtime.KrausChannel)): - if not hasattr(arg, 'num_parameters'): + if (not hasattr(arg, 'num_parameters') and + not hasattr(arg, 'get_num_parameters')): self.emitFatalError( 'apply_noise kraus channels must have ' '`num_parameters` constant class ' 'attribute specified.') - numParams = arg.num_parameters + numParams = (arg.num_parameters if hasattr( + arg, 'num_parameters') else + arg.get_num_parameters()) key = self.getConstantInt(hash(arg)) if key is None: self.emitFatalError( @@ -4823,10 +4830,10 @@ def compare_equality(item1, item2): if ComplexType.isinstance(item1.type): reComp = arith.CmpFOp(fCondPred, complex.ReOp(item1).result, - complex.ReOp(item2).result) + complex.ReOp(item2).result).result imComp = arith.CmpFOp(fCondPred, complex.ImOp(item1).result, - complex.ImOp(item2).result) + complex.ImOp(item2).result).result return arith.AndIOp(reComp, imComp).result elif IntegerType.isinstance(item1.type): return arith.CmpIOp(iCondPred, item1, item2).result @@ -5515,8 +5522,13 @@ def compile_to_mlir(uniqueId, astModule, signature: KernelSignature, defFrame, if verbose: print(bridge.module) # Clear the live operations cache. 
This avoids python crashing with - # stale references being cached. - bridge.module.context._clear_live_operations() + # stale references being cached. (MLIR 22+ may expose this as + # clear_live_operations instead of _clear_live_operations.) + ctx = bridge.module.context + clear_fn = getattr(ctx, '_clear_live_operations', None) or getattr( + ctx, 'clear_live_operations', None) + if clear_fn is not None: + clear_fn() # The only MLIR code object wrapped & tracked ought to be `newMod` now. cudaq_runtime.set_data_layout(bridge.module) return bridge.module diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py index 68627126007..64608e50e50 100644 --- a/python/cudaq/kernel/kernel_builder.py +++ b/python/cudaq/kernel/kernel_builder.py @@ -1553,12 +1553,24 @@ def process_channel_param(self, param): else: emitFatalError("Noise channel parameter must be float") + @staticmethod + def _get_num_parameters(noise_channel): + """Return the `num_parameters` for a noise channel class, + supporting both the attribute (custom channels) and the + method (nanobind-bound built-in channels).""" + if hasattr(noise_channel, 'num_parameters'): + return noise_channel.num_parameters + if hasattr(noise_channel, 'get_num_parameters'): + return noise_channel.get_num_parameters() + return None + @staticmethod def _validate_noise_channel_probability_params(noise_channel, param_values): """ Raise `RuntimeError` if any `param` is a constant float outside [0, 1]. 
""" - if not hasattr(noise_channel, 'num_parameters'): + if not (hasattr(noise_channel, 'num_parameters') or + hasattr(noise_channel, 'get_num_parameters')): return for p in param_values: if isinstance(p, (int, float)): @@ -1578,17 +1590,19 @@ def apply_noise(self, noise_channel, *args): self.appliedNoiseChannels.append(noise_channel) if not issubclass(noise_channel, cudaq_runtime.KrausChannel): - if not hasattr(noise_channel, 'num_parameters'): + if not (hasattr(noise_channel, 'num_parameters') or + hasattr(noise_channel, 'get_num_parameters')): emitFatalError( 'apply_noise kraus channels must have `num_parameters` ' 'constant class attribute specified.') + n_params = self._get_num_parameters(noise_channel) # We needs to have noise channel parameters + qubit arguments if isinstance(args[0], list): - if len(args[0]) != noise_channel.num_parameters: + if len(args[0]) != n_params: emitFatalError(f"Invalid number of arguments passed to " f"apply_noise for channel `{noise_channel}`") - elif len(args) <= noise_channel.num_parameters: + elif len(args) <= n_params: emitFatalError(f"Invalid number of arguments passed to " f"apply_noise for channel `{noise_channel}`") @@ -1612,11 +1626,12 @@ def apply_noise(self, noise_channel, *args): emitFatalError("Invalid qubit operand type") target_qubits.append(p.mlirValue) else: - param_values = args[:noise_channel.num_parameters] + n_params = self._get_num_parameters(noise_channel) + param_values = args[:n_params] self._validate_noise_channel_probability_params( noise_channel, param_values) for i, p in enumerate(args): - if i < noise_channel.num_parameters: + if i < n_params: noise_channel_params.append( self.process_channel_param(p)) else: diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py index 98787d1844f..a706760fc4c 100644 --- a/python/cudaq/kernel/kernel_decorator.py +++ b/python/cudaq/kernel/kernel_decorator.py @@ -305,7 +305,7 @@ def merge_kernel(self, otherMod): for op in 
newMod.body: if isinstance(op, func.FuncOp): for attr in op.attributes: - if 'cudaq-entrypoint' == attr.name: + if 'cudaq-entrypoint' == attr: name = op.name.value.removeprefix(nvqppPrefix) break @@ -327,7 +327,7 @@ def merge_quake_source(self, quakeText): for op in newMod.body: if isinstance(op, func.FuncOp): for attr in op.attributes: - if 'cudaq-entrypoint' == attr.name: + if 'cudaq-entrypoint' == attr: name = op.name.value.removeprefix(nvqppPrefix) break diff --git a/python/cudaq/mlir/dialects/CCOps.td b/python/cudaq/mlir/dialects/CCOps.td index db5f1469beb..7822ababa66 100644 --- a/python/cudaq/mlir/dialects/CCOps.td +++ b/python/cudaq/mlir/dialects/CCOps.td @@ -9,7 +9,6 @@ #ifndef PYTHON_BINDINGS_CC_OPS #define PYTHON_BINDINGS_CC_OPS -include "mlir/Bindings/Python/Attributes.td" include "cudaq/Optimizer/Dialect/CC/CCOps.td" #endif diff --git a/python/cudaq/mlir/dialects/QuakeOps.td b/python/cudaq/mlir/dialects/QuakeOps.td index 6552c781014..e7ef1d46ab4 100644 --- a/python/cudaq/mlir/dialects/QuakeOps.td +++ b/python/cudaq/mlir/dialects/QuakeOps.td @@ -9,7 +9,6 @@ #ifndef PYTHON_BINDINGS_QUAKE_OPS #define PYTHON_BINDINGS_QUAKE_OPS -include "mlir/Bindings/Python/Attributes.td" include "cudaq/Optimizer/Dialect/Quake/QuakeOps.td" #endif diff --git a/python/cudaq/runtime/sample.py b/python/cudaq/runtime/sample.py index 21975599e43..4957336721d 100644 --- a/python/cudaq/runtime/sample.py +++ b/python/cudaq/runtime/sample.py @@ -92,8 +92,10 @@ def _detail_check_conditionals_on_measure(kernel): # Only check for kernels that can be compiled, not library-mode kernels (e.g., photonics) if kernel.supports_compilation(): for operation in kernel.qkeModule.body.operations: - if (hasattr(operation, 'name') and nvqppPrefix + kernel.uniqName - == operation.name.value and + op_name = getattr(operation.name, + 'value', operation.name) if hasattr( + operation, 'name') else None + if (op_name == nvqppPrefix + kernel.uniqName and 'qubitMeasurementFeedback' in operation.attributes): 
has_conditionals_on_measure_result = True elif isinstance(kernel, PyKernel) and kernel.conditionalOnMeasure: diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt index da035cc75cf..5873bae9597 100644 --- a/python/extension/CMakeLists.txt +++ b/python/extension/CMakeLists.txt @@ -6,53 +6,23 @@ # the terms of the Apache License 2.0 which accompanies this distribution. # # ============================================================================ # -if (CMAKE_BUILD_TYPE STREQUAL "DEBUG") - set(CMAKE_BUILD_TYPE "Debug") -endif() - include(HandleLLVMOptions) include(AddMLIRPython) -function(add_mlir_python_extension libname extname) - cmake_parse_arguments(ARG - "" - "INSTALL_COMPONENT;INSTALL_DIR;OUTPUT_DIRECTORY" - "SOURCES;LINK_LIBS" - ${ARGN}) - - # Use nanobind for CUDA-Q's own extension (_quakeDialects) and pybind11 - # for upstream MLIR extensions (AsyncPasses, RegisterEverything, etc.). - if(libname MATCHES "_quakeDialects") - nanobind_add_module(${libname} NB_STATIC ${ARG_SOURCES}) - target_compile_options(${libname} PRIVATE -frtti -fexceptions -Wno-cast-qual) - else() - pybind11_add_module(${libname} MODULE ${ARG_SOURCES}) - target_compile_options(${libname} PRIVATE -frtti -fexceptions) - endif() - - set_target_properties(${libname} PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${ARG_OUTPUT_DIRECTORY} - OUTPUT_NAME "${extname}" - NO_SONAME ON - ) - - target_link_libraries(${libname} PRIVATE ${ARG_LINK_LIBS}) - target_link_options(${libname} PRIVATE - $<$:LINKER:--exclude-libs,ALL> - ) - - if(ARG_INSTALL_DIR) - install(TARGETS ${libname} - COMPONENT ${ARG_INSTALL_COMPONENT} - LIBRARY DESTINATION "${ARG_INSTALL_DIR}" - RUNTIME DESTINATION "${ARG_INSTALL_DIR}" - ) - endif() -endfunction() +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-Wdeprecated-literal-operator" + CUDAQ_HAS_WDEPRECATED_LITERAL_OPERATOR) +if(NOT CUDAQ_HAS_WDEPRECATED_LITERAL_OPERATOR) + add_compile_options(-Wno-unknown-warning-option) +endif() # Specifies that all 
MLIR packages are co-located under the cudaq # top level package (the API has been embedded in a relocatable way). add_compile_definitions("MLIR_PYTHON_PACKAGE_PREFIX=cudaq.mlir.") +# Mark QPU sources compiled into the Python extension so they use the +# cross-DSO registry hook (cudaq_add_qpu_node) instead of the local +# CUDAQ_REGISTER_TYPE which would register into the wrong DSO. +add_compile_definitions("CUDAQ_PYTHON_EXTENSION") ################################################################################ # Sources @@ -64,14 +34,50 @@ declare_mlir_dialect_python_bindings( ADD_TO_PARENT CUDAQuantumPythonSources ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../cudaq/mlir" TD_FILE dialects/QuakeOps.td + SOURCES + dialects/quake.py DIALECT_NAME quake) declare_mlir_dialect_python_bindings( ADD_TO_PARENT CUDAQuantumPythonSources ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../cudaq/mlir" TD_FILE dialects/CCOps.td + SOURCES + dialects/cc.py DIALECT_NAME cc) +if(APPLE) + set(_quakeDialects_mlir_runtime_sources "") + set(_quakeDialects_extra_link_libs + cudaq-mlir-runtime + MLIRPass + CUDAQTargetConfigUtil + cudaq-python-interop + cudaq-platform-default + ) + set(_quakeDialects_pipeline_carrying_link_libs "") +else() + set(_quakeDialects_mlir_runtime_sources + ../../runtime/internal/compiler/ArgumentConversion.cpp + ../../runtime/internal/compiler/CompiledModuleHelper.cpp + ../../runtime/internal/compiler/LayoutInfo.cpp + ../../runtime/internal/compiler/RuntimeMLIR.cpp + ../../runtime/internal/compiler/JIT.cpp + ../../runtime/internal/compiler/Compiler.cpp + ../../runtime/internal/compiler/TracePassInstrumentation.cpp + ) + set(_quakeDialects_extra_link_libs "") + set(_quakeDialects_pipeline_carrying_link_libs + OptCodeGen + OptTransforms + MLIRPass + CUDAQTargetConfigUtil + cudaq-python-interop + cudaq-platform-default + cudaq-qir-verifier + ) +endif() + declare_mlir_python_extension(CUDAQuantumPythonSources.Extension MODULE_NAME _quakeDialects ADD_TO_PARENT 
CUDAQuantumPythonSources @@ -130,27 +136,15 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension ../../runtime/cudaq/platform/fermioniq/FermioniqQPU.cpp ../../runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp ../../runtime/cudaq/platform/default/python/QPU.cpp - ../../runtime/internal/compiler/ArgumentConversion.cpp - ../../runtime/internal/compiler/CompiledModuleHelper.cpp - ../../runtime/internal/compiler/LayoutInfo.cpp - ../../runtime/internal/compiler/RuntimeMLIR.cpp + ${_quakeDialects_mlir_runtime_sources} ../../runtime/internal/compiler/RuntimePyMLIR.cpp - ../../runtime/internal/compiler/JIT.cpp - ../../runtime/internal/compiler/Compiler.cpp - ../../runtime/internal/compiler/TracePassInstrumentation.cpp EMBED_CAPI_LINK_LIBS CUDAQuantumMLIRCAPI MLIRCAPIExecutionEngine PRIVATE_LINK_LIBS - OptCodeGen - OptTransforms - MLIRPass - CUDAQTargetConfigUtil - cudaq-python-interop - cudaq-platform-default - cudaq-qir-verifier - cudaq-mlir-runtime-headers + ${_quakeDialects_pipeline_carrying_link_libs} + ${_quakeDialects_extra_link_libs} ) # MLIR/LLVM is built without RTTI (LLVM_ENABLE_RTTI=OFF). This file subclasses @@ -166,6 +160,7 @@ target_include_directories(CUDAQuantumPythonSources.Extension INTERFACE ${CMAKE_SOURCE_DIR}/python ${CMAKE_SOURCE_DIR}/python/utils ${CMAKE_SOURCE_DIR}/runtime + ${CMAKE_SOURCE_DIR}/runtime/internal/compiler/include ) target_link_libraries(CUDAQuantumPythonSources.Extension INTERFACE cudaq @@ -174,8 +169,10 @@ target_link_libraries(CUDAQuantumPythonSources.Extension INTERFACE cudaq-em-default cudaq-em-photonics fmt::fmt-header-only - unzip_util ) +if (CUDAQ_ENABLE_REST) + target_link_libraries(CUDAQuantumPythonSources.Extension INTERFACE unzip_util) +endif() ################################################################################ # Common CAPI @@ -192,12 +189,29 @@ add_mlir_python_common_capi_library(CUDAQuantumPythonCAPI # available. 
MLIRPythonExtension.RegisterEverything MLIRPythonSources.Core + # Include full MLIRPythonSources so dialect extensions' EMBED_CAPI_LINK_LIBS + # (e.g. obj.MLIRCAPILLVM for the LLVM dialect) are embedded into the common + # CAPI lib. Otherwise _mlirDialectsLLVM.so fails with undefined symbol + # mlirTypeIsALLVMStructType at runtime. + MLIRPythonSources ) +if(APPLE) + target_link_options(CUDAQuantumPythonCAPI PRIVATE + "LINKER:-flat_namespace" + "LINKER:-undefined,dynamic_lookup") +endif() + ################################################################################ # Instantiation of Python module ################################################################################ +# This variable is unused in cudaq but if it is not set, we hit a bug in +# add_mlir_python_modules whereby it is defined twice on the compilation line: +# -DMLIR_BINDINGS_PYTHON_NB_DOMAIN "" -DMLIR_BINDINGS_PYTHON_NB_DOMAIN mlir +# which results in a compilation error. +set(MLIR_BINDINGS_PYTHON_NB_DOMAIN "cudaq") + add_mlir_python_modules(CUDAQuantumPythonModules ROOT_PREFIX "${MLIR_BINARY_DIR}/python/cudaq/mlir" INSTALL_PREFIX "cudaq/mlir" @@ -211,11 +225,94 @@ add_mlir_python_modules(CUDAQuantumPythonModules CUDAQuantumPythonCAPI ) -if(TARGET nanobind-static) - target_compile_options(nanobind-static PRIVATE -Wno-cast-qual -Wno-covered-switch-default) +# Suppress warnings-as-errors for upstream MLIR Python extension sources +# that have minor GCC warnings (address-of-function, parentheses) in LLVM 22. 
+foreach(_cudaq_py_ext_target + CUDAQuantumPythonModules.extension._mlir.dso + CUDAQuantumPythonModules.extension.MLIRPythonSupport-cudaq.so) + if(TARGET ${_cudaq_py_ext_target}) + target_compile_options(${_cudaq_py_ext_target} PRIVATE + -Wno-error=address -Wno-error=parentheses) + endif() +endforeach() + +# Upstream MLIR's add_mlir_python_extension sets `-Wl,--exclude-libs,ALL` on +# every extension, which hides the symbols pulled in from the static MLIR +# archives from the extension's dynamic symbol table. For upstream extensions +# that only use CAPI functions this is fine, but CUDA-Q's _quakeDialects.so +# calls MLIR C++ APIs directly (e.g. StringAttr::get in py_register_dialects +# and CUDAQuantumExtension). Those calls reference template statics like +# `mlir::detail::TypeIDResolver::id` which are GNU UNIQUE symbols with +# default visibility. When `--exclude-libs,ALL` hides them, each DSO ends up +# with its own private copy, and the TypeID used inside _quakeDialects.so no +# longer matches CAPI's — tripping "storage uniquer isn't initialized" and +# "different dialects for the same namespace" errors at runtime. +# +# Strip that option so the UNIQUE statics stay in the dynamic symbol table +# and the runtime linker unifies them with libCUDAQuantumPythonCAPI.so's +# copy at load time. +if(TARGET CUDAQuantumPythonModules.extension._quakeDialects.dso) + # 1) Strip --exclude-libs,ALL so the MLIR template statics (e.g. + # mlir::detail::TypeIDResolver::id) that come in via the MLIR static + # archives stay in the dynamic symbol table. These are STB_GLOBAL + # (default visibility) COMDAT symbols; with --exclude-libs,ALL they are + # demoted to local and each DSO ends up with its own private copy. 
+ get_target_property(_qd_link_options + CUDAQuantumPythonModules.extension._quakeDialects.dso LINK_OPTIONS) + if(_qd_link_options) + list(REMOVE_ITEM _qd_link_options + "$<$:LINKER:--exclude-libs,ALL>" + "LINKER:--exclude-libs,ALL" + "LINKER:-twolevel_namespace") + set_target_properties(CUDAQuantumPythonModules.extension._quakeDialects.dso + PROPERTIES LINK_OPTIONS "${_qd_link_options}") + endif() + + # 2) Prepend libCUDAQuantumPythonCAPI.so to the link line so ld's archive + # extraction finds MLIR symbols in the (shared) CAPI before scanning the + # static archives. When CAPI already defines `mlir::StringAttr::get`, + # `mlir::detail::TypeIDResolver::id`, etc., the matching .o files in + # libMLIRIR.a are not pulled in, so _quakeDialects.so has no private + # copies and its references resolve to CAPI at runtime — keeping the + # TypeID addresses consistent with the ones CAPI used when constructing + # the MLIRContext. + target_link_options(CUDAQuantumPythonModules.extension._quakeDialects.dso + BEFORE PRIVATE + "$") + + if(APPLE) + target_link_options(CUDAQuantumPythonModules.extension._quakeDialects.dso + PRIVATE + "LINKER:-flat_namespace" + "LINKER:-undefined,dynamic_lookup") + endif() endif() -## The Python bindings module for Quake dialect depends on CUDAQ libraries +if(TARGET cudaq-mlir-runtime AND TARGET CUDAQuantumPythonCAPI) + get_target_property(_mr_link_options cudaq-mlir-runtime LINK_OPTIONS) + if(_mr_link_options) + list(REMOVE_ITEM _mr_link_options + "$<$:LINKER:--exclude-libs,ALL>" + "LINKER:--exclude-libs,ALL" + "LINKER:-twolevel_namespace") + set_target_properties(cudaq-mlir-runtime + PROPERTIES LINK_OPTIONS "${_mr_link_options}") + endif() + target_link_options(cudaq-mlir-runtime BEFORE PRIVATE + "$") + if(APPLE) + target_link_options(cudaq-mlir-runtime PRIVATE + "LINKER:-undefined,dynamic_lookup") + else() + target_link_libraries(cudaq-mlir-runtime INTERFACE + $) + endif() + set_property(TARGET cudaq-mlir-runtime APPEND PROPERTY + BUILD_RPATH 
"$") + add_dependencies(cudaq-mlir-runtime CUDAQuantumPythonCAPI) +endif() + +## The Python bindings module for Quake dialect depends on CUDAQ libraries ## which it can't locate since they are in "../../lib" and the 'rpath' is set ## to '$ORIGIN' by default. ## macOS uses @loader_path instead of $ORIGIN for RPATH. @@ -225,6 +322,15 @@ else() set(_origin_prefix "$ORIGIN") endif() +## Retain all linked libraries (e.g. libcudaq) so that static initializers +## (ModuleLauncher registry and PythonLauncher registration) run and resolve +## in the same process. Without --no-as-needed the linker may drop libcudaq +## and the launcher is never registered. +if(CUDAQ_FORCE_LINK_FLAG) + target_link_options(CUDAQuantumPythonCAPI PRIVATE + ${CUDAQ_FORCE_LINK_FLAG}) +endif() + if (NOT SKBUILD) list(APPEND CMAKE_INSTALL_RPATH "${_origin_prefix}/../../lib" "${_origin_prefix}/../../lib/plugins") set_property(TARGET CUDAQuantumPythonModules.extension._quakeDialects.dso diff --git a/python/extension/CUDAQuantumExtension.cpp b/python/extension/CUDAQuantumExtension.cpp index a1a581b680d..8085ab79bf8 100644 --- a/python/extension/CUDAQuantumExtension.cpp +++ b/python/extension/CUDAQuantumExtension.cpp @@ -48,8 +48,8 @@ #include "runtime/interop/PythonCppInteropDecls.h" #include "runtime/mlir/py_register_dialects.h" #include "utils/LinkedLibraryHolder.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/CAPI/Pass.h" #include "mlir/Parser/Parser.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" @@ -65,7 +65,12 @@ using namespace cudaq; static std::unique_ptr holder; +extern "C" void cudaq_ensure_default_launcher_linked(void); + NB_MODULE(_quakeDialects, m) { + // Ensure the TU that registers PythonLauncher ("default") is linked so + // kernel launches work without an explicit set_target(). 
+ cudaq_ensure_default_launcher_linked(); holder = std::make_unique(); bindRegisterDialects(m); diff --git a/python/metadata.cmake b/python/metadata.cmake index a8fb2a8f4cd..8a8e67775d1 100644 --- a/python/metadata.cmake +++ b/python/metadata.cmake @@ -20,3 +20,9 @@ if(CUDA_VERSION_MAJOR) else() file(WRITE ${METADATA_FILE} "cuda_major=None") endif() + +if(ASSERTIONS_ENABLED) + file(APPEND ${METADATA_FILE} "\nassertions_enabled=True") +else() + file(APPEND ${METADATA_FILE} "\nassertions_enabled=False") +endif() diff --git a/python/runtime/common/py_AnalogHamiltonian.cpp b/python/runtime/common/py_AnalogHamiltonian.cpp index ec182338e03..696687994e8 100644 --- a/python/runtime/common/py_AnalogHamiltonian.cpp +++ b/python/runtime/common/py_AnalogHamiltonian.cpp @@ -9,6 +9,7 @@ #include "py_AnalogHamiltonian.h" #include "common/AnalogHamiltonian.h" #include "common/JsonConvert.h" +#include #include #include #include diff --git a/python/runtime/common/py_ExecutionContext.cpp b/python/runtime/common/py_ExecutionContext.cpp index 132462462de..b21101d7f7b 100644 --- a/python/runtime/common/py_ExecutionContext.cpp +++ b/python/runtime/common/py_ExecutionContext.cpp @@ -129,14 +129,19 @@ void bindExecutionContext(nanobind::module_ &mod) { nanobind::arg("qpuId") = 0); mod.def("getQirOutputLog", []() { return nvqir::getQirOutputLog(); }); mod.def("clearQirOutputLog", []() { nvqir::clearQirOutputLog(); }); - mod.def("decodeQirOutputLog", - [](const std::string &outputLog, nanobind::bytearray decodedResults) { - cudaq::RecordLogParser parser; - parser.parse(outputLog); - auto *origBuffer = parser.getBufferPtr(); - const std::size_t bufferSize = parser.getBufferSize(); - std::memcpy(decodedResults.data(), origBuffer, bufferSize); - }); + mod.def("decodeQirOutputLog", [](const std::string &outputLog, + nanobind::object decodedResults) { + cudaq::RecordLogParser parser; + parser.parse(outputLog); + Py_buffer view; + if (PyObject_GetBuffer(decodedResults.ptr(), &view, 
PyBUF_WRITABLE) != 0) + throw nanobind::python_error(); + // Get the buffer and length of buffer (in bytes) from the parser. + auto *origBuffer = parser.getBufferPtr(); + const std::size_t bufferSize = parser.getBufferSize(); + std::memcpy(view.buf, origBuffer, bufferSize); + PyBuffer_Release(&view); + }); nanobind::class_( mod, "reuse_compiler_artifacts", diff --git a/python/runtime/common/py_SampleResult.cpp b/python/runtime/common/py_SampleResult.cpp index 47b65d5226e..df1785d0eb5 100644 --- a/python/runtime/common/py_SampleResult.cpp +++ b/python/runtime/common/py_SampleResult.cpp @@ -83,11 +83,11 @@ terminal measurements. .def( "__iter__", [](sample_result &self) { - return nanobind::make_key_iterator(nanobind::type(), - "key_iterator", self.begin(), - self.end()); + nanobind::list keys; + for (auto it = self.begin(); it != self.end(); ++it) + keys.append(nanobind::cast(it->first)); + return keys.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Iterate through the :class:`SampleResult` dictionary.\n") .def("expectation", &sample_result::expectation, nanobind::arg("register_name") = GlobalRegisterName, @@ -182,21 +182,21 @@ qubits (`marginal_indices`). 
.def( "items", [](sample_result &self) { - return nanobind::make_iterator(nanobind::type(), - "item_iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::make_tuple(it->first, it->second)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Return the key/value pairs in this :class:`SampleResult` " "dictionary.\n") .def( "values", [](sample_result &self) { - return nanobind::make_value_iterator( - nanobind::type(), "value_iterator", self.begin(), - self.end()); + nanobind::list values; + for (auto it = self.begin(); it != self.end(); ++it) + values.append(nanobind::cast(it->second)); + return values.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Return all values (the counts) in this :class:`SampleResult` " "dictionary.\n") .def(nanobind::self += nanobind::self) diff --git a/python/runtime/cudaq/algorithms/py_evolve.cpp b/python/runtime/cudaq/algorithms/py_evolve.cpp index 80e54f3edc7..eac8cebf668 100644 --- a/python/runtime/cudaq/algorithms/py_evolve.cpp +++ b/python/runtime/cudaq/algorithms/py_evolve.cpp @@ -11,8 +11,8 @@ #include "cudaq/algorithms/evolve_internal.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/CAPI/IR.h" #include #include diff --git a/python/runtime/cudaq/algorithms/py_observe_async.cpp b/python/runtime/cudaq/algorithms/py_observe_async.cpp index 1b134f5731e..faa4b114e78 100644 --- a/python/runtime/cudaq/algorithms/py_observe_async.cpp +++ b/python/runtime/cudaq/algorithms/py_observe_async.cpp @@ -13,14 +13,13 @@ #include "cudaq/Todo.h" #include "cudaq/algorithms/observe.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include 
"mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include #include #include -#include #include using namespace cudaq; @@ -131,8 +130,10 @@ pyObservePar(const PyParType &type, const std::string &shortName, printf( "[cudaq::observe warning] distributed observe requested but only 1 " "QPU available. no speedup expected.\n"); + nanobind::gil_scoped_release release; return details::distributeComputations( [&](std::size_t i, const spin_op &op) { + nanobind::gil_scoped_acquire acquire; return pyObserveAsync(shortName, module, op, i, shots, args); }, spin_operator, nQpus); @@ -154,8 +155,10 @@ pyObservePar(const PyParType &type, const std::string &shortName, auto localH = spins[rank]; // Distribute locally, i.e. to the local nodes QPUs + nanobind::gil_scoped_release release; auto localRankResult = details::distributeComputations( [&](std::size_t i, const spin_op &op) { + nanobind::gil_scoped_acquire acquire; return pyObserveAsync(shortName, module, op, i, shots, args); }, localH, nQpus); @@ -170,12 +173,12 @@ pyObservePar(const PyParType &type, const std::string &shortName, /// broadcast. All these variants are handled here. 
static observe_result observe_parallel_impl(const std::string &shortName, MlirModule module, - nanobind::type_object execution, + nanobind::object execution, spin_op &spin_operator, int shots, std::optional noise, nanobind::args arguments) { std::string applicatorKey = - nanobind::cast(execution.attr("__name__")); + std::string(nanobind::str(execution.attr("__name__")).c_str()); auto mod = unwrap(module); if (applicatorKey == "thread") return pyObservePar(PyParType::thread, shortName, mod, spin_operator, shots, @@ -207,5 +210,9 @@ void cudaq::bindObserveAsync(nanobind::module_ &mod) { "Test to see if the kernel is suited for use with observe."); mod.def("observe_parallel_impl", observe_parallel_impl, + nanobind::arg("shortName"), nanobind::arg("module"), + nanobind::arg("execution"), nanobind::arg("spin_operator"), + nanobind::arg("shots"), nanobind::arg("noise").none(), + nanobind::arg("arguments"), "See the python documentation for observe_parallel."); } diff --git a/python/runtime/cudaq/algorithms/py_optimizer.cpp b/python/runtime/cudaq/algorithms/py_optimizer.cpp index 339b33e81ae..39c390e6b28 100644 --- a/python/runtime/cudaq/algorithms/py_optimizer.cpp +++ b/python/runtime/cudaq/algorithms/py_optimizer.cpp @@ -16,6 +16,7 @@ #include "cudaq/algorithms/gradients/central_difference.h" #include "cudaq/algorithms/gradients/forward_difference.h" #include "cudaq/algorithms/gradients/parameter_shift.h" +#include "cudaq/algorithms/optimizer.h" #include "cudaq/algorithms/optimizers/ensmallen/ensmallen.h" #include "cudaq/algorithms/optimizers/nlopt/nlopt.h" #include "py_optimizer.h" @@ -23,12 +24,40 @@ namespace cudaq { -/// @brief optimization_result is a typedef for std::tuple> which is automatically converted by nanobind's -/// stl/tuple type caster. +/// Wrapper exposed as OptimizationResult so cudaq_runtime.OptimizationResult +/// exists for re-export and type hints. 
optimize() returns a plain tuple +/// (opt_value, opt_params); this type can wrap that for structured access. +struct OptimizationResultPy { + double opt_value = 0.0; + std::vector optimal_parameters; + + OptimizationResultPy() = default; + OptimizationResultPy(double v, std::vector p) + : opt_value(v), optimal_parameters(std::move(p)) {} + explicit OptimizationResultPy(const optimization_result &r) + : opt_value(std::get<0>(r)), optimal_parameters(std::get<1>(r)) {} +}; + void bindOptimizationResult(nanobind::module_ &mod) { - mod.attr("OptimizationResult") = - nanobind::handle(reinterpret_cast(&PyTuple_Type)); + nanobind::class_( + mod, "OptimizationResult", + "Result of an optimization: (opt_value, optimal_parameters). " + "optimize() returns a tuple; this type is for type hints and wrapping.") + .def(nanobind::init>(), + nanobind::arg("opt_value"), nanobind::arg("optimal_parameters")) + .def(nanobind::init(), + "Wrap a tuple (opt_value, optimal_parameters).") + .def_ro("opt_value", &OptimizationResultPy::opt_value) + .def_ro("optimal_parameters", &OptimizationResultPy::optimal_parameters) + .def("__getitem__", + [](const OptimizationResultPy &self, size_t i) -> nanobind::object { + if (i == 0) + return nanobind::cast(self.opt_value); + if (i == 1) + return nanobind::cast(self.optimal_parameters); + throw std::out_of_range("OptimizationResult index out of range"); + }) + .def("__len__", [](const OptimizationResultPy &) { return 2; }); } void bindGradientStrategies(nanobind::module_ &mod) { @@ -156,8 +185,24 @@ nanobind::class_ addPyOptimizer(nanobind::module_ &mod, the optimizer will perform. If not set, the optimizer may run until convergence or until another stopping criterion is met. 
)doc") - .def_rw("initial_parameters", &OptimizerT::initial_parameters, - R"doc( + .def_prop_rw( + "initial_parameters", + [](OptimizerT &self) -> nanobind::object { + if (self.initial_parameters.has_value()) + return nanobind::cast(self.initial_parameters.value()); + return nanobind::none(); + }, + [](OptimizerT &self, nanobind::object vals) { + if (vals.is_none()) { + self.initial_parameters = std::nullopt; + return; + } + std::vector v; + for (auto val : vals) + v.push_back(nanobind::cast(val)); + self.initial_parameters = std::move(v); + }, + R"doc( list[float]: Initial values for the optimization parameters (optional). Provides a starting point for the optimization. If not specified, the @@ -170,7 +215,24 @@ nanobind::class_ addPyOptimizer(nanobind::module_ &mod, optimizer.initial_parameters = [0.5, -0.3, 1.2] )doc") - .def_rw("lower_bounds", &OptimizerT::lower_bounds, R"doc( + .def_prop_rw( + "lower_bounds", + [](OptimizerT &self) -> nanobind::object { + if (self.lower_bounds.has_value()) + return nanobind::cast(self.lower_bounds.value()); + return nanobind::none(); + }, + [](OptimizerT &self, nanobind::object vals) { + if (vals.is_none()) { + self.lower_bounds = std::nullopt; + return; + } + std::vector v; + for (auto val : vals) + v.push_back(nanobind::cast(val)); + self.lower_bounds = std::move(v); + }, + R"doc( list[float]: Lower bounds for optimization parameters (optional). 
Constrains the search space by specifying minimum allowed values for @@ -182,7 +244,24 @@ nanobind::class_ addPyOptimizer(nanobind::module_ &mod, optimizer.lower_bounds = [-2.0, -2.0] # For 2D problem )doc") - .def_rw("upper_bounds", &OptimizerT::upper_bounds, R"doc( + .def_prop_rw( + "upper_bounds", + [](OptimizerT &self) -> nanobind::object { + if (self.upper_bounds.has_value()) + return nanobind::cast(self.upper_bounds.value()); + return nanobind::none(); + }, + [](OptimizerT &self, nanobind::object vals) { + if (vals.is_none()) { + self.upper_bounds = std::nullopt; + return; + } + std::vector v; + for (auto val : vals) + v.push_back(nanobind::cast(val)); + self.upper_bounds = std::move(v); + }, + R"doc( list[float]: Upper bounds for optimization parameters (optional). Constrains the search space by specifying maximum allowed values for diff --git a/python/runtime/cudaq/algorithms/py_resource_count.cpp b/python/runtime/cudaq/algorithms/py_resource_count.cpp index 53af2405cf5..ec52bb03c68 100644 --- a/python/runtime/cudaq/algorithms/py_resource_count.cpp +++ b/python/runtime/cudaq/algorithms/py_resource_count.cpp @@ -10,7 +10,7 @@ #include "common/Resources.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" #include "utils/LinkedLibraryHolder.h" -#include "utils/NanobindAdaptors.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include #include @@ -60,6 +60,7 @@ estimate_resources_impl(const std::string &kernelName, MlirModule kernelMod, } void cudaq::bindCountResources(nanobind::module_ &mod) { - mod.def("estimate_resources_impl", estimate_resources_impl, + mod.def("estimate_resources_impl", estimate_resources_impl, nanobind::arg(), + nanobind::arg(), nanobind::arg().none(), nanobind::arg(), "See python documentation for estimate_resources."); } diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index fb09b0b8e7a..70eb9cb2986 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ 
b/python/runtime/cudaq/algorithms/py_run.cpp @@ -11,8 +11,8 @@ #include "cudaq/algorithms/run.h" #include "cudaq_internal/compiler/LayoutInfo.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include #include #include @@ -23,7 +23,6 @@ #include using namespace cudaq; -using namespace cudaq_internal::compiler; static std::vector readRunResults(mlir::ModuleOp module, mlir::Type ty, @@ -72,16 +71,17 @@ pyRunTheKernel(const std::string &name, quantum_platform &platform, // kernels. if (auto vecTy = dyn_cast(returnTy)) { auto elemTy = vecTy.getElementType(); - if (elemTy.isa()) + if (mlir::isa(elemTy)) throw std::runtime_error( "`cudaq.run` does not yet support returning nested `list` from " "entry-point kernels."); - if (elemTy.isa()) + if (mlir::isa(elemTy)) throw std::runtime_error("`cudaq.run` does not yet support returning " "`list` of `dataclass`/`tuple` from " "entry-point kernels."); } - auto layoutInfo = getLayoutInfo(name, mod.getOperation()); + auto layoutInfo = + cudaq_internal::compiler::getLayoutInfo(name, mod.getOperation()); auto results = details::runTheKernel( [&]() mutable { [[maybe_unused]] auto result = clean_launch_module(name, mod, opaques); @@ -242,7 +242,9 @@ run_async_impl(const std::string &shortName, MlirModule module, /// @brief Bind the run cudaq function. void cudaq::bindPyRun(nanobind::module_ &mod) { - mod.def("run_impl", run_impl, + mod.def("run_impl", run_impl, nanobind::arg(), nanobind::arg(), + nanobind::arg(), nanobind::arg().none(), nanobind::arg(), + nanobind::arg(), R"#( Run the provided `kernel` with the given kernel arguments over the specified number of circuit executions (`shots_count`). 
@@ -281,7 +283,9 @@ void cudaq::bindPyRunAsync(nanobind::module_ &mod) { }, "FIXME: documentation goes here"); - mod.def("run_async_impl", run_async_impl, + mod.def("run_async_impl", run_async_impl, nanobind::arg(), nanobind::arg(), + nanobind::arg(), nanobind::arg().none(), nanobind::arg(), + nanobind::arg(), R"#( Run the provided `kernel` with the given kernel arguments over the specified number of circuit executions (`shots_count`) asynchronously on the specified diff --git a/python/runtime/cudaq/algorithms/py_sample_async.cpp b/python/runtime/cudaq/algorithms/py_sample_async.cpp index d11969242f7..3429f7da3fb 100644 --- a/python/runtime/cudaq/algorithms/py_sample_async.cpp +++ b/python/runtime/cudaq/algorithms/py_sample_async.cpp @@ -10,8 +10,8 @@ #include "common/DeviceCodeRegistry.h" #include "cudaq/algorithms/sample.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include @@ -109,5 +109,10 @@ programming pattern. 
}, "FIXME: document"); - mod.def("sample_async_impl", sample_async_impl, "FIXME: document"); + mod.def("sample_async_impl", sample_async_impl, "FIXME: document", + nanobind::arg("short_name"), nanobind::arg("module"), + nanobind::arg("shots_count"), + nanobind::arg("noise_model").none() = std::nullopt, + nanobind::arg("explicit_measurements"), nanobind::arg("qpu_id"), + nanobind::arg("runtime_args")); } diff --git a/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp b/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp index 7d0c58e3b16..85126dbcd46 100644 --- a/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp +++ b/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp @@ -20,8 +20,8 @@ #include "cudaq/ptsbe/strategies/OrderedSamplingStrategy.h" #include "cudaq/ptsbe/strategies/ProbabilisticSamplingStrategy.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include @@ -36,13 +36,18 @@ using namespace cudaq; /// /// All PTSBE configuration is handled by the Python wrapper /// (cudaq.ptsbe.sample) and passed here as positional parameters. +// nanobind 2.x cannot dispatch NB_TYPE_CASTER-based parameters (MlirModule) +// when nanobind::object appears in the same function signature. Use concrete +// std::optional types for all nullable parameters instead. 
static ptsbe::sample_result pySamplePTSBE(const std::string &shortName, MlirModule module, std::size_t shots_count, noise_model noiseModel, std::optional max_trajectories, - nanobind::object sampling_strategy, - nanobind::object shot_allocation_obj, bool return_execution_data, - bool include_sequential_data, nanobind::args runtimeArgs) { + std::optional> + sampling_strategy, + std::optional shot_allocation, + bool return_execution_data, bool include_sequential_data, + nanobind::args runtimeArgs) { if (shots_count == 0) return ptsbe::sample_result(); @@ -51,14 +56,11 @@ pySamplePTSBE(const std::string &shortName, MlirModule module, ptsbe_options.include_sequential_data = include_sequential_data; ptsbe_options.max_trajectories = max_trajectories; - if (!sampling_strategy.is_none()) - ptsbe_options.strategy = - nanobind::cast>( - sampling_strategy); + if (sampling_strategy) + ptsbe_options.strategy = *sampling_strategy; - if (!shot_allocation_obj.is_none()) - ptsbe_options.shot_allocation = - nanobind::cast(shot_allocation_obj); + if (shot_allocation) + ptsbe_options.shot_allocation = *shot_allocation; auto mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -108,26 +110,26 @@ struct AsyncPTSBESampleResultImpl { } // namespace /// @brief Run PTSBE sampling asynchronously from Python. 
-static AsyncPTSBESampleResultImpl pySampleAsyncPTSBE( - const std::string &shortName, MlirModule module, std::size_t shots_count, - noise_model &noiseModel, std::optional max_trajectories, - nanobind::object sampling_strategy, nanobind::object shot_allocation_obj, - bool return_execution_data, bool include_sequential_data, - nanobind::args runtimeArgs) { +static AsyncPTSBESampleResultImpl +pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, + std::size_t shots_count, noise_model &noiseModel, + std::optional max_trajectories, + std::optional> + sampling_strategy, + std::optional shot_allocation, + bool return_execution_data, bool include_sequential_data, + nanobind::args runtimeArgs) { ptsbe::PTSBEOptions ptsbe_options; ptsbe_options.return_execution_data = return_execution_data; ptsbe_options.include_sequential_data = include_sequential_data; ptsbe_options.max_trajectories = max_trajectories; - if (!sampling_strategy.is_none()) - ptsbe_options.strategy = - nanobind::cast>( - sampling_strategy); + if (sampling_strategy) + ptsbe_options.strategy = *sampling_strategy; - if (!shot_allocation_obj.is_none()) - ptsbe_options.shot_allocation = - nanobind::cast(shot_allocation_obj); + if (shot_allocation) + ptsbe_options.shot_allocation = *shot_allocation; auto mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -398,14 +400,15 @@ void cudaq::bindSamplePTSBE(nanobind::module_ &mod) { "Block until the PTSBE sampling result is available and return it."); // PTSBE sample implementation - ptsbe.def("sample_impl", pySamplePTSBE, nanobind::arg("kernel_name"), - nanobind::arg("module"), nanobind::arg("shots_count"), - nanobind::arg("noise_model"), nanobind::arg("max_trajectories"), - nanobind::arg("sampling_strategy").none(), - nanobind::arg("shot_allocation").none(), - nanobind::arg("return_execution_data"), - nanobind::arg("include_sequential_data"), - R"pbdoc( + ptsbe.def( + "sample_impl", pySamplePTSBE, 
nanobind::arg("kernel_name"), + nanobind::arg("module"), nanobind::arg("shots_count"), + nanobind::arg("noise_model"), nanobind::arg("max_trajectories").none(), + nanobind::arg("sampling_strategy").none(), + nanobind::arg("shot_allocation").none(), + nanobind::arg("return_execution_data"), + nanobind::arg("include_sequential_data"), nanobind::arg("arguments"), + R"pbdoc( Run PTSBE sampling on the provided kernel. Args: @@ -425,14 +428,14 @@ Run PTSBE sampling on the provided kernel. )pbdoc"); // PTSBE async sample implementation - ptsbe.def("sample_async_impl", pySampleAsyncPTSBE, - nanobind::arg("kernel_name"), nanobind::arg("module"), - nanobind::arg("shots_count"), nanobind::arg("noise_model"), - nanobind::arg("max_trajectories"), - nanobind::arg("sampling_strategy").none(), - nanobind::arg("shot_allocation").none(), - nanobind::arg("return_execution_data"), - nanobind::arg("include_sequential_data"), - "Run PTSBE sampling asynchronously. Returns an " - "AsyncSampleResultImpl."); + ptsbe.def( + "sample_async_impl", pySampleAsyncPTSBE, nanobind::arg("kernel_name"), + nanobind::arg("module"), nanobind::arg("shots_count"), + nanobind::arg("noise_model"), nanobind::arg("max_trajectories").none(), + nanobind::arg("sampling_strategy").none(), + nanobind::arg("shot_allocation").none(), + nanobind::arg("return_execution_data"), + nanobind::arg("include_sequential_data"), nanobind::arg("arguments"), + "Run PTSBE sampling asynchronously. 
Returns an " + "AsyncSampleResultImpl."); } diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp index fff952c31e4..fd62bc02bb8 100644 --- a/python/runtime/cudaq/algorithms/py_state.cpp +++ b/python/runtime/cudaq/algorithms/py_state.cpp @@ -13,39 +13,14 @@ #include "cudaq/algorithms/get_state.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include -#include -#include -#include -#include -#include -#include -#include -#include using namespace cudaq; -// FIXME: This is using a thread unsafe global? -/// If we have any implicit device-to-host data transfers we will store that -/// data here and ensure it is deleted properly. -static std::vector>> - hostDataFromDevice; - -namespace { -// CuPy interop helpers. -struct BufferInfo { - void *ptr = nullptr; - std::size_t itemsize = 0; - std::string format; - std::vector shape; - std::vector strides; - bool readonly = false; - std::size_t size = 0; -}; -} // namespace +// Note: Removed unsafe global hostDataFromDevice vector. +// Ownership is now managed via nb::capsule per-array. static nanobind::dict getCupyArrayInterface(nanobind::handle cupyArray) { if (!nanobind::hasattr(cupyArray, "__cuda_array_interface__")) @@ -97,6 +72,21 @@ getCupyComplexTypeInfo(const std::string &typeStr) { ". 
Supported types are: shape; + std::vector strides; + bool readonly = false; + std::size_t size = 0; // total number of elements +}; +} // namespace + static BufferInfo getCupyBufferInfo(nanobind::object cupyArray) { auto cupyArrayInfo = getCupyArrayInterface(cupyArray); auto dataInfo = nanobind::cast(cupyArrayInfo["data"]); @@ -283,10 +273,6 @@ state pyGetStateRemote(nanobind::object kernel, nanobind::args args) { auto kernelMod = nanobind::cast(kernel.attr("qkeModule")); args = simplifiedValidateInputArguments(args); auto *argData = toOpaqueArgs(args, kernelMod, kernelName); -#if 0 - auto [argWrapper, size, returnOffset] = - pyCreateNativeKernel(kernelName, kernelMod, *argData); -#endif return state(new PyRemoteSimulationState(kernelName, /*argWrapper*/ {}, argData, /*size*/ 0, /*returnOffset*/ 0)); @@ -341,6 +327,13 @@ state pyGetStateLibraryMode(nanobind::object kernel, nanobind::args args) { }); } +// Helper to check if object is a CuPy array (has __cuda_array_interface__) +static bool isCupyArray(nanobind::object obj) { + return nanobind::hasattr(obj, "__cuda_array_interface__"); +} + +/// @brief Helper to get BufferInfo from a numpy array via Python buffer +/// protocol. 
static BufferInfo getNumpyBufferInfo(nanobind::object numpy_array) { auto dtype = numpy_array.attr("dtype"); std::string dtypeStr = nanobind::cast(dtype.attr("name")); @@ -365,10 +358,8 @@ static BufferInfo getNumpyBufferInfo(nanobind::object numpy_array) { } auto stridesTuple = nanobind::cast(numpy_array.attr("strides")); - for (std::size_t i = 0; i < stridesTuple.size(); i++) { + for (std::size_t i = 0; i < stridesTuple.size(); i++) info.strides.push_back(nanobind::cast(stridesTuple[i])); - } - // Get the raw data pointer via numpy's ctypes interface info.ptr = reinterpret_cast( nanobind::cast(numpy_array.attr("ctypes").attr("data"))); info.readonly = false; @@ -488,100 +479,67 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { "via the `cudaq.get_state(...)` function or the static " "`cudaq.State.from_data()` method.\n") .def( - "__array__", - [](const state &self, nanobind::object dtype_obj, - nanobind::object copy_obj) { + "to_numpy", + [](const state &self) -> nanobind::object { if (self.get_num_tensors() != 1) throw std::runtime_error( "Numpy interop is only supported for vector " "and matrix state data."); - // This method enables interoperability with NumPy array data. - // We must be careful since the state data may actually be on GPU - // device. - - nanobind::module_ np = nanobind::module_::import_("numpy"); auto stateVector = self.get_tensor(); auto precision = self.get_precision(); - auto shape = self.get_tensor().extents; - - // Determine numpy dtype - nanobind::object np_dtype = - precision == SimulationState::precision::fp32 - ? 
np.attr("complex64") - : np.attr("complex128"); + std::vector shape(stateVector.extents.begin(), + stateVector.extents.end()); if (self.is_on_gpu()) { - // This is device data, transfer to host auto numElements = stateVector.get_num_elements(); - nanobind::object arr; + if (precision == SimulationState::precision::fp32) { auto *hostData = new std::complex[numElements]; self.to_host(hostData, numElements); - // Create numpy array and copy data - if (shape.size() != 1) { - nanobind::tuple np_shape = - nanobind::make_tuple(shape[0], shape[1]); - arr = np.attr("empty")(np_shape, np_dtype); - } else { - nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); - arr = np.attr("empty")(np_shape, np_dtype); - } - auto *destPtr = reinterpret_cast *>( - nanobind::cast(arr.attr("ctypes").attr("data"))); - std::memcpy(destPtr, hostData, - numElements * sizeof(std::complex)); - delete[] hostData; + + nanobind::capsule owner(hostData, [](void *p) noexcept { + CUDAQ_INFO("freeing data that was copied from GPU device " + "for compatibility with NumPy"); + delete[] static_cast *>(p); + }); + + return nanobind::cast( + nanobind::ndarray>( + hostData, shape.size(), shape.data(), owner)); } else { auto *hostData = new std::complex[numElements]; self.to_host(hostData, numElements); - if (shape.size() != 1) { - nanobind::tuple np_shape = - nanobind::make_tuple(shape[0], shape[1]); - arr = np.attr("empty")(np_shape, np_dtype); - } else { - nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); - arr = np.attr("empty")(np_shape, np_dtype); - } - auto *destPtr = reinterpret_cast *>( - nanobind::cast(arr.attr("ctypes").attr("data"))); - std::memcpy(destPtr, hostData, - numElements * sizeof(std::complex)); - delete[] hostData; - } - return arr; - } - // Host data path - wrap existing memory - void *dataPtr = self.get_tensor().data; - auto numElements = stateVector.get_num_elements(); - if (shape.size() != 1) { - nanobind::tuple np_shape = - nanobind::make_tuple(shape[0], shape[1]); - 
// Use np.frombuffer-like approach: create array from pointer - nanobind::object arr = np.attr("empty")(np_shape, np_dtype); - auto *destPtr = reinterpret_cast( - nanobind::cast(arr.attr("ctypes").attr("data"))); - std::size_t dataTypeSize = - precision == SimulationState::precision::fp32 - ? sizeof(std::complex) - : sizeof(std::complex); - std::memcpy(destPtr, dataPtr, numElements * dataTypeSize); - return arr; + nanobind::capsule owner(hostData, [](void *p) noexcept { + CUDAQ_INFO("freeing data that was copied from GPU device " + "for compatibility with NumPy"); + delete[] static_cast *>(p); + }); + + return nanobind::cast( + nanobind::ndarray>( + hostData, shape.size(), shape.data(), owner)); + } + } else { + if (precision == SimulationState::precision::fp32) { + return nanobind::cast( + nanobind::ndarray>( + stateVector.data, shape.size(), shape.data(), + nanobind::handle())); + } else { + return nanobind::cast( + nanobind::ndarray>( + stateVector.data, shape.size(), shape.data(), + nanobind::handle())); + } } - nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); - nanobind::object arr = np.attr("empty")(np_shape, np_dtype); - auto *destPtr = reinterpret_cast( - nanobind::cast(arr.attr("ctypes").attr("data"))); - std::size_t dataTypeSize = - precision == SimulationState::precision::fp32 - ? 
sizeof(std::complex) - : sizeof(std::complex); - std::memcpy(destPtr, dataPtr, numElements * dataTypeSize); - return arr; }, - nanobind::arg("dtype") = nanobind::none(), - nanobind::arg("copy") = nanobind::none()) + "Convert to a NumPy array.") + .def("__array__", + [](nanobind::object self, nanobind::args, nanobind::kwargs) { + return self.attr("to_numpy")(); + }) .def( "__len__", [](state &self) { @@ -651,7 +609,6 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { [](const std::vector &tensors) { TensorStateData tensorData; for (auto &tensor : tensors) { - tensorData.emplace_back( std::pair>{ tensor.data, tensor.extents}); @@ -659,6 +616,27 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { return state::from_data(tensorData); }, "Return a state from matrix product state tensor data.") + .def_static( + "from_data", + [&holder](const std::vector &tensors) { + const bool isHostData = tensors.empty() || !isCupyArray(tensors[0]); + if (!holder.getTarget().config.GpuRequired && !isHostData) + throw std::runtime_error(fmt::format( + "Current target '{}' does not support CuPy arrays.", + holder.getTarget().name)); + TensorStateData tensorData; + for (auto &tensor : tensors) { + auto arr = nanobind::cast>(tensor); + std::vector extents; + for (size_t i = 0; i < arr.ndim(); ++i) + extents.push_back(arr.shape(i)); + tensorData.emplace_back( + std::pair>{arr.data(), + extents}); + } + return state::from_data(tensorData); + }, + "Return a state from matrix product state tensor data.") .def_static( "from_data", [](const nanobind::list &tensors) { @@ -667,7 +645,7 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { // for cupy arrays (implementing Python array interface), may be // overshadowed by any std::vector overloads. 
TensorStateData tensorData; - for (auto tensor : tensors) { + for (nanobind::handle tensor : tensors) { // Make sure this is a CuPy array if (!nanobind::hasattr(tensor, "data")) throw std::runtime_error( @@ -681,8 +659,8 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { // We know this is a cupy device pointer. Start by ensuring it is // of proper complex type - auto typeStr = nanobind::cast( - tensor.attr("dtype").attr("name")); + auto typeStr = + std::string(nanobind::str(tensor.attr("dtype")).c_str()); if (typeStr != "complex128") throw std::runtime_error( "invalid from_data operation on nanobind::object tensors - " @@ -719,8 +697,8 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { // We know this is a cupy device pointer. Start by ensuring it is of // complex type - auto typeStr = nanobind::cast( - opaqueData.attr("dtype").attr("name")); + auto typeStr = + std::string(nanobind::str(opaqueData.attr("dtype")).c_str()); if (typeStr.find("float") != std::string::npos) throw std::runtime_error( "CuPy array with only floating point elements passed to " @@ -848,7 +826,7 @@ index pair. [](state &self) { std::stringstream ss; self.dump(ss); - nanobind::print(ss.str().c_str()); + nanobind::module_::import_("builtins").attr("print")(ss.str()); }, "Print the state to the console.") .def("__str__", @@ -863,7 +841,7 @@ index pair. "Compute the overlap between the provided :class:`State`'s.") .def( "overlap", - [&holder](state &self, nanobind::object &other) { + [&holder](state &self, nanobind::object other) { if (self.get_num_tensors() != 1) throw std::runtime_error("overlap NumPy interop only supported " "for vector and matrix state data."); @@ -891,7 +869,7 @@ index pair. 
// Start by ensuring it is of complex type auto typeStr = - nanobind::cast(other.attr("dtype").attr("name")); + std::string(nanobind::str(other.attr("dtype")).c_str()); if (typeStr.find("float") != std::string::npos) throw std::runtime_error( "CuPy array with only floating point elements passed to " diff --git a/python/runtime/cudaq/algorithms/py_translate.cpp b/python/runtime/cudaq/algorithms/py_translate.cpp index 86faaa47c79..3c0cdc8e8a5 100644 --- a/python/runtime/cudaq/algorithms/py_translate.cpp +++ b/python/runtime/cudaq/algorithms/py_translate.cpp @@ -14,8 +14,10 @@ #include "cudaq/runtime/logger/logger.h" #include "cudaq_internal/compiler/TracePassInstrumentation.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/Pass/PassManager.h" #include "mlir/Target/LLVMIR/Export.h" @@ -54,7 +56,7 @@ static std::string translate_impl(const std::string &shortName, cudaq::marshal_arguments_for_module_launch(mod, runtimeArguments, fn); return StringSwitch>(formatPair.first) - .Cases("qir", "qir-full", "qir-adaptive", "qir-base", + .Cases({"qir", "qir-full", "qir-adaptive", "qir-base"}, [&]() { return cudaq::detail::lower_to_qir_llvm(shortName, mod, opaques, format); @@ -94,7 +96,6 @@ void cudaq::bindPyTranslate(nanobind::module_ &mod) { if (failed(pm.run(mod))) throw std::runtime_error("Conversion to " + format + " failed."); llvm::LLVMContext llvmContext; - llvmContext.setOpaquePointers(false); std::unique_ptr llvmModule = translateModuleToLLVMIR(mod, llvmContext); if (!llvmModule) diff --git a/python/runtime/cudaq/algorithms/py_unitary.cpp b/python/runtime/cudaq/algorithms/py_unitary.cpp index 3aefbbc957d..5d67ee17a01 100644 --- a/python/runtime/cudaq/algorithms/py_unitary.cpp +++ b/python/runtime/cudaq/algorithms/py_unitary.cpp @@ -10,7 +10,7 @@ #include 
"cudaq/algorithms/unitary.h" #include "runtime/cudaq/operators/py_helpers.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "utils/NanobindAdaptors.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" using namespace cudaq; @@ -24,7 +24,7 @@ static nanobind::object get_unitary_impl(const std::string &shortName, // Return as numpy array (dim, dim), complex128 auto temp = contrib::get_unitary_cmat(std::move(f)); - return nanobind::cast(details::cmat_to_numpy(temp)); + return details::cmat_to_numpy(temp); } /// Bind the get_unitary cudaq function diff --git a/python/runtime/cudaq/algorithms/py_utils.cpp b/python/runtime/cudaq/algorithms/py_utils.cpp index e396f93c3a5..069dc67c154 100644 --- a/python/runtime/cudaq/algorithms/py_utils.cpp +++ b/python/runtime/cudaq/algorithms/py_utils.cpp @@ -25,16 +25,17 @@ nanobind::dict get_serializable_var_dict() { auto key = item.first; auto value = item.second; - if (nanobind::cast(key).starts_with("__")) { + std::string keyStr(nanobind::str(key).c_str()); + if (keyStr.starts_with("__")) { // Ignore items that start with "__" (like Python __builtins__, etc.) } else if (nanobind::hasattr(value, "to_json")) { - auto type = value.type(); - std::string module = - nanobind::cast(type.attr("__module__")); - std::string name = nanobind::cast(type.attr("__name__")); + auto type = nanobind::handle( + reinterpret_cast(Py_TYPE(value.ptr()))); + std::string module(nanobind::str(type.attr("__module__")).c_str()); + std::string name(nanobind::str(type.attr("__name__")).c_str()); auto type_name = nanobind::str((module + "." + name).c_str()); - auto json_key_name = nanobind::str(nanobind::str(key).c_str()) + - nanobind::str("/") + type_name; + nanobind::str json_key_name( + (keyStr + "/" + module + "." 
+ name).c_str()); serialized_dict[json_key_name] = json.attr("loads")(value.attr("to_json")()); } else if (nanobind::hasattr(value, "tolist")) { @@ -44,12 +45,7 @@ nanobind::dict get_serializable_var_dict() { serialized_dict[key] = json.attr("loads")(json.attr("dumps")(value)); } } catch (const nanobind::python_error &e) { - // Uncomment the following lines for debug, but all this really means is - // that we won't send this to the remote server. - - // std::cout << "Failed to serialize key '" - // << nanobind::cast(item.first) - // << "' : " + std::string(e.what()) << std::endl; + // Serialization failures are non-fatal - we just skip the entry. } }; @@ -60,7 +56,7 @@ nanobind::dict get_serializable_var_dict() { std::vector frame_vec; auto current_frame = inspect.attr("currentframe")(); while (current_frame && !current_frame.is_none()) { - frame_vec.push_back(current_frame); + frame_vec.push_back(nanobind::object(current_frame)); current_frame = current_frame.attr("f_back"); } @@ -68,8 +64,7 @@ nanobind::dict get_serializable_var_dict() { // globals first to locals last. This ensures that the overwrites give // precedence to closest-to-locals. 
for (auto it = frame_vec.rbegin(); it != frame_vec.rend(); ++it) { - nanobind::dict f_locals = - nanobind::cast(it->attr("f_locals")); + nanobind::dict f_locals = it->attr("f_locals"); for (const auto item : f_locals) try_to_add_item(item); } @@ -133,20 +128,18 @@ std::string get_var_name_for_handle(const nanobind::handle &h) { // Search locals first, walking up the call stack auto current_frame = inspect.attr("currentframe")(); while (current_frame && !current_frame.is_none()) { - nanobind::dict f_locals = - nanobind::cast(current_frame.attr("f_locals")); + nanobind::dict f_locals = current_frame.attr("f_locals"); for (auto item : f_locals) if (item.second.is(h)) - return nanobind::cast(nanobind::str(item.first)); + return std::string(nanobind::str(item.first).c_str()); current_frame = current_frame.attr("f_back"); } // Search globals now current_frame = inspect.attr("currentframe")(); - nanobind::dict f_globals = - nanobind::cast(current_frame.attr("f_globals")); + nanobind::dict f_globals = current_frame.attr("f_globals"); for (auto item : f_globals) if (item.second.is(h)) - return nanobind::cast(nanobind::str(item.first)); + return std::string(nanobind::str(item.first).c_str()); return std::string(); } @@ -163,6 +156,18 @@ void bindPyDataClassRegistry(nanobind::module_ &mod) { "Is class registered\n") .def_static("getClassAttributes", &DataClassRegistry::getClassAttributes, "Find registered class and its attributes\n") - .def_ro_static("classes", &DataClassRegistry::classes); + .def_static( + "get_classes", + []() -> decltype(DataClassRegistry::classes) & { + return DataClassRegistry::classes; + }, + nanobind::rv_policy::reference, "Get all registered classes.") + .def_prop_ro_static( + "classes", + [](nanobind::handle /*cls*/) + -> decltype(DataClassRegistry::classes) & { + return DataClassRegistry::classes; + }, + nanobind::rv_policy::reference, "Get all registered classes."); } } // namespace cudaq diff --git 
a/python/runtime/cudaq/domains/plugins/CMakeLists.txt b/python/runtime/cudaq/domains/plugins/CMakeLists.txt index 3bd2e991655..f92505aa221 100644 --- a/python/runtime/cudaq/domains/plugins/CMakeLists.txt +++ b/python/runtime/cudaq/domains/plugins/CMakeLists.txt @@ -17,10 +17,13 @@ add_library(cudaq-pyscf SHARED PySCFDriver.cpp) target_compile_options(cudaq-pyscf PRIVATE -Wno-cast-qual) +target_include_directories(cudaq-pyscf PRIVATE + ${Python3_INCLUDE_DIRS} +) if (SKBUILD) target_link_libraries(cudaq-pyscf PRIVATE - nanobind-static Python::Module + nanobind-static Python3::Module cudaq-chemistry cudaq-operator cudaq cudaq-py-utils cudaq-platform-default) # Apple's linker (ld64) doesn't support --unresolved-symbols flag if (NOT APPLE) @@ -28,12 +31,9 @@ if (SKBUILD) PRIVATE -Wl,--unresolved-symbols=ignore-in-object-files) endif() else() - if (NOT Python_FOUND) - message(FATAL_ERROR "find_package(Python) not run?") - endif() target_link_libraries(cudaq-pyscf PRIVATE - nanobind-static Python::Python + nanobind-static Python3::Python cudaq-chemistry cudaq-operator cudaq cudaq-py-utils cudaq-platform-default) endif() diff --git a/python/runtime/cudaq/dynamics/CMakeLists.txt b/python/runtime/cudaq/dynamics/CMakeLists.txt index d7910fdf586..b4f9b49fd43 100644 --- a/python/runtime/cudaq/dynamics/CMakeLists.txt +++ b/python/runtime/cudaq/dynamics/CMakeLists.txt @@ -6,14 +6,31 @@ # the terms of the Apache License 2.0 which accompanies this distribution. 
# # ============================================================================ # -find_package(Python COMPONENTS Interpreter Development) +find_package(CUDAToolkit REQUIRED) -nanobind_add_module(nvqir_dynamics_bindings NB_STATIC pyDynamics.cpp) +nanobind_add_module(nvqir_dynamics_bindings + NB_SHARED + NB_DOMAIN cudaq + pyDynamics.cpp) +target_include_directories(nvqir_dynamics_bindings PRIVATE + ${Python3_INCLUDE_DIRS} + ${nanobind_INCLUDE_DIR} +) +find_file(CUDENSITYMAT_INC + NAMES cudensitymat.h + HINTS + $ENV{CUQUANTUM_INSTALL_PREFIX}/include + /usr/include + ENV CPATH + REQUIRED +) +get_filename_component(CUDENSITYMAT_INCLUDE_DIR ${CUDENSITYMAT_INC} DIRECTORY) target_include_directories(nvqir_dynamics_bindings PRIVATE ${CMAKE_SOURCE_DIR}/runtime - ${CMAKE_SOURCE_DIR}/runtime/nvqir/cudensitymat + ${CMAKE_SOURCE_DIR}/runtime/nvqir/cudensitymat + ${CUDENSITYMAT_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(nvqir_dynamics_bindings PRIVATE cudaq-logger @@ -29,12 +46,12 @@ endif() if(NOT SKBUILD) set_target_properties(nvqir_dynamics_bindings PROPERTIES - INSTALL_RPATH "${_origin_prefix}/../../lib;${_origin_prefix}/../../lib/plugins" + INSTALL_RPATH "${_origin_prefix}/../../lib;${_origin_prefix}/../../lib/plugins;${_origin_prefix}/../mlir/_mlir_libs" BUILD_RPATH "${CMAKE_BINARY_DIR}/lib" ) else() set_target_properties(nvqir_dynamics_bindings PROPERTIES - INSTALL_RPATH "${_origin_prefix}/../../lib;${_origin_prefix}/../../cuda_quantum.libs" + INSTALL_RPATH "${_origin_prefix}/../../lib;${_origin_prefix}/../../cuda_quantum.libs;${_origin_prefix}/../mlir/_mlir_libs" BUILD_RPATH "${CMAKE_BINARY_DIR}/lib" ) endif() diff --git a/python/runtime/cudaq/operators/py_boson_op.cpp b/python/runtime/cudaq/operators/py_boson_op.cpp index 6df75bd5a27..514ab1fcbb0 100644 --- a/python/runtime/cudaq/operators/py_boson_op.cpp +++ b/python/runtime/cudaq/operators/py_boson_op.cpp @@ -7,12 +7,11 @@ 
******************************************************************************/ #include -#include #include #include #include #include -#include +#include #include #include #include @@ -108,11 +107,11 @@ void bindBosonOperator(nanobind::module_ &mod) { .def( "__iter__", [](boson_op &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -166,13 +165,15 @@ void bindBosonOperator(nanobind::module_ &mod) { .def( "to_matrix", - [](const boson_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const boson_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." 
"The matrix is ordered according to the convention (endianness) " @@ -182,28 +183,39 @@ void bindBosonOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const boson_op &self, dimension_map &dimensions, bool invert_order, - const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const boson_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const boson_op &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") .def( "to_sparse_matrix", - [](const boson_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - return self.to_sparse_matrix(dimensions, params, invert_order); + [](const boson_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + return self.to_sparse_matrix(dims, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " @@ -217,13 +229,12 @@ void bindBosonOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_sparse_matrix", - [](const boson_op &self, dimension_map &dimensions, bool invert_order, - const nanobind::kwargs &kwargs) { - return self.to_sparse_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const boson_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + return self.to_sparse_matrix(dimensions, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -280,6 +291,7 @@ void bindBosonOperator(nanobind::module_ &mod) { .def(nanobind::self -= boson_op_term(), nanobind::is_operator()) .def(nanobind::self *= nanobind::self, nanobind::is_operator()) .def(nanobind::self += nanobind::self, nanobind::is_operator()) +// see issue https://github.com/pybind/pybind11/issues/1893 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" @@ -361,17 +373,21 @@ void bindBosonOperator(nanobind::module_ &mod) { .def("dump", &boson_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &boson_op::trim, nanobind::arg("tol") = 0.0, - nanobind::arg("parameters") = parameter_map(), - "Removes all terms from the sum for which the absolute value of the " - "coefficient is below " - "the given tolerance.") .def( "trim", - [](boson_op &self, double tol, const nanobind::kwargs &kwargs) { + [](boson_op &self, double tol, std::optional params) { + return self.trim(tol, params.value_or(parameter_map())); + }, + nanobind::arg("tol") = 0.0, + 
nanobind::arg("parameters").none() = nanobind::none(), + "Removes all terms from the sum for which the absolute value of the " + "coefficient is below " + "the given tolerance.") + .def( + "trim", + [](boson_op &self, double tol, nanobind::kwargs kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -396,11 +412,11 @@ void bindBosonOperator(nanobind::module_ &mod) { .def( "__iter__", [](boson_op_term &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -476,20 +492,26 @@ void bindBosonOperator(nanobind::module_ &mod) { // evaluations - .def("evaluate_coefficient", &boson_op_term::evaluate_coefficient, - nanobind::arg("parameters") = parameter_map(), - "Returns the evaluated coefficient of the product operator. The " - "parameters is a map of parameter names to their concrete, complex " - "values.") + .def( + "evaluate_coefficient", + [](const boson_op_term &self, std::optional params) { + return self.evaluate_coefficient(params.value_or(parameter_map())); + }, + nanobind::arg("parameters").none() = nanobind::none(), + "Returns the evaluated coefficient of the product operator. 
The " + "parameters is a map of parameter names to their concrete, complex " + "values.") .def( "to_matrix", - [](const boson_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const boson_op_term &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -499,28 +521,39 @@ void bindBosonOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const boson_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const boson_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const boson_op_term &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") .def( "to_sparse_matrix", - [](const boson_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - return self.to_sparse_matrix(dimensions, params, invert_order); + [](const boson_op_term &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + return self.to_sparse_matrix(dims, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " @@ -534,13 +567,12 @@ void bindBosonOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_sparse_matrix", - [](const boson_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - return self.to_sparse_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const boson_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + return self.to_sparse_matrix(dimensions, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -636,8 +668,8 @@ void bindBosonOperator(nanobind::module_ &mod) { .def("is_identity", &boson_op_term::is_identity, "Checks if all operators in the product are the identity. 
" - "Note that this function returns true regardless of the value of " - "the coefficient.") + "Note: this function returns true regardless of the value of the " + "coefficient.") .def( "__str__", [](const boson_op_term &self) { return self.to_string(); }, "Returns the string representation of the operator.") diff --git a/python/runtime/cudaq/operators/py_fermion_op.cpp b/python/runtime/cudaq/operators/py_fermion_op.cpp index 621f39c873f..c53c00ce56e 100644 --- a/python/runtime/cudaq/operators/py_fermion_op.cpp +++ b/python/runtime/cudaq/operators/py_fermion_op.cpp @@ -7,13 +7,11 @@ ******************************************************************************/ #include -#include #include #include #include -#include #include -#include +#include #include #include #include @@ -104,11 +102,11 @@ void bindFermionOperator(nanobind::module_ &mod) { .def( "__iter__", [](fermion_op &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -162,13 +160,15 @@ void bindFermionOperator(nanobind::module_ &mod) { .def( "to_matrix", - [](const fermion_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const fermion_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = 
nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -178,28 +178,39 @@ void bindFermionOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const fermion_op &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const fermion_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const fermion_op &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") .def( "to_sparse_matrix", - [](const fermion_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - return self.to_sparse_matrix(dimensions, params, invert_order); + [](const fermion_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + return self.to_sparse_matrix(dims, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " @@ -213,13 +224,12 @@ void bindFermionOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_sparse_matrix", - [](const fermion_op &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - return self.to_sparse_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const fermion_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + return self.to_sparse_matrix(dimensions, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -276,6 +286,7 @@ void bindFermionOperator(nanobind::module_ &mod) { .def(nanobind::self -= fermion_op_term(), nanobind::is_operator()) .def(nanobind::self *= nanobind::self, nanobind::is_operator()) .def(nanobind::self += nanobind::self, nanobind::is_operator()) +// see issue https://github.com/pybind/pybind11/issues/1893 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" @@ -357,17 +368,22 @@ void bindFermionOperator(nanobind::module_ &mod) { .def("dump", &fermion_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &fermion_op::trim, nanobind::arg("tol") = 0.0, - nanobind::arg("parameters") = parameter_map(), - "Removes all terms from the sum for which the absolute value of the " - "coefficient is below " - "the given tolerance.") .def( "trim", - [](fermion_op &self, double tol, const nanobind::kwargs &kwargs) { + [](fermion_op &self, double tol, + std::optional params) { + return self.trim(tol, params.value_or(parameter_map())); + }, + nanobind::arg("tol") = 0.0, + 
nanobind::arg("parameters").none() = nanobind::none(), + "Removes all terms from the sum for which the absolute value of the " + "coefficient is below " + "the given tolerance.") + .def( + "trim", + [](fermion_op &self, double tol, nanobind::kwargs kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -392,11 +408,11 @@ void bindFermionOperator(nanobind::module_ &mod) { .def( "__iter__", [](fermion_op_term &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -473,20 +489,27 @@ void bindFermionOperator(nanobind::module_ &mod) { // evaluations - .def("evaluate_coefficient", &fermion_op_term::evaluate_coefficient, - nanobind::arg("parameters") = parameter_map(), - "Returns the evaluated coefficient of the product operator. The " - "parameters is a map of parameter names to their concrete, complex " - "values.") + .def( + "evaluate_coefficient", + [](const fermion_op_term &self, std::optional params) { + return self.evaluate_coefficient(params.value_or(parameter_map())); + }, + nanobind::arg("parameters").none() = nanobind::none(), + "Returns the evaluated coefficient of the product operator. 
The " + "parameters is a map of parameter names to their concrete, complex " + "values.") .def( "to_matrix", - [](const fermion_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const fermion_op_term &self, + std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -496,28 +519,40 @@ void bindFermionOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const fermion_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const fermion_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const fermion_op_term &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") .def( "to_sparse_matrix", - [](const fermion_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - return self.to_sparse_matrix(dimensions, params, invert_order); + [](const fermion_op_term &self, + std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + return self.to_sparse_matrix(dims, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " @@ -531,13 +566,12 @@ void bindFermionOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_sparse_matrix", - [](const fermion_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - return self.to_sparse_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const fermion_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + return self.to_sparse_matrix(dimensions, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -633,8 +667,8 @@ void bindFermionOperator(nanobind::module_ &mod) { .def("is_identity", &fermion_op_term::is_identity, "Checks if all operators in the product are the identity. 
" - "Note that this function returns true regardless of the value of " - "the coefficient.") + "Note: this function returns true regardless of the value of the " + "coefficient.") .def( "__str__", [](const fermion_op_term &self) { return self.to_string(); }, diff --git a/python/runtime/cudaq/operators/py_handlers.cpp b/python/runtime/cudaq/operators/py_handlers.cpp index e8c2147e92b..ba44cc90d5f 100644 --- a/python/runtime/cudaq/operators/py_handlers.cpp +++ b/python/runtime/cudaq/operators/py_handlers.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -75,23 +76,27 @@ void bindOperatorHandlers(nanobind::module_ &mod) { "Returns the string representation of the operator.") .def( "to_matrix", - [](const matrix_handler &self, dimension_map &dimensions, - const parameter_map ¶ms) { - auto cmat = self.to_matrix(dimensions, params); + [](const matrix_handler &self, + std::optional dimensions, + std::optional params) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = nanobind::none(), + nanobind::arg("parameters") = nanobind::none(), "Returns the matrix representation of the operator.") .def( "to_matrix", - [](const matrix_handler &self, dimension_map &dimensions, - const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix(dimensions, - details::kwargs_to_param_map(kwargs)); + [](const matrix_handler &self, + std::optional dimensions, nanobind::kwargs kwargs) { + dimension_map dims = dimensions.value_or(dimension_map()); + auto cmat = + self.to_matrix(dims, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = nanobind::none(), nanobind::arg("kwargs"), 
"Returns the matrix representation of the operator.") @@ -100,7 +105,7 @@ void bindOperatorHandlers(nanobind::module_ &mod) { "_define", [](std::string operator_id, std::vector expected_dimensions, const matrix_callback &func, bool overwrite, - const nanobind::kwargs &kwargs) { + nanobind::kwargs kwargs) { // we need to make sure the python function that is stored in // the static dictionary containing the operator definitions // is properly cleaned up - otherwise python will hang on exit... @@ -136,23 +141,26 @@ void bindOperatorHandlers(nanobind::module_ &mod) { "Returns the string representation of the operator.") .def( "to_matrix", - [](const boson_handler &self, dimension_map &dimensions, - const parameter_map ¶ms) { - auto cmat = self.to_matrix(dimensions, params); + [](const boson_handler &self, std::optional dimensions, + std::optional params) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = nanobind::none(), + nanobind::arg("parameters") = nanobind::none(), "Returns the matrix representation of the operator.") .def( "to_matrix", - [](const boson_handler &self, dimension_map &dimensions, - const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix(dimensions, - details::kwargs_to_param_map(kwargs)); + [](const boson_handler &self, std::optional dimensions, + nanobind::kwargs kwargs) { + dimension_map dims = dimensions.value_or(dimension_map()); + auto cmat = + self.to_matrix(dims, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = nanobind::none(), nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); @@ -171,23 +179,27 @@ void 
bindOperatorHandlers(nanobind::module_ &mod) { "Returns the string representation of the operator.") .def( "to_matrix", - [](const fermion_handler &self, dimension_map &dimensions, - const parameter_map ¶ms) { - auto cmat = self.to_matrix(dimensions, params); + [](const fermion_handler &self, + std::optional dimensions, + std::optional params) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = nanobind::none(), + nanobind::arg("parameters") = nanobind::none(), "Returns the matrix representation of the operator.") .def( "to_matrix", - [](const fermion_handler &self, dimension_map &dimensions, - const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix(dimensions, - details::kwargs_to_param_map(kwargs)); + [](const fermion_handler &self, + std::optional dimensions, nanobind::kwargs kwargs) { + dimension_map dims = dimensions.value_or(dimension_map()); + auto cmat = + self.to_matrix(dims, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = nanobind::none(), nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); @@ -208,23 +220,26 @@ void bindOperatorHandlers(nanobind::module_ &mod) { "Returns the string representation of the operator.") .def( "to_matrix", - [](const spin_handler &self, dimension_map &dimensions, - const parameter_map ¶ms) { - auto cmat = self.to_matrix(dimensions, params); + [](const spin_handler &self, std::optional dimensions, + std::optional params) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm); return details::cmat_to_numpy(cmat); 
}, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = nanobind::none(), + nanobind::arg("parameters") = nanobind::none(), "Returns the matrix representation of the operator.") .def( "to_matrix", - [](const spin_handler &self, dimension_map &dimensions, - const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix(dimensions, - details::kwargs_to_param_map(kwargs)); + [](const spin_handler &self, std::optional dimensions, + nanobind::kwargs kwargs) { + dimension_map dims = dimensions.value_or(dimension_map()); + auto cmat = + self.to_matrix(dims, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = nanobind::none(), nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); } diff --git a/python/runtime/cudaq/operators/py_helpers.cpp b/python/runtime/cudaq/operators/py_helpers.cpp index b34212bce6e..e14ac5a1750 100644 --- a/python/runtime/cudaq/operators/py_helpers.cpp +++ b/python/runtime/cudaq/operators/py_helpers.cpp @@ -8,7 +8,6 @@ #include "py_helpers.h" #include "cudaq/operators.h" -#include #include #include #include @@ -27,6 +26,14 @@ cudaq::parameter_map kwargs_to_param_map(const nanobind::kwargs &kwargs) { return params; } +cudaq::parameter_map kwargs_to_param_map(nanobind::kwargs &kwargs, + bool &invert_order) { + nanobind::str invert_key("invert_order"); + nanobind::object inv = kwargs.attr("pop")(invert_key, nanobind::bool_(false)); + invert_order = nanobind::cast(inv); + return kwargs_to_param_map(static_cast(kwargs)); +} + std::unordered_map kwargs_to_param_description(const nanobind::kwargs &kwargs) { std::unordered_map param_desc; @@ -39,23 +46,18 @@ kwargs_to_param_description(const nanobind::kwargs &kwargs) { return param_desc; } -nanobind::ndarray> -cmat_to_numpy(complex_matrix &cmat) { +nanobind::object cmat_to_numpy(complex_matrix &cmat) 
{ auto rows = cmat.rows(); auto cols = cmat.cols(); - auto *src = cmat.get_data(complex_matrix::order::row_major); - std::size_t n = rows * cols; - std::size_t shape[2] = {rows, cols}; - - auto *copy = new std::complex[n]; - std::copy(src, src + n, copy); - - nanobind::capsule owner(copy, [](void *p) noexcept { - delete[] static_cast *>(p); - }); - - return nanobind::ndarray>(copy, 2, - shape, owner); -} + auto *data = cmat.get_data(complex_matrix::order::row_major); + + // Use .cast() to force immediate creation of the numpy array. + // Since no owner is specified, rv_policy::automatic will copy the data, + // making this safe even when cmat is a temporary (e.g. in get_unitary). + return nanobind::ndarray, + nanobind::shape<-1, -1>>(data, {rows, cols}, + nanobind::handle()) + .cast(); +}; } // namespace cudaq::details diff --git a/python/runtime/cudaq/operators/py_helpers.h b/python/runtime/cudaq/operators/py_helpers.h index e712281784f..026f6f9b2fe 100644 --- a/python/runtime/cudaq/operators/py_helpers.h +++ b/python/runtime/cudaq/operators/py_helpers.h @@ -12,8 +12,11 @@ namespace cudaq::details { cudaq::parameter_map kwargs_to_param_map(const nanobind::kwargs &kwargs); +/// Extracts parameter map from `kwargs`, also extracting an optional +/// "invert_order" boolean (defaults to false if not present). 
+cudaq::parameter_map kwargs_to_param_map(nanobind::kwargs &kwargs, + bool &invert_order); std::unordered_map kwargs_to_param_description(const nanobind::kwargs &kwargs); -nanobind::ndarray> -cmat_to_numpy(complex_matrix &cmat); +nanobind::object cmat_to_numpy(complex_matrix &cmat); } // namespace cudaq::details diff --git a/python/runtime/cudaq/operators/py_matrix.cpp b/python/runtime/cudaq/operators/py_matrix.cpp index 48d37891e7f..32aa5f87a8d 100644 --- a/python/runtime/cudaq/operators/py_matrix.cpp +++ b/python/runtime/cudaq/operators/py_matrix.cpp @@ -28,18 +28,32 @@ void bindComplexMatrix(nanobind::module_ &mod) { "matrix of complex elements.") .def( "__init__", - [](complex_matrix *self, - nanobind::ndarray, nanobind::ndim<2>, - nanobind::c_contig, nanobind::numpy> - arr) { - auto rows = arr.shape(0); - auto cols = arr.shape(1); - new (self) complex_matrix(rows, cols); - memcpy(self->get_data(complex_matrix::order::row_major), arr.data(), - sizeof(std::complex) * rows * cols); + [](complex_matrix *self, nanobind::object b) { + auto arr = nanobind::cast>(b); + if (arr.ndim() != 2) + throw std::runtime_error("ComplexMatrix requires a 2D array"); + if (arr.shape(0) == 0 || arr.shape(1) == 0) + throw std::runtime_error("Matrix dimensions must be non-zero."); + + new (self) complex_matrix(arr.shape(0), arr.shape(1)); + + // Stride-aware element-wise copy so both row-major (C) and + // column-major (Fortran) layouts are handled correctly. + // nanobind strides are counted in elements, not bytes. 
+ auto *dest = self->get_data(complex_matrix::order::row_major); + auto *src = static_cast *>(arr.data()); + auto stride0 = arr.stride(0); + auto stride1 = arr.stride(1); + for (size_t i = 0; i < arr.shape(0); ++i) + for (size_t j = 0; j < arr.shape(1); ++j) + dest[i * arr.shape(1) + j] = src[i * stride0 + j * stride1]; }, "Create a :class:`ComplexMatrix` from a buffer of data, such as a " "numpy.ndarray.") + .def( + "to_numpy", + [](complex_matrix &op) { return details::cmat_to_numpy(op); }, + "Convert to a NumPy array.") .def( "num_rows", [](complex_matrix &m) { return m.rows(); }, "Returns the number of rows in the matrix.") diff --git a/python/runtime/cudaq/operators/py_matrix_op.cpp b/python/runtime/cudaq/operators/py_matrix_op.cpp index 3883f86c9bd..071050ce0aa 100644 --- a/python/runtime/cudaq/operators/py_matrix_op.cpp +++ b/python/runtime/cudaq/operators/py_matrix_op.cpp @@ -7,11 +7,11 @@ ******************************************************************************/ #include -#include #include #include #include #include +#include #include #include #include @@ -109,11 +109,11 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def( "__iter__", [](matrix_op &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -161,13 +161,15 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def( "to_matrix", - [](const matrix_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const matrix_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = 
params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -178,20 +180,29 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def( "to_matrix", - [](const matrix_op &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const matrix_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const matrix_op &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") // comparisons @@ -240,6 +251,7 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def(nanobind::self -= matrix_op_term(), nanobind::is_operator()) .def(nanobind::self *= nanobind::self, nanobind::is_operator()) .def(nanobind::self += nanobind::self, nanobind::is_operator()) +// see issue https://github.com/pybind/pybind11/issues/1893 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" @@ -315,17 +327,21 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def("dump", &matrix_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &matrix_op::trim, nanobind::arg("tol") = 0.0, - nanobind::arg("parameters") = parameter_map(), - "Removes all terms from the sum for which the absolute value of the " - "coefficient is below " - "the given tolerance.") .def( "trim", - [](matrix_op &self, double tol, const nanobind::kwargs &kwargs) { + [](matrix_op &self, double tol, std::optional params) { + return self.trim(tol, params.value_or(parameter_map())); + }, + nanobind::arg("tol") = 0.0, + nanobind::arg("parameters").none() = nanobind::none(), + "Removes all terms from the sum for which the absolute value of the " + "coefficient is below " + "the given tolerance.") + .def( + "trim", + [](matrix_op &self, double tol, nanobind::kwargs kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is 
below " "the given tolerance.") @@ -350,11 +366,11 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def( "__iter__", [](matrix_op_term &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -434,20 +450,27 @@ void bindMatrixOperator(nanobind::module_ &mod) { // evaluations - .def("evaluate_coefficient", &matrix_op_term::evaluate_coefficient, - nanobind::arg("parameters") = parameter_map(), - "Returns the evaluated coefficient of the product operator. The " - "parameters is a map of parameter names to their concrete, complex " - "values.") + .def( + "evaluate_coefficient", + [](const matrix_op_term &self, std::optional params) { + return self.evaluate_coefficient(params.value_or(parameter_map())); + }, + nanobind::arg("parameters").none() = nanobind::none(), + "Returns the evaluated coefficient of the product operator. 
The " + "parameters is a map of parameter names to their concrete, complex " + "values.") .def( "to_matrix", - [](const matrix_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const matrix_op_term &self, + std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -457,20 +480,29 @@ void bindMatrixOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const matrix_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const matrix_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const matrix_op_term &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") // comparisons @@ -551,8 +583,8 @@ void bindMatrixOperator(nanobind::module_ &mod) { .def("is_identity", &matrix_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note that this function returns true regardless of the value of " - "the coefficient.") + "Note: this function returns true regardless of the value of the " + "coefficient.") .def( "__str__", [](const matrix_op_term &self) { return self.to_string(); }, diff --git a/python/runtime/cudaq/operators/py_spin_op.cpp b/python/runtime/cudaq/operators/py_spin_op.cpp index e901dcac0cd..894b35f989a 100644 --- a/python/runtime/cudaq/operators/py_spin_op.cpp +++ b/python/runtime/cudaq/operators/py_spin_op.cpp @@ -7,12 +7,11 @@ ******************************************************************************/ #include -#include #include #include #include -#include #include +#include #include #include #include @@ -41,7 +40,7 @@ spin_op fromOpenFermionQubitOperator(nanobind::object &op) { for (auto term : terms) { auto termTuple = nanobind::cast(term); auto localTerm = spin_op::identity(); - for (auto element : termTuple) { + for (nanobind::handle element : termTuple) { auto casted = nanobind::cast>(element); localTerm *= creatorMap[casted.second](casted.first); @@ -136,11 +135,11 @@ void bindSpinOperator(nanobind::module_ &mod) { .def( "__iter__", [](spin_op &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + 
items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -237,13 +236,15 @@ void bindSpinOperator(nanobind::module_ &mod) { .def( "to_matrix", - [](const spin_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const spin_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -253,28 +254,39 @@ void bindSpinOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const spin_op &self, dimension_map &dimensions, bool invert_order, - const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const spin_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." 
"The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. " "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const spin_op &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") .def( "to_sparse_matrix", - [](const spin_op &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - return self.to_sparse_matrix(dimensions, params, invert_order); + [](const spin_op &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + return self.to_sparse_matrix(dims, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " @@ -288,13 +300,12 @@ void bindSpinOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_sparse_matrix", - [](const spin_op &self, dimension_map &dimensions, bool invert_order, - const nanobind::kwargs &kwargs) { - return self.to_sparse_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const spin_op &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + return self.to_sparse_matrix(dimensions, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -351,6 +362,7 @@ void bindSpinOperator(nanobind::module_ &mod) { .def(nanobind::self -= spin_op_term(), nanobind::is_operator()) .def(nanobind::self *= nanobind::self, nanobind::is_operator()) .def(nanobind::self += nanobind::self, nanobind::is_operator()) +// see issue https://github.com/pybind/pybind11/issues/1893 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" @@ -443,18 +455,22 @@ void bindSpinOperator(nanobind::module_ &mod) { auto data = self.get_data_representation(); return json.attr("dumps")(data); }, - "Convert spin_op to a JSON string, e.g., '[d1, d2, d3, ...]'.") - .def("trim", &spin_op::trim, nanobind::arg("tol") = 0.0, - nanobind::arg("parameters") = parameter_map(), - "Removes all terms from the sum for which the absolute value of the " - "coefficient is below " - "the given tolerance.") + "Convert spin_op to JSON string: '[d1, d2, d3, ...]'") + .def( + "trim", + [](spin_op &self, double tol, std::optional params) { + return self.trim(tol, params.value_or(parameter_map())); + }, + nanobind::arg("tol") = 0.0, + 
nanobind::arg("parameters").none() = nanobind::none(), + "Removes all terms from the sum for which the absolute value of the " + "coefficient is below " + "the given tolerance.") .def( "trim", - [](spin_op &self, double tol, const nanobind::kwargs &kwargs) { + [](spin_op &self, double tol, nanobind::kwargs kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -626,11 +642,11 @@ void bindSpinOperator(nanobind::module_ &mod) { .def( "__iter__", [](spin_op_term &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the operator.") // properties @@ -749,20 +765,26 @@ void bindSpinOperator(nanobind::module_ &mod) { // evaluations - .def("evaluate_coefficient", &spin_op_term::evaluate_coefficient, - nanobind::arg("parameters") = parameter_map(), - "Returns the evaluated coefficient of the product operator. The " - "parameters is a map of parameter names to their concrete, complex " - "values.") + .def( + "evaluate_coefficient", + [](const spin_op_term &self, std::optional params) { + return self.evaluate_coefficient(params.value_or(parameter_map())); + }, + nanobind::arg("parameters").none() = nanobind::none(), + "Returns the evaluated coefficient of the product operator. 
The " + "parameters is a map of parameter names to their concrete, complex " + "values.") .def( "to_matrix", - [](const spin_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - auto cmat = self.to_matrix(dimensions, params, invert_order); + [](const spin_op_term &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + auto cmat = self.to_matrix(dims, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " @@ -772,28 +794,39 @@ void bindSpinOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_matrix", - [](const spin_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - auto cmat = self.to_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const spin_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimensions, pm, invert_order); return details::cmat_to_numpy(cmat); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " "can be inverted by setting the optional `invert_order` argument to " "`True`. 
" "See also the documentation for `degrees` for more detail.") + .def( + "to_matrix", + [](const spin_op_term &self, nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + auto cmat = self.to_matrix(dimension_map(), pm, invert_order); + return details::cmat_to_numpy(cmat); + }, + "Returns the matrix representation of the operator, passing " + "parameters as keyword arguments.") .def( "to_sparse_matrix", - [](const spin_op_term &self, dimension_map &dimensions, - const parameter_map ¶ms, bool invert_order) { - return self.to_sparse_matrix(dimensions, params, invert_order); + [](const spin_op_term &self, std::optional dimensions, + std::optional params, bool invert_order) { + dimension_map dims = dimensions.value_or(dimension_map()); + parameter_map pm = params.value_or(parameter_map()); + return self.to_sparse_matrix(dims, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("parameters") = parameter_map(), + nanobind::arg("dimensions").none() = nanobind::none(), + nanobind::arg("parameters").none() = nanobind::none(), nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " @@ -807,13 +840,12 @@ void bindSpinOperator(nanobind::module_ &mod) { "See also the documentation for `degrees` for more detail.") .def( "to_sparse_matrix", - [](const spin_op_term &self, dimension_map &dimensions, - bool invert_order, const nanobind::kwargs &kwargs) { - return self.to_sparse_matrix( - dimensions, details::kwargs_to_param_map(kwargs), invert_order); + [](const spin_op_term &self, dimension_map dimensions, + nanobind::kwargs kwargs) { + bool invert_order; + auto pm = details::kwargs_to_param_map(kwargs, invert_order); + return self.to_sparse_matrix(dimensions, pm, invert_order); }, - nanobind::arg("dimensions") = dimension_map(), - nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -909,8 +941,8 @@ void bindSpinOperator(nanobind::module_ &mod) { .def("is_identity", &spin_op_term::is_identity, "Checks if all operators in the product are the identity. 
" - "Note that this function returns true regardless of the value of " - "the coefficient.") + "Note: this function returns true regardless of the value of the " + "coefficient.") .def( "__str__", [](const spin_op_term &self) { return self.to_string(); }, "Returns the string representation of the operator.") @@ -932,7 +964,7 @@ void bindSpinOperator(nanobind::module_ &mod) { auto data = spin_op(self).get_data_representation(); return json.attr("dumps")(data); }, - "Convert spin_op to a JSON string, e.g., '[d1, d2, d3, ...]'.") + "Convert spin_op to JSON string: '[d1, d2, d3, ...]'") // only exists for spin operators .def( "get_pauli_word", diff --git a/python/runtime/cudaq/operators/py_super_op.cpp b/python/runtime/cudaq/operators/py_super_op.cpp index 2c18dfbc820..89d3197dac4 100644 --- a/python/runtime/cudaq/operators/py_super_op.cpp +++ b/python/runtime/cudaq/operators/py_super_op.cpp @@ -7,7 +7,6 @@ ******************************************************************************/ #include -#include #include #include #include @@ -79,11 +78,11 @@ void bindSuperOperatorWrapper(nanobind::module_ &mod) { .def( "__iter__", [](super_op &self) { - return nanobind::make_iterator(nanobind::type(), - "iterator", self.begin(), - self.end()); + nanobind::list items; + for (auto it = self.begin(); it != self.end(); ++it) + items.append(nanobind::cast(*it)); + return items.attr("__iter__")(); }, - nanobind::keep_alive<0, 1>(), "Loop through each term of the super-operator.") .def(nanobind::self += nanobind::self, nanobind::is_operator()); } diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index e78a1e34d57..464dd5ced97 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -26,14 +26,13 @@ #include "runtime/cudaq/algorithms/py_utils.h" #include "runtime/cudaq/platform/PythonSignalCheck.h" #include "utils/LinkedLibraryHolder.h" 
-#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" #include "utils/PyTypes.h" -#include "llvm/MC/SubtargetFeature.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Error.h" -#include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/SubtargetFeature.h" #include "mlir/CAPI/ExecutionEngine.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/ExecutionEngine/OptUtils.h" @@ -55,9 +54,6 @@ #include using namespace mlir; -using namespace cudaq_internal::compiler; -using cudaq::JitEngine; -using cudaq::PackingStyle; static std::function getTransportLayer = []() -> std::string { throw std::runtime_error("binding for kernel launch is incomplete"); @@ -89,7 +85,7 @@ static std::unique_ptr cudaqStateStorage = static std::string createDataLayout() { // Setup the machine properties from the current architecture. - auto targetTriple = llvm::sys::getDefaultTargetTriple(); + llvm::Triple targetTriple(llvm::sys::getDefaultTargetTriple()); std::string errorMessage; const auto *target = llvm::TargetRegistry::lookupTarget(targetTriple, errorMessage); @@ -98,11 +94,9 @@ static std::string createDataLayout() { std::string cpu(llvm::sys::getHostCPUName()); llvm::SubtargetFeatures features; - llvm::StringMap hostFeatures; - - if (llvm::sys::getHostCPUFeatures(hostFeatures)) - for (auto &f : hostFeatures) - features.AddFeature(f.first(), f.second); + auto hostFeatures = llvm::sys::getHostCPUFeatures(); + for (auto &f : hostFeatures) + features.AddFeature(f.first(), f.second); std::unique_ptr machine(target->createTargetMachine( targetTriple, cpu, features.getString(), {}, {})); @@ -182,7 +176,7 @@ nanobind::args cudaq::simplifiedValidateInputArguments(nanobind::args &args) { return processed; } -template +template void cudaq::handleStructMemberVariable(void *data, std::size_t offset, mlir::Type memberType, nanobind::object value) { @@ -214,7 +208,7 @@ void 
cudaq::handleStructMemberVariable(void *data, std::size_t offset, // synthesis path: span {ptr, size_t} // argsCreator path: std::vector {ptr, ptr, ptr} constexpr std::size_t copySize = - sizeof(std::conditional_t